1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-SLOW,FALLBACK8
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-FAST,FALLBACK9
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK10
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK11
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK12
15 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK13
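
; Each function below loads two 64-byte vectors and adds them so the shuffled
; value is not a bare load, then builds a narrow vector in which element 0 of
; the sum is broadcast into the low position of every widened lane while the
; remaining positions are taken from higher-indexed elements of the sum. That
; result is padded back out to 64 bytes, a third 64-byte vector is added, and
; the final 64 bytes are stored.

; 32-bit result: byte 0 is broadcast into the low byte of both i16 lanes
; (mask <0,5,0,7>).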
17 define void @vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
18 ; SSE2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
20 ; SSE2-NEXT: movdqa (%rdi), %xmm0
21 ; SSE2-NEXT: paddb (%rsi), %xmm0
22 ; SSE2-NEXT: pxor %xmm1, %xmm1
23 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
24 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
25 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
26 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,2,4,5,6,7]
27 ; SSE2-NEXT: packuswb %xmm0, %xmm0
28 ; SSE2-NEXT: paddb (%rdx), %xmm0
29 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
32 ; SSE42-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
34 ; SSE42-NEXT: movdqa (%rdi), %xmm0
35 ; SSE42-NEXT: paddb (%rsi), %xmm0
36 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
37 ; SSE42-NEXT: paddb (%rdx), %xmm0
38 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
41 ; AVX-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
43 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
44 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
45 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
46 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
47 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
50 ; AVX2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
52 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
53 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
54 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
55 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
56 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
57 ; AVX2-NEXT: vzeroupper
60 ; AVX512F-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
62 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
63 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
64 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
65 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
66 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
67 ; AVX512F-NEXT: vzeroupper
70 ; AVX512DQ-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
72 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
73 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
74 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
75 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
76 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
77 ; AVX512DQ-NEXT: vzeroupper
80 ; AVX512BW-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
82 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
83 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
84 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
85 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
86 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
87 ; AVX512BW-NEXT: vzeroupper
89 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
90 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
91 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
92 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
93 %out.bytevec.padded = shufflevector <4 x i8> %broadcast.of.zextinreg, <4 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
94 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
95 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
96 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
97 ret void
98 }
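
; 64-bit result: byte 0 is broadcast into the low byte of each of the four
; i16 lanes (mask <0,9,0,11,0,13,0,15>).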
100 define void @vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
101 ; SSE2-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
103 ; SSE2-NEXT: movdqa (%rdi), %xmm0
104 ; SSE2-NEXT: paddb (%rsi), %xmm0
105 ; SSE2-NEXT: pxor %xmm1, %xmm1
106 ; SSE2-NEXT: movdqa %xmm0, %xmm2
107 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
108 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
109 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
110 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
111 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
112 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
113 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
114 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
115 ; SSE2-NEXT: packuswb %xmm2, %xmm2
116 ; SSE2-NEXT: paddb (%rdx), %xmm2
117 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
120 ; SSE42-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
122 ; SSE42-NEXT: movdqa (%rdi), %xmm0
123 ; SSE42-NEXT: paddb (%rsi), %xmm0
124 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
125 ; SSE42-NEXT: paddb (%rdx), %xmm0
126 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
129 ; AVX-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
131 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
132 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
133 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
134 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
135 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
138 ; AVX2-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
140 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
141 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
142 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
143 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
144 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
145 ; AVX2-NEXT: vzeroupper
148 ; AVX512F-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
150 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
151 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
152 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
153 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
154 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
155 ; AVX512F-NEXT: vzeroupper
158 ; AVX512DQ-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
160 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
161 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
162 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
163 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
164 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
165 ; AVX512DQ-NEXT: vzeroupper
166 ; AVX512DQ-NEXT: retq
168 ; AVX512BW-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
170 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
171 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
172 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
173 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
174 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
175 ; AVX512BW-NEXT: vzeroupper
176 ; AVX512BW-NEXT: retq
177 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
178 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
179 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
180 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
181 %out.bytevec.padded = shufflevector <8 x i8> %broadcast.of.zextinreg, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
182 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
183 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
184 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
185 ret void
186 }
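
; 64-bit result: byte 0 is broadcast into the low byte of both i32 lanes
; (mask <0,9,10,11,0,13,14,15>).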
188 define void @vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
189 ; SSE2-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
191 ; SSE2-NEXT: movdqa (%rdi), %xmm0
192 ; SSE2-NEXT: paddb (%rsi), %xmm0
193 ; SSE2-NEXT: pxor %xmm1, %xmm1
194 ; SSE2-NEXT: movdqa %xmm0, %xmm2
195 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
196 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535]
197 ; SSE2-NEXT: pand %xmm3, %xmm2
198 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
199 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
200 ; SSE2-NEXT: pandn %xmm0, %xmm3
201 ; SSE2-NEXT: por %xmm2, %xmm3
202 ; SSE2-NEXT: packuswb %xmm3, %xmm3
203 ; SSE2-NEXT: paddb (%rdx), %xmm3
204 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
207 ; SSE42-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
209 ; SSE42-NEXT: movdqa (%rdi), %xmm0
210 ; SSE42-NEXT: paddb (%rsi), %xmm0
211 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
212 ; SSE42-NEXT: paddb (%rdx), %xmm0
213 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
216 ; AVX-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
218 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
219 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
220 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
221 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
222 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
225 ; AVX2-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
227 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
228 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
229 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
230 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
231 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
232 ; AVX2-NEXT: vzeroupper
235 ; AVX512F-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
237 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
238 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
239 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
240 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
241 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
242 ; AVX512F-NEXT: vzeroupper
245 ; AVX512DQ-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
247 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
248 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
249 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
250 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
251 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
252 ; AVX512DQ-NEXT: vzeroupper
253 ; AVX512DQ-NEXT: retq
255 ; AVX512BW-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
257 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
258 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
259 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
260 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
261 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
262 ; AVX512BW-NEXT: vzeroupper
263 ; AVX512BW-NEXT: retq
264 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
265 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
266 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
267 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
268 %out.bytevec.padded = shufflevector <8 x i8> %broadcast.of.zextinreg, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
269 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
270 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
271 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
272 ret void
273 }
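
; 64-bit result: word 0 is broadcast into the low word of both i32 lanes
; (i16 mask <0,5,0,7>).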
275 define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
276 ; SSE2-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
278 ; SSE2-NEXT: movdqa (%rdi), %xmm0
279 ; SSE2-NEXT: paddb (%rsi), %xmm0
280 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
281 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
282 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,2,4,5,6,7]
283 ; SSE2-NEXT: paddb (%rdx), %xmm0
284 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
287 ; SSE42-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
289 ; SSE42-NEXT: movdqa (%rdi), %xmm0
290 ; SSE42-NEXT: paddb (%rsi), %xmm0
291 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
292 ; SSE42-NEXT: paddb (%rdx), %xmm0
293 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
296 ; AVX-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
298 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
299 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
300 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
301 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
302 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
305 ; AVX2-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
307 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
308 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
309 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
310 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
311 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
312 ; AVX2-NEXT: vzeroupper
315 ; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
317 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
318 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
319 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
320 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
321 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
322 ; AVX512F-NEXT: vzeroupper
325 ; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
327 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
328 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
329 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
330 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
331 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
332 ; AVX512DQ-NEXT: vzeroupper
333 ; AVX512DQ-NEXT: retq
335 ; AVX512BW-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
337 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
338 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
339 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
340 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
341 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
342 ; AVX512BW-NEXT: vzeroupper
343 ; AVX512BW-NEXT: retq
344 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
345 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
346 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
347 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
348 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
349 %out.bytevec = bitcast <4 x i16> %broadcast.of.zextinreg to <8 x i8>
350 %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
351 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
352 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
353 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
354 ret void
355 }
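
; 128-bit result: byte 0 is broadcast into the low byte of each of the eight
; i16 lanes; the odd bytes come from the second 16-byte block of the input.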
357 define void @vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
358 ; SSE2-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
360 ; SSE2-NEXT: movdqa (%rdi), %xmm0
361 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
362 ; SSE2-NEXT: paddb (%rsi), %xmm0
363 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
364 ; SSE2-NEXT: psrlw $8, %xmm1
365 ; SSE2-NEXT: packuswb %xmm1, %xmm1
366 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
367 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
368 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
369 ; SSE2-NEXT: paddb (%rdx), %xmm0
370 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
373 ; SSE42-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
375 ; SSE42-NEXT: movdqa (%rdi), %xmm0
376 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
377 ; SSE42-NEXT: paddb (%rsi), %xmm0
378 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
379 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
380 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
381 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
382 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
383 ; SSE42-NEXT: paddb (%rdx), %xmm0
384 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
387 ; AVX-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
389 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
390 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
391 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
392 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
393 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
394 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
395 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
396 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
397 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
398 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
401 ; AVX2-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
403 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
404 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
405 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
406 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
407 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
408 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
409 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
410 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
411 ; AVX2-NEXT: vzeroupper
414 ; AVX512F-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
416 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
417 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
418 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
419 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
420 ; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm0
421 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
422 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
423 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
424 ; AVX512F-NEXT: vzeroupper
427 ; AVX512DQ-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
429 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
430 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
431 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
432 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
433 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm0
434 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
435 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
436 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
437 ; AVX512DQ-NEXT: vzeroupper
438 ; AVX512DQ-NEXT: retq
440 ; AVX512BW-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
442 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
443 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
444 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
445 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
446 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0
447 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
448 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
449 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
450 ; AVX512BW-NEXT: vzeroupper
451 ; AVX512BW-NEXT: retq
452 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
453 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
454 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
455 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31>
456 %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.zextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
457 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
458 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
459 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
460 ret void
461 }
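
; 128-bit result: byte 0 is broadcast into the low byte of each of the four
; i32 lanes; the other bytes come from the second 16-byte block of the input.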
463 define void @vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
464 ; SSE2-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
466 ; SSE2-NEXT: movdqa (%rdi), %xmm0
467 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
468 ; SSE2-NEXT: paddb (%rsi), %xmm0
469 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
470 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
471 ; SSE2-NEXT: pand %xmm2, %xmm1
472 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
473 ; SSE2-NEXT: pandn %xmm0, %xmm2
474 ; SSE2-NEXT: por %xmm1, %xmm2
475 ; SSE2-NEXT: paddb (%rdx), %xmm2
476 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
479 ; SSE42-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
481 ; SSE42-NEXT: movdqa (%rdi), %xmm0
482 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
483 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
484 ; SSE42-NEXT: paddb (%rsi), %xmm0
485 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
486 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
487 ; SSE42-NEXT: paddb (%rdx), %xmm0
488 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
491 ; AVX-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
493 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
494 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
495 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
496 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
497 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
498 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
499 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
500 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
503 ; AVX2-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
505 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
506 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
507 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
508 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
509 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
510 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
511 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
512 ; AVX2-NEXT: vzeroupper
515 ; AVX512F-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
517 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
518 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
519 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
520 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
521 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
522 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
523 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
524 ; AVX512F-NEXT: vzeroupper
527 ; AVX512DQ-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
529 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
530 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
531 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
532 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
533 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
534 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
535 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
536 ; AVX512DQ-NEXT: vzeroupper
537 ; AVX512DQ-NEXT: retq
539 ; AVX512BW-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
541 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
542 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
543 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
544 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
545 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
546 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
547 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
548 ; AVX512BW-NEXT: vzeroupper
549 ; AVX512BW-NEXT: retq
550 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
551 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
552 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
553 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31>
554 %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.zextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
555 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
556 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
557 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
558 ret void
559 }
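
; 128-bit result: byte 0 is broadcast into the low byte of both i64 lanes;
; the other bytes come from the second 16-byte block of the input.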
561 define void @vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
562 ; SSE2-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
564 ; SSE2-NEXT: movdqa (%rdi), %xmm0
565 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
566 ; SSE2-NEXT: paddb (%rsi), %xmm0
567 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
568 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
569 ; SSE2-NEXT: pand %xmm2, %xmm1
570 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
571 ; SSE2-NEXT: pandn %xmm0, %xmm2
572 ; SSE2-NEXT: por %xmm1, %xmm2
573 ; SSE2-NEXT: paddb (%rdx), %xmm2
574 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
577 ; SSE42-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
579 ; SSE42-NEXT: movdqa (%rdi), %xmm0
580 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
581 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
582 ; SSE42-NEXT: paddb (%rsi), %xmm0
583 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
584 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
585 ; SSE42-NEXT: paddb (%rdx), %xmm0
586 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
589 ; AVX-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
591 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
592 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
593 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
594 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
595 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
596 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
597 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
598 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
601 ; AVX2-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
603 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
604 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
605 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
606 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
607 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
608 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
609 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
610 ; AVX2-NEXT: vzeroupper
613 ; AVX512F-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
615 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
616 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
617 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
618 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
619 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
620 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
621 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
622 ; AVX512F-NEXT: vzeroupper
625 ; AVX512DQ-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
627 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
628 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
629 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
630 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
631 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
632 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
633 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
634 ; AVX512DQ-NEXT: vzeroupper
635 ; AVX512DQ-NEXT: retq
637 ; AVX512BW-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
639 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
640 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
641 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
642 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
643 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
644 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
645 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
646 ; AVX512BW-NEXT: vzeroupper
647 ; AVX512BW-NEXT: retq
648 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
649 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
650 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
651 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
652 %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.zextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
653 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
654 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
655 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
656 ret void
657 }
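
; 128-bit result: word 0 is broadcast into the low word of each of the four
; i32 lanes; the odd words come from the second 16-byte block of the input.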
659 define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
660 ; SSE2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
662 ; SSE2-NEXT: movdqa (%rdi), %xmm0
663 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
664 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
665 ; SSE2-NEXT: paddb (%rsi), %xmm0
666 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
667 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
668 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
669 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
670 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
671 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
672 ; SSE2-NEXT: paddb (%rdx), %xmm0
673 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
676 ; SSE42-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
678 ; SSE42-NEXT: movdqa (%rdi), %xmm0
679 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
680 ; SSE42-NEXT: paddb (%rsi), %xmm0
681 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
682 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
683 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
684 ; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
685 ; SSE42-NEXT: paddb (%rdx), %xmm0
686 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
689 ; AVX-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
691 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
692 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
693 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
694 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
695 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
696 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
697 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
698 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
699 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
702 ; AVX2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
704 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
705 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
706 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
707 ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
708 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
709 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
710 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
711 ; AVX2-NEXT: vzeroupper
714 ; AVX512F-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
716 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
717 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
718 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
719 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
720 ; AVX512F-NEXT: vmovd %xmm0, %eax
721 ; AVX512F-NEXT: vpinsrw $2, %eax, %xmm2, %xmm0
722 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
723 ; AVX512F-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
724 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
725 ; AVX512F-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
726 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
727 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
728 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
729 ; AVX512F-NEXT: vzeroupper
732 ; AVX512DQ-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
734 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
735 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
736 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
737 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
738 ; AVX512DQ-NEXT: vmovd %xmm0, %eax
739 ; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm2, %xmm0
740 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
741 ; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
742 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
743 ; AVX512DQ-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
744 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
745 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
746 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
747 ; AVX512DQ-NEXT: vzeroupper
748 ; AVX512DQ-NEXT: retq
750 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
751 ; AVX512BW-SLOW: # %bb.0:
752 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
753 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
754 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
755 ; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0
756 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
757 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
758 ; AVX512BW-SLOW-NEXT: vzeroupper
759 ; AVX512BW-SLOW-NEXT: retq
761 ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
762 ; AVX512BW-FAST: # %bb.0:
763 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
764 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
765 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
766 ; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
767 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax
768 ; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
769 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
770 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
771 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
772 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
773 ; AVX512BW-FAST-NEXT: vzeroupper
774 ; AVX512BW-FAST-NEXT: retq
775 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
776 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
777 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
778 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
779 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
780 %out.bytevec = bitcast <8 x i16> %broadcast.of.zextinreg to <16 x i8>
781 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
782 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
783 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
784 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
785 ret void
786 }
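
; 128-bit result: word 0 is broadcast into the low word of both i64 lanes;
; the other words come from the second 16-byte block of the input.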
788 define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
789 ; SSE2-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
791 ; SSE2-NEXT: movdqa (%rdi), %xmm0
792 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
793 ; SSE2-NEXT: paddb (%rsi), %xmm0
794 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
795 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
796 ; SSE2-NEXT: pand %xmm2, %xmm1
797 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
798 ; SSE2-NEXT: pandn %xmm0, %xmm2
799 ; SSE2-NEXT: por %xmm1, %xmm2
800 ; SSE2-NEXT: paddb (%rdx), %xmm2
801 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
804 ; SSE42-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
806 ; SSE42-NEXT: movdqa (%rdi), %xmm0
807 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
808 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
809 ; SSE42-NEXT: paddb (%rsi), %xmm0
810 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
811 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
812 ; SSE42-NEXT: paddb (%rdx), %xmm0
813 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
816 ; AVX-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
818 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
819 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
820 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
821 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
822 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
823 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
824 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
825 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
828 ; AVX2-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
830 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
831 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
832 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
833 ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
834 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
835 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
836 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
837 ; AVX2-NEXT: vzeroupper
840 ; AVX512F-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
842 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
843 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
844 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
845 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
846 ; AVX512F-NEXT: vmovd %xmm0, %eax
847 ; AVX512F-NEXT: vpinsrw $4, %eax, %xmm2, %xmm0
848 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
849 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
850 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
851 ; AVX512F-NEXT: vzeroupper
854 ; AVX512DQ-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
856 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
857 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
858 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
859 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
860 ; AVX512DQ-NEXT: vmovd %xmm0, %eax
861 ; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm2, %xmm0
862 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
863 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
864 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
865 ; AVX512DQ-NEXT: vzeroupper
866 ; AVX512DQ-NEXT: retq
868 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
869 ; AVX512BW-SLOW: # %bb.0:
870 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
871 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
872 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
873 ; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
874 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
875 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
876 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
877 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
878 ; AVX512BW-SLOW-NEXT: vzeroupper
879 ; AVX512BW-SLOW-NEXT: retq
881 ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
882 ; AVX512BW-FAST: # %bb.0:
883 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
884 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
885 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
886 ; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
887 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
888 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
889 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
890 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
891 ; AVX512BW-FAST-NEXT: vzeroupper
892 ; AVX512BW-FAST-NEXT: retq
893 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
894 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
895 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
896 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
897 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
898 %out.bytevec = bitcast <8 x i16> %broadcast.of.zextinreg to <16 x i8>
899 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
900 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
901 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
902 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
903 ret void
904 }
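
; 128-bit result: dword 0 is broadcast into the low dword of both i64 lanes;
; the odd dwords come from the second 16-byte block of the input.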
906 define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
907 ; SSE2-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
909 ; SSE2-NEXT: movdqa (%rdi), %xmm0
910 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
911 ; SSE2-NEXT: paddb (%rsi), %xmm0
912 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
913 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
914 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
915 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
916 ; SSE2-NEXT: paddb (%rdx), %xmm0
917 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
920 ; SSE42-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
922 ; SSE42-NEXT: movdqa (%rdi), %xmm0
923 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
924 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
925 ; SSE42-NEXT: paddb (%rsi), %xmm0
926 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
927 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
928 ; SSE42-NEXT: paddb (%rdx), %xmm0
929 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
932 ; AVX-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
934 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
935 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
936 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
937 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
938 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
939 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
940 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
941 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
944 ; AVX2-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
945 ; AVX2-SLOW: # %bb.0:
946 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
947 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
948 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
949 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
950 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
951 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
952 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
953 ; AVX2-SLOW-NEXT: vzeroupper
954 ; AVX2-SLOW-NEXT: retq
956 ; AVX2-FAST-PERLANE-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
957 ; AVX2-FAST-PERLANE: # %bb.0:
958 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
959 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
960 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
961 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %xmm0
962 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
963 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
964 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
965 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
966 ; AVX2-FAST-PERLANE-NEXT: retq
968 ; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
969 ; AVX2-FAST: # %bb.0:
970 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
971 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7]
972 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
973 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
974 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
975 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
976 ; AVX2-FAST-NEXT: vzeroupper
977 ; AVX2-FAST-NEXT: retq
979 ; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
981 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7]
982 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
983 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
984 ; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0
985 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
986 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
987 ; AVX512F-NEXT: vzeroupper
990 ; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
992 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7]
993 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
994 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
995 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0
996 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
997 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
998 ; AVX512DQ-NEXT: vzeroupper
999 ; AVX512DQ-NEXT: retq
1001 ; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
1002 ; AVX512BW: # %bb.0:
1003 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1004 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7]
1005 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1006 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
1007 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1008 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1009 ; AVX512BW-NEXT: vzeroupper
1010 ; AVX512BW-NEXT: retq
1011 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1012 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1013 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1014 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
1015 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
1016 %out.bytevec = bitcast <4 x i32> %broadcast.of.zextinreg to <16 x i8>
1017 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1018 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1019 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1020 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1021 ret void
1022 }
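
; 256-bit result: byte 0 is broadcast into the low byte of each of the
; sixteen i16 lanes; the odd bytes come from the upper 32 bytes of the input.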
1024 define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1025 ; SSE2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1027 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1028 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1029 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1030 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1031 ; SSE2-NEXT: paddb (%rsi), %xmm0
1032 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1033 ; SSE2-NEXT: psrlw $8, %xmm1
1034 ; SSE2-NEXT: packuswb %xmm1, %xmm1
1035 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1036 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1037 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1038 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1039 ; SSE2-NEXT: psrlw $8, %xmm2
1040 ; SSE2-NEXT: packuswb %xmm2, %xmm2
1041 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1042 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
1043 ; SSE2-NEXT: paddb (%rdx), %xmm3
1044 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
1045 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
1048 ; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1050 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1051 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
1052 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
1053 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
1054 ; SSE42-NEXT: paddb (%rsi), %xmm0
1055 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
1056 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
1057 ; SSE42-NEXT: pshufb %xmm3, %xmm1
1058 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1059 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1060 ; SSE42-NEXT: movdqa %xmm0, %xmm4
1061 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
1062 ; SSE42-NEXT: pshufb %xmm3, %xmm2
1063 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1064 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1065 ; SSE42-NEXT: paddb (%rdx), %xmm4
1066 ; SSE42-NEXT: movdqa %xmm4, (%rcx)
1067 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1070 ; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1072 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1073 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1074 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1075 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1076 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1077 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1078 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
1079 ; AVX-NEXT: # xmm3 = mem[0,0]
1080 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1081 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1082 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1083 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1084 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1085 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1086 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1087 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1088 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1089 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1092 ; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1094 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
1095 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1096 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1097 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1098 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1099 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
1100 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1101 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1102 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1103 ; AVX2-NEXT: vzeroupper
1106 ; AVX512F-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1108 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
1109 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1110 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1111 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1112 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1113 ; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
1114 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1115 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1116 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1117 ; AVX512F-NEXT: vzeroupper
1118 ; AVX512F-NEXT: retq
1120 ; AVX512DQ-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1121 ; AVX512DQ: # %bb.0:
1122 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
1123 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1124 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1125 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
1126 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1127 ; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
1128 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1129 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1130 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
1131 ; AVX512DQ-NEXT: vzeroupper
1132 ; AVX512DQ-NEXT: retq
1134 ; AVX512BW-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1135 ; AVX512BW: # %bb.0:
1136 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1137 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1138 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1139 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1140 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
1141 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
1142 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1143 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1144 ; AVX512BW-NEXT: vzeroupper
1145 ; AVX512BW-NEXT: retq
1146 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1147 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1148 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1149 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47, i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63>
1150 %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.zextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1151 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1152 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1153 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1154 ret void
1155 }
1157 define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1158 ; SSE2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1160 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1161 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1162 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1163 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1164 ; SSE2-NEXT: paddb (%rsi), %xmm0
1165 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1166 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1167 ; SSE2-NEXT: pand %xmm3, %xmm1
1168 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1169 ; SSE2-NEXT: pand %xmm3, %xmm2
1170 ; SSE2-NEXT: pandn %xmm0, %xmm3
1171 ; SSE2-NEXT: por %xmm3, %xmm1
1172 ; SSE2-NEXT: por %xmm2, %xmm3
1173 ; SSE2-NEXT: paddb 16(%rdx), %xmm3
1174 ; SSE2-NEXT: paddb (%rdx), %xmm1
1175 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
1176 ; SSE2-NEXT: movdqa %xmm3, 16(%rcx)
1179 ; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1181 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1182 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
1183 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
1184 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
1185 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
1186 ; SSE42-NEXT: paddb (%rsi), %xmm0
1187 ; SSE42-NEXT: movdqa %xmm0, %xmm3
1188 ; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0]
1189 ; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
1190 ; SSE42-NEXT: pshufb %xmm1, %xmm3
1191 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1192 ; SSE42-NEXT: pshufb %xmm1, %xmm0
1193 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1194 ; SSE42-NEXT: paddb (%rdx), %xmm3
1195 ; SSE42-NEXT: movdqa %xmm3, (%rcx)
1196 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1199 ; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1201 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1202 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1203 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1204 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1205 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1206 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1207 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1208 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
1209 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1210 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1211 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1212 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1213 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1214 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1215 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1218 ; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1220 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
1221 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1222 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1223 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1224 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
1225 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1226 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1227 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1228 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1229 ; AVX2-NEXT: vzeroupper
1232 ; AVX512F-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1234 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
1235 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1236 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1237 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1238 ; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
1239 ; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
1240 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
1241 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1242 ; AVX512F-NEXT: vzeroupper
1243 ; AVX512F-NEXT: retq
1245 ; AVX512DQ-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1246 ; AVX512DQ: # %bb.0:
1247 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
1248 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1249 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
1250 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1251 ; AVX512DQ-NEXT: vpbroadcastd %xmm1, %ymm1
1252 ; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
1253 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
1254 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
1255 ; AVX512DQ-NEXT: vzeroupper
1256 ; AVX512DQ-NEXT: retq
1258 ; AVX512BW-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1259 ; AVX512BW: # %bb.0:
1260 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1261 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1262 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1263 ; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0
1264 ; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
1265 ; AVX512BW-NEXT: kmovd %eax, %k1
1266 ; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
1267 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
1268 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1269 ; AVX512BW-NEXT: vzeroupper
1270 ; AVX512BW-NEXT: retq
1271 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1272 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1273 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1274 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63>
1275 %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.zextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1276 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1277 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1278 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1279 ret void
1280 }
1282 define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1283 ; SSE2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1285 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1286 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1287 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1288 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1289 ; SSE2-NEXT: paddb (%rsi), %xmm0
1290 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1291 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
1292 ; SSE2-NEXT: pand %xmm3, %xmm1
1293 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1294 ; SSE2-NEXT: pand %xmm3, %xmm2
1295 ; SSE2-NEXT: pandn %xmm0, %xmm3
1296 ; SSE2-NEXT: por %xmm3, %xmm1
1297 ; SSE2-NEXT: por %xmm2, %xmm3
1298 ; SSE2-NEXT: paddb 16(%rdx), %xmm3
1299 ; SSE2-NEXT: paddb (%rdx), %xmm1
1300 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
1301 ; SSE2-NEXT: movdqa %xmm3, 16(%rcx)
1304 ; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1306 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1307 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
1308 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
1309 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
1310 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
1311 ; SSE42-NEXT: paddb (%rsi), %xmm0
1312 ; SSE42-NEXT: movdqa %xmm0, %xmm3
1313 ; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0]
1314 ; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
1315 ; SSE42-NEXT: pshufb %xmm1, %xmm3
1316 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1317 ; SSE42-NEXT: pshufb %xmm1, %xmm0
1318 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1319 ; SSE42-NEXT: paddb (%rdx), %xmm3
1320 ; SSE42-NEXT: movdqa %xmm3, (%rcx)
1321 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1324 ; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1326 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1327 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1328 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1329 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1330 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1331 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1332 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1333 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
1334 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1335 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1336 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1337 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1338 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1339 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1340 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1343 ; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1345 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
1346 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1347 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1348 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1349 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
1350 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
1351 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1352 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1353 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1354 ; AVX2-NEXT: vzeroupper
1357 ; AVX512F-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1359 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
1360 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1361 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1362 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1363 ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
1364 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1
1365 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
1366 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1367 ; AVX512F-NEXT: vzeroupper
1368 ; AVX512F-NEXT: retq
1370 ; AVX512DQ-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1371 ; AVX512DQ: # %bb.0:
1372 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
1373 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1374 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
1375 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1376 ; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
1377 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1
1378 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
1379 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
1380 ; AVX512DQ-NEXT: vzeroupper
1381 ; AVX512DQ-NEXT: retq
1383 ; AVX512BW-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1384 ; AVX512BW: # %bb.0:
1385 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1386 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1387 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1388 ; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm0
1389 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
1390 ; AVX512BW-NEXT: kmovd %eax, %k1
1391 ; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
1392 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
1393 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1394 ; AVX512BW-NEXT: vzeroupper
1395 ; AVX512BW-NEXT: retq
1396 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1397 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1398 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1399 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1400 %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.zextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1401 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1402 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1403 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1404 ret void
1405 }
1407 define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1408 ; SSE2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1410 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1411 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1412 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1413 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1414 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1415 ; SSE2-NEXT: paddb (%rsi), %xmm0
1416 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1417 ; SSE2-NEXT: pand %xmm3, %xmm1
1418 ; SSE2-NEXT: pand %xmm3, %xmm2
1419 ; SSE2-NEXT: pandn %xmm0, %xmm3
1420 ; SSE2-NEXT: por %xmm3, %xmm1
1421 ; SSE2-NEXT: por %xmm3, %xmm2
1422 ; SSE2-NEXT: paddb 16(%rdx), %xmm2
1423 ; SSE2-NEXT: paddb (%rdx), %xmm1
1424 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
1425 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
1428 ; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1430 ; SSE42-NEXT: movdqa (%rdi), %xmm1
1431 ; SSE42-NEXT: movdqa 32(%rdi), %xmm2
1432 ; SSE42-NEXT: movdqa 48(%rdi), %xmm3
1433 ; SSE42-NEXT: paddb 48(%rsi), %xmm3
1434 ; SSE42-NEXT: paddb 32(%rsi), %xmm2
1435 ; SSE42-NEXT: paddb (%rsi), %xmm1
1436 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1437 ; SSE42-NEXT: movdqa %xmm1, %xmm4
1438 ; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm4
1439 ; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1
1440 ; SSE42-NEXT: paddb 16(%rdx), %xmm1
1441 ; SSE42-NEXT: paddb (%rdx), %xmm4
1442 ; SSE42-NEXT: movdqa %xmm4, (%rcx)
1443 ; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
1446 ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1448 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1449 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1450 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1451 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1452 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1453 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1454 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1455 ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm1
1456 ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
1457 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1458 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1459 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1460 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1463 ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1465 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1466 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
1467 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1468 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1469 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1470 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1471 ; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
1472 ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
1473 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1474 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1475 ; AVX2-NEXT: vzeroupper
1478 ; AVX512F-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1480 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
1481 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
1482 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1483 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1484 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1485 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1486 ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
1487 ; AVX512F-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm2
1488 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1489 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1490 ; AVX512F-NEXT: vzeroupper
1491 ; AVX512F-NEXT: retq
1493 ; AVX512DQ-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1494 ; AVX512DQ: # %bb.0:
1495 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
1496 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
1497 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1498 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1499 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1500 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1501 ; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
1502 ; AVX512DQ-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm2
1503 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1504 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
1505 ; AVX512DQ-NEXT: vzeroupper
1506 ; AVX512DQ-NEXT: retq
1508 ; AVX512BW-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1509 ; AVX512BW: # %bb.0:
1510 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1511 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1512 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1513 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1514 ; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001
1515 ; AVX512BW-NEXT: kmovd %eax, %k1
1516 ; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
1517 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
1518 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1519 ; AVX512BW-NEXT: vzeroupper
1520 ; AVX512BW-NEXT: retq
1521 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1522 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1523 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1524 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1525 %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.zextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1526 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1527 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1528 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1529 ret void
1530 }
1532 define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1533 ; SSE2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1535 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1536 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1537 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1538 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1539 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1540 ; SSE2-NEXT: paddb (%rsi), %xmm0
1541 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1542 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
1543 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
1544 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1545 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1546 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1547 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
1548 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7]
1549 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
1550 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1551 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1552 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1553 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
1554 ; SSE2-NEXT: paddb (%rdx), %xmm3
1555 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
1556 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
1559 ; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1561 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1562 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
1563 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
1564 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
1565 ; SSE42-NEXT: paddb (%rsi), %xmm0
1566 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
1567 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
1568 ; SSE42-NEXT: pshufb %xmm3, %xmm1
1569 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1570 ; SSE42-NEXT: movdqa %xmm0, %xmm4
1571 ; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
1572 ; SSE42-NEXT: pshufb %xmm3, %xmm2
1573 ; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1574 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1575 ; SSE42-NEXT: paddb (%rdx), %xmm4
1576 ; SSE42-NEXT: movdqa %xmm4, (%rcx)
1577 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1580 ; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1582 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1583 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1584 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1585 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1586 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1587 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1588 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
1589 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1590 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1591 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1592 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1593 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1594 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1595 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1596 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1597 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1600 ; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1602 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
1603 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1604 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1605 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1606 ; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
1607 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1608 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1609 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1610 ; AVX2-NEXT: vzeroupper
1613 ; AVX512F-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1615 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
1616 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1617 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1618 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1619 ; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1
1620 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1621 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1622 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1623 ; AVX512F-NEXT: vzeroupper
1624 ; AVX512F-NEXT: retq
1626 ; AVX512DQ-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1627 ; AVX512DQ: # %bb.0:
1628 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
1629 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1630 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
1631 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1632 ; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1
1633 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1634 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1635 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
1636 ; AVX512DQ-NEXT: vzeroupper
1637 ; AVX512DQ-NEXT: retq
1639 ; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1640 ; AVX512BW: # %bb.0:
1641 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1642 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1643 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1644 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
1645 ; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
1646 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
1647 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1648 ; AVX512BW-NEXT: vzeroupper
1649 ; AVX512BW-NEXT: retq
1650 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1651 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1652 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1653 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
1654 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31>
1655 %out.bytevec = bitcast <16 x i16> %broadcast.of.zextinreg to <32 x i8>
1656 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1657 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1658 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1659 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1660 ret void
1661 }
1663 define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1664 ; SSE2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1666 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1667 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1668 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1669 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1670 ; SSE2-NEXT: paddb (%rsi), %xmm0
1671 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1672 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535]
1673 ; SSE2-NEXT: pand %xmm3, %xmm1
1674 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1675 ; SSE2-NEXT: pand %xmm3, %xmm2
1676 ; SSE2-NEXT: pandn %xmm0, %xmm3
1677 ; SSE2-NEXT: por %xmm3, %xmm1
1678 ; SSE2-NEXT: por %xmm2, %xmm3
1679 ; SSE2-NEXT: paddb 16(%rdx), %xmm3
1680 ; SSE2-NEXT: paddb (%rdx), %xmm1
1681 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
1682 ; SSE2-NEXT: movdqa %xmm3, 16(%rcx)
1685 ; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1687 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1688 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
1689 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
1690 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
1691 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
1692 ; SSE42-NEXT: paddb (%rsi), %xmm0
1693 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1694 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
1695 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
1696 ; SSE42-NEXT: paddb 16(%rdx), %xmm2
1697 ; SSE42-NEXT: paddb (%rdx), %xmm1
1698 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
1699 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
1702 ; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1704 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1705 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1706 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1707 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1708 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1709 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1710 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1711 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
1712 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
1713 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1714 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1715 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1716 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1719 ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1721 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
1722 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1723 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1724 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1725 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
1726 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
1727 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1728 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1729 ; AVX2-NEXT: vzeroupper
1732 ; AVX512F-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1734 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
1735 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1736 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1737 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1738 ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
1739 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
1740 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1741 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1742 ; AVX512F-NEXT: vzeroupper
1743 ; AVX512F-NEXT: retq
1745 ; AVX512DQ-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1746 ; AVX512DQ: # %bb.0:
1747 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
1748 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1749 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
1750 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1751 ; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
1752 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
1753 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1754 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
1755 ; AVX512DQ-NEXT: vzeroupper
1756 ; AVX512DQ-NEXT: retq
1758 ; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1759 ; AVX512BW: # %bb.0:
1760 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1761 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1762 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1763 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
1764 ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
1765 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
1766 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1767 ; AVX512BW-NEXT: vzeroupper
1768 ; AVX512BW-NEXT: retq
1769 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1770 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1771 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1772 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
1773 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31>
1774 %out.bytevec = bitcast <16 x i16> %broadcast.of.zextinreg to <32 x i8>
1775 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1776 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1777 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1778 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1779 ret void
1780 }
1782 define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1783 ; SSE2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1785 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1786 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1787 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1788 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1789 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1790 ; SSE2-NEXT: paddb (%rsi), %xmm0
1791 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535]
1792 ; SSE2-NEXT: pand %xmm3, %xmm1
1793 ; SSE2-NEXT: pand %xmm3, %xmm2
1794 ; SSE2-NEXT: pandn %xmm0, %xmm3
1795 ; SSE2-NEXT: por %xmm3, %xmm1
1796 ; SSE2-NEXT: por %xmm3, %xmm2
1797 ; SSE2-NEXT: paddb 16(%rdx), %xmm2
1798 ; SSE2-NEXT: paddb (%rdx), %xmm1
1799 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
1800 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
1803 ; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1805 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1806 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
1807 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
1808 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
1809 ; SSE42-NEXT: paddb (%rsi), %xmm0
1810 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
1811 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1812 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
1813 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1814 ; SSE42-NEXT: paddb (%rdx), %xmm1
1815 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
1816 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1819 ; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1821 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1822 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1823 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1824 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1825 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1826 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1827 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1828 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
1829 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1830 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1831 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1832 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1835 ; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1837 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1838 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
1839 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1840 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1841 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1842 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1843 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1844 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1845 ; AVX2-NEXT: vzeroupper
1848 ; AVX512F-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1850 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
1851 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
1852 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1853 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1854 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1855 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1856 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1857 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1858 ; AVX512F-NEXT: vzeroupper
1859 ; AVX512F-NEXT: retq
1861 ; AVX512DQ-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1862 ; AVX512DQ: # %bb.0:
1863 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
1864 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
1865 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1866 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1867 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1868 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1869 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1870 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
1871 ; AVX512DQ-NEXT: vzeroupper
1872 ; AVX512DQ-NEXT: retq
1874 ; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1875 ; AVX512BW: # %bb.0:
1876 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1877 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1878 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1879 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
1880 ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
1881 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
1882 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1883 ; AVX512BW-NEXT: vzeroupper
1884 ; AVX512BW-NEXT: retq
1885 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1886 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1887 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1888 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
1889 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1890 %out.bytevec = bitcast <16 x i16> %broadcast.of.zextinreg to <32 x i8>
1891 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1892 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1893 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1894 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1895 ret void
1896 }
1898 define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1899 ; SSE2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1901 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1902 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1903 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1904 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1905 ; SSE2-NEXT: paddb (%rsi), %xmm0
1906 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1907 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
1908 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
1909 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1910 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1911 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
1912 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1913 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
1914 ; SSE2-NEXT: paddb (%rdx), %xmm3
1915 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
1916 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
1919 ; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1921 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1922 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
1923 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
1924 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
1925 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
1926 ; SSE42-NEXT: paddb (%rsi), %xmm0
1927 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1928 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1929 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1930 ; SSE42-NEXT: paddb 16(%rdx), %xmm2
1931 ; SSE42-NEXT: paddb (%rdx), %xmm1
1932 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
1933 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
1936 ; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1938 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1939 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1940 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1941 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1942 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1943 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1944 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1945 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1946 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
1947 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
1948 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
1949 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
1950 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
1951 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
1952 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
1953 ; AVX-NEXT: vzeroupper
1956 ; AVX2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1958 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
1959 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1960 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1961 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1962 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
1963 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
1964 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1965 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1966 ; AVX2-NEXT: vzeroupper
1969 ; AVX512F-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1970 ; AVX512F-SLOW: # %bb.0:
1971 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0
1972 ; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1973 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1
1974 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1975 ; AVX512F-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
1976 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
1977 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1978 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
1979 ; AVX512F-SLOW-NEXT: vzeroupper
1980 ; AVX512F-SLOW-NEXT: retq
1982 ; AVX512F-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1983 ; AVX512F-FAST: # %bb.0:
1984 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
1985 ; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1986 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1
1987 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1988 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15]
1989 ; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
1990 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1991 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
1992 ; AVX512F-FAST-NEXT: vzeroupper
1993 ; AVX512F-FAST-NEXT: retq
1995 ; AVX512DQ-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1996 ; AVX512DQ-SLOW: # %bb.0:
1997 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0
1998 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1999 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1
2000 ; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
2001 ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
2002 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
2003 ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2004 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
2005 ; AVX512DQ-SLOW-NEXT: vzeroupper
2006 ; AVX512DQ-SLOW-NEXT: retq
2008 ; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
2009 ; AVX512DQ-FAST: # %bb.0:
2010 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
2011 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2012 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1
2013 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1
2014 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15]
2015 ; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
2016 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0
2017 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx)
2018 ; AVX512DQ-FAST-NEXT: vzeroupper
2019 ; AVX512DQ-FAST-NEXT: retq
2021 ; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
2022 ; AVX512BW-SLOW: # %bb.0:
2023 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
2024 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2025 ; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2026 ; AVX512BW-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0
2027 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
2028 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2029 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
2030 ; AVX512BW-SLOW-NEXT: vzeroupper
2031 ; AVX512BW-SLOW-NEXT: retq
2033 ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
2034 ; AVX512BW-FAST: # %bb.0:
2035 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
2036 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31]
2037 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2038 ; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm0
2039 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2040 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
2041 ; AVX512BW-FAST-NEXT: vzeroupper
2042 ; AVX512BW-FAST-NEXT: retq
2043 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2044 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2045 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2046 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
2047 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
2048 %out.bytevec = bitcast <8 x i32> %broadcast.of.zextinreg to <32 x i8>
2049 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2050 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2051 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2052 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2053 ret void
2054 }
2056 define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2057 ; SSE2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2059 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2060 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
2061 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
2062 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
2063 ; SSE2-NEXT: paddb (%rsi), %xmm0
2064 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
2065 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2066 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
2067 ; SSE2-NEXT: paddb 16(%rdx), %xmm2
2068 ; SSE2-NEXT: paddb (%rdx), %xmm1
2069 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
2070 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
2073 ; SSE42-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2075 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2076 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
2077 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
2078 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
2079 ; SSE42-NEXT: paddb (%rsi), %xmm0
2080 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
2081 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2082 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
2083 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
2084 ; SSE42-NEXT: paddb (%rdx), %xmm1
2085 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
2086 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
2089 ; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2091 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2092 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
2093 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
2094 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
2095 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
2096 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2097 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2098 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2099 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2100 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
2101 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
2102 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
2103 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
2104 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
2105 ; AVX-NEXT: vzeroupper
2108 ; AVX2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2110 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2111 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
2112 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2113 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2114 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
2115 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2116 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2117 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
2118 ; AVX2-NEXT: vzeroupper
2121 ; AVX512F-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2122 ; AVX512F-SLOW: # %bb.0:
2123 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0
2124 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
2125 ; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2126 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2127 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
2128 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2129 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2130 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
2131 ; AVX512F-SLOW-NEXT: vzeroupper
2132 ; AVX512F-SLOW-NEXT: retq
2134 ; AVX512F-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2135 ; AVX512F-FAST: # %bb.0:
2136 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0
2137 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
2138 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2139 ; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2140 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7]
2141 ; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
2142 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0
2143 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
2144 ; AVX512F-FAST-NEXT: vzeroupper
2145 ; AVX512F-FAST-NEXT: retq
2147 ; AVX512DQ-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2148 ; AVX512DQ-SLOW: # %bb.0:
2149 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0
2150 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
2151 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2152 ; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2153 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
2154 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2155 ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2156 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
2157 ; AVX512DQ-SLOW-NEXT: vzeroupper
2158 ; AVX512DQ-SLOW-NEXT: retq
2160 ; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2161 ; AVX512DQ-FAST: # %bb.0:
2162 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0
2163 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
2164 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2165 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2166 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7]
2167 ; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
2168 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0
2169 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx)
2170 ; AVX512DQ-FAST-NEXT: vzeroupper
2171 ; AVX512DQ-FAST-NEXT: retq
2173 ; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2174 ; AVX512BW-SLOW: # %bb.0:
2175 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
2176 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2177 ; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2178 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
2179 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2180 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2181 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
2182 ; AVX512BW-SLOW-NEXT: vzeroupper
2183 ; AVX512BW-SLOW-NEXT: retq
2185 ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2186 ; AVX512BW-FAST: # %bb.0:
2187 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
2188 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15]
2189 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2190 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
2191 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2192 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
2193 ; AVX512BW-FAST-NEXT: vzeroupper
2194 ; AVX512BW-FAST-NEXT: retq
2195 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2196 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2197 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2198 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
2199 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
2200 %out.bytevec = bitcast <8 x i32> %broadcast.of.zextinreg to <32 x i8>
2201 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2202 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2203 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2204 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2205 ret void
2206 }
2208 define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2209 ; SSE2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2211 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2212 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
2213 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
2214 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
2215 ; SSE2-NEXT: paddb (%rsi), %xmm0
2216 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
2217 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2218 ; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
2219 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
2220 ; SSE2-NEXT: paddb (%rdx), %xmm1
2221 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
2222 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
2225 ; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2227 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2228 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
2229 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
2230 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
2231 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
2232 ; SSE42-NEXT: paddb (%rsi), %xmm0
2233 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2234 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
2235 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
2236 ; SSE42-NEXT: paddb (%rdx), %xmm1
2237 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
2238 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
2241 ; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2243 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2244 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
2245 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
2246 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
2247 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
2248 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2249 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2250 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2251 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2252 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
2253 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
2254 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
2255 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
2256 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
2257 ; AVX-NEXT: vzeroupper
2260 ; AVX2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2262 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
2263 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2264 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
2265 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
2266 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
2267 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2268 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2269 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
2270 ; AVX2-NEXT: vzeroupper
2273 ; AVX512F-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2274 ; AVX512F-SLOW: # %bb.0:
2275 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0
2276 ; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2277 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1
2278 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
2279 ; AVX512F-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
2280 ; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2281 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2282 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
2283 ; AVX512F-SLOW-NEXT: vzeroupper
2284 ; AVX512F-SLOW-NEXT: retq
2286 ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2287 ; AVX512F-FAST: # %bb.0:
2288 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0
2289 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
2290 ; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2291 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2292 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7]
2293 ; AVX512F-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
2294 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0
2295 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
2296 ; AVX512F-FAST-NEXT: vzeroupper
2297 ; AVX512F-FAST-NEXT: retq
2299 ; AVX512DQ-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2300 ; AVX512DQ-SLOW: # %bb.0:
2301 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0
2302 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2303 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1
2304 ; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
2305 ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
2306 ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2307 ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2308 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
2309 ; AVX512DQ-SLOW-NEXT: vzeroupper
2310 ; AVX512DQ-SLOW-NEXT: retq
2312 ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2313 ; AVX512DQ-FAST: # %bb.0:
2314 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0
2315 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
2316 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2317 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2318 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7]
2319 ; AVX512DQ-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
2320 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0
2321 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx)
2322 ; AVX512DQ-FAST-NEXT: vzeroupper
2323 ; AVX512DQ-FAST-NEXT: retq
2325 ; AVX512BW-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2326 ; AVX512BW-SLOW: # %bb.0:
2327 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
2328 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2329 ; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2330 ; AVX512BW-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0
2331 ; AVX512BW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2332 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2333 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
2334 ; AVX512BW-SLOW-NEXT: vzeroupper
2335 ; AVX512BW-SLOW-NEXT: retq
2337 ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2338 ; AVX512BW-FAST: # %bb.0:
2339 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
2340 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7]
2341 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2342 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0
2343 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2344 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
2345 ; AVX512BW-FAST-NEXT: vzeroupper
2346 ; AVX512BW-FAST-NEXT: retq
2347 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2348 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2349 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2350 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
2351 %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
2352 %out.bytevec = bitcast <4 x i64> %broadcast.of.zextinreg to <32 x i8>
2353 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2354 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2355 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2356 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2357 ret void
2358 }
2360 define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2361 ; SSE2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2363 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2364 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
2365 ; SSE2-NEXT: paddb (%rsi), %xmm0
2366 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
2367 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2368 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,0,0,0,4,5,6,7]
2369 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
2370 ; SSE2-NEXT: pand %xmm2, %xmm3
2371 ; SSE2-NEXT: pandn %xmm1, %xmm2
2372 ; SSE2-NEXT: por %xmm3, %xmm2
2373 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2374 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2375 ; SSE2-NEXT: pxor %xmm1, %xmm1
2376 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2377 ; SSE2-NEXT: paddb (%rdx), %xmm2
2378 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
2379 ; SSE2-NEXT: paddb %xmm0, %xmm1
2380 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
2381 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
2382 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
2383 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
2386 ; SSE42-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2388 ; SSE42-NEXT: movdqa (%rdi), %xmm1
2389 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
2390 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
2391 ; SSE42-NEXT: paddb (%rsi), %xmm1
2392 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
2393 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
2394 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2395 ; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm2
2396 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
2397 ; SSE42-NEXT: paddb (%rdx), %xmm2
2398 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0
2399 ; SSE42-NEXT: paddb %xmm1, %xmm0
2400 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
2401 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
2402 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
2403 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
2406 ; AVX-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2408 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2409 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
2410 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2411 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2412 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
2413 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
2414 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2415 ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
2416 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
2417 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
2418 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
2419 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
2420 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
2421 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
2422 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
2425 ; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2427 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2428 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
2429 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2430 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2431 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
2432 ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,u,0,u,0,u,0,u,0,u,0,u,0,u,0,u,16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero
2433 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
2434 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2435 ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
2436 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
2437 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2438 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2439 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
2440 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
2441 ; AVX2-NEXT: vzeroupper
2444 ; AVX512F-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2446 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2447 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
2448 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2449 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2450 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2451 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
2452 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2453 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2454 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2455 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2456 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2457 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
2458 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
2459 ; AVX512F-NEXT: vzeroupper
2460 ; AVX512F-NEXT: retq
2462 ; AVX512DQ-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2463 ; AVX512DQ: # %bb.0:
2464 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2465 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
2466 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2467 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2468 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2469 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
2470 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2471 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2472 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2473 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2474 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2475 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
2476 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
2477 ; AVX512DQ-NEXT: vzeroupper
2478 ; AVX512DQ-NEXT: retq
2480 ; AVX512BW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2481 ; AVX512BW: # %bb.0:
2482 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2483 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2484 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2485 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2486 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2487 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
2488 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2489 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2490 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2491 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
2492 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2493 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2494 ; AVX512BW-NEXT: vzeroupper
2495 ; AVX512BW-NEXT: retq
2496 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2497 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2498 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2499 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63, i32 0, i32 65, i32 0, i32 67, i32 0, i32 69, i32 0, i32 71, i32 0, i32 73, i32 0, i32 75, i32 0, i32 77, i32 0, i32 79, i32 0, i32 81, i32 0, i32 83, i32 0, i32 85, i32 0, i32 87, i32 0, i32 89, i32 0, i32 91, i32 0, i32 93, i32 0, i32 95>
2500 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2501 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2502 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2503 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2504 ret void
2505 }
2507 define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2508 ; SSE2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2510 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2511 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
2512 ; SSE2-NEXT: paddb (%rsi), %xmm0
2513 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
2514 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
2515 ; SSE2-NEXT: pand %xmm2, %xmm1
2516 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2517 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2518 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2519 ; SSE2-NEXT: pandn %xmm0, %xmm2
2520 ; SSE2-NEXT: por %xmm1, %xmm2
2521 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0]
2522 ; SSE2-NEXT: pand %xmm0, %xmm1
2523 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2524 ; SSE2-NEXT: paddb (%rdx), %xmm2
2525 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
2526 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
2527 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
2528 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
2529 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
2532 ; SSE42-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2534 ; SSE42-NEXT: movdqa (%rdi), %xmm1
2535 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
2536 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
2537 ; SSE42-NEXT: paddb (%rsi), %xmm1
2538 ; SSE42-NEXT: pxor %xmm0, %xmm0
2539 ; SSE42-NEXT: movdqa %xmm1, %xmm3
2540 ; SSE42-NEXT: pshufb %xmm0, %xmm3
2541 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
2542 ; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3
2543 ; SSE42-NEXT: movdqa %xmm1, %xmm0
2544 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero
2545 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero
2546 ; SSE42-NEXT: paddb (%rdx), %xmm3
2547 ; SSE42-NEXT: paddb 16(%rdx), %xmm1
2548 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
2549 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
2550 ; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
2551 ; SSE42-NEXT: movdqa %xmm3, (%rcx)
2554 ; AVX-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2556 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2557 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
2558 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2559 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2560 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
2561 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2
2562 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
2563 ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
2564 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero
2565 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero
2566 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
2567 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
2568 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
2569 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
2570 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
2571 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
2574 ; AVX2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2576 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
2577 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2578 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
2579 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
2580 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm2
2581 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero
2582 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
2583 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
2584 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
2585 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
2586 ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
2587 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero
2588 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2589 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
2590 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
2591 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
2592 ; AVX2-NEXT: vzeroupper
2595 ; AVX512F-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2597 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2598 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
2599 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2600 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2601 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2602 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2603 ; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm2
2604 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2605 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2606 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero
2607 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2608 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2609 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
2610 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
2611 ; AVX512F-NEXT: vzeroupper
2612 ; AVX512F-NEXT: retq
2614 ; AVX512DQ-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2615 ; AVX512DQ: # %bb.0:
2616 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2617 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
2618 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2619 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2620 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2621 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2622 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm2
2623 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2624 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2625 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero
2626 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2627 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2628 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
2629 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
2630 ; AVX512DQ-NEXT: vzeroupper
2631 ; AVX512DQ-NEXT: retq
2633 ; AVX512BW-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2634 ; AVX512BW: # %bb.0:
2635 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2636 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2637 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2638 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2639 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2640 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2641 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm2
2642 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2643 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2644 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero
2645 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
2646 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2647 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2648 ; AVX512BW-NEXT: vzeroupper
2649 ; AVX512BW-NEXT: retq
2650 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2651 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2652 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2653 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 0, i32 52, i32 53, i32 0, i32 55, i32 56, i32 0, i32 58, i32 59, i32 0, i32 61, i32 62, i32 0, i32 64, i32 65, i32 0, i32 67, i32 68, i32 0, i32 70, i32 71, i32 0, i32 73, i32 74, i32 0, i32 76, i32 77, i32 0, i32 79, i32 80, i32 0, i32 82, i32 83, i32 0, i32 85, i32 86, i32 0, i32 88, i32 89, i32 0, i32 91, i32 92, i32 0, i32 94, i32 95>
2654 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2655 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2656 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2657 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2658 ret void
2659 }
2661 define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2662 ; SSE2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2664 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2665 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
2666 ; SSE2-NEXT: paddb (%rsi), %xmm0
2667 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
2668 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2669 ; SSE2-NEXT: pand %xmm2, %xmm1
2670 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2671 ; SSE2-NEXT: pandn %xmm0, %xmm2
2672 ; SSE2-NEXT: por %xmm1, %xmm2
2673 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2674 ; SSE2-NEXT: paddb (%rdx), %xmm2
2675 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
2676 ; SSE2-NEXT: paddb %xmm0, %xmm1
2677 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
2678 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
2679 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
2680 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
2683 ; SSE42-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2685 ; SSE42-NEXT: movdqa (%rdi), %xmm1
2686 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
2687 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
2688 ; SSE42-NEXT: paddb (%rsi), %xmm1
2689 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
2690 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2691 ; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3
2692 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
2693 ; SSE42-NEXT: paddb (%rdx), %xmm3
2694 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0
2695 ; SSE42-NEXT: paddb %xmm1, %xmm0
2696 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
2697 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
2698 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
2699 ; SSE42-NEXT: movdqa %xmm3, (%rcx)
2702 ; AVX-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2704 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2705 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
2706 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2707 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2708 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
2709 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2710 ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
2711 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
2712 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
2713 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
2714 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
2715 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
2716 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
2717 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
2720 ; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2722 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2723 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
2724 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2725 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2726 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
2727 ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u,16],zero,zero,zero,ymm2[16],zero,zero,zero,ymm2[16],zero,zero,zero,ymm2[16],zero,zero,zero
2728 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
2729 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2730 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2731 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
2732 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2733 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2734 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
2735 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
2736 ; AVX2-NEXT: vzeroupper
2739 ; AVX512F-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2741 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2742 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
2743 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2744 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2745 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
2746 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2747 ; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm3
2748 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
2749 ; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
2750 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
2751 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2752 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2753 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
2754 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
2755 ; AVX512F-NEXT: vzeroupper
2756 ; AVX512F-NEXT: retq
2758 ; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2759 ; AVX512DQ: # %bb.0:
2760 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2761 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
2762 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2763 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2764 ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
2765 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2766 ; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm3
2767 ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2
2768 ; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
2769 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
2770 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2771 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2772 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
2773 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
2774 ; AVX512DQ-NEXT: vzeroupper
2775 ; AVX512DQ-NEXT: retq
2777 ; AVX512BW-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2778 ; AVX512BW: # %bb.0:
2779 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2780 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2781 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2782 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2783 ; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2
2784 ; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
2785 ; AVX512BW-NEXT: kmovd %eax, %k1
2786 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
2787 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2788 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
2789 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
2790 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2791 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2792 ; AVX512BW-NEXT: vzeroupper
2793 ; AVX512BW-NEXT: retq
2794 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2795 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2796 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2797 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 0, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 0, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 0, i32 93, i32 94, i32 95>
2798 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2799 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2800 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2801 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2802 ret void
2803 }
2805 define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2806 ; SSE2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2808 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2809 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
2810 ; SSE2-NEXT: paddb (%rsi), %xmm0
2811 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
2812 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
2813 ; SSE2-NEXT: pand %xmm2, %xmm1
2814 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2815 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2816 ; SSE2-NEXT: pandn %xmm0, %xmm2
2817 ; SSE2-NEXT: por %xmm1, %xmm2
2818 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,255,0,0,0,0,0,255,0,0,0,0,0]
2819 ; SSE2-NEXT: pand %xmm0, %xmm1
2820 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2821 ; SSE2-NEXT: paddb (%rdx), %xmm2
2822 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
2823 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
2824 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
2825 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
2826 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
2829 ; SSE42-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2831 ; SSE42-NEXT: movdqa (%rdi), %xmm1
2832 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
2833 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
2834 ; SSE42-NEXT: paddb (%rsi), %xmm1
2835 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
2836 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
2837 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
2838 ; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3
2839 ; SSE42-NEXT: movdqa %xmm1, %xmm0
2840 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero
2841 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero
2842 ; SSE42-NEXT: paddb (%rdx), %xmm3
2843 ; SSE42-NEXT: paddb 16(%rdx), %xmm1
2844 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
2845 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
2846 ; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
2847 ; SSE42-NEXT: movdqa %xmm3, (%rcx)
2850 ; AVX-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2852 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2853 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
2854 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2855 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2856 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
2857 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
2858 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
2859 ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
2860 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero
2861 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero
2862 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
2863 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
2864 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
2865 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
2866 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
2867 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
2870 ; AVX2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2872 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
2873 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2874 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
2875 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
2876 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm2
2877 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero
2878 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
2879 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
2880 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
2881 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
2882 ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
2883 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero
2884 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2885 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
2886 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
2887 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
2888 ; AVX2-NEXT: vzeroupper
2891 ; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2893 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2894 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
2895 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2896 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2897 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2898 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2899 ; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm2
2900 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2901 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2902 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero
2903 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2904 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2905 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
2906 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
2907 ; AVX512F-NEXT: vzeroupper
2908 ; AVX512F-NEXT: retq
2910 ; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2911 ; AVX512DQ: # %bb.0:
2912 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2913 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
2914 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2915 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2916 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2917 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2918 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm2
2919 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2920 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2921 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero
2922 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2923 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2924 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
2925 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
2926 ; AVX512DQ-NEXT: vzeroupper
2927 ; AVX512DQ-NEXT: retq
2929 ; AVX512BW-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2930 ; AVX512BW: # %bb.0:
2931 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2932 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2933 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2934 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2935 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2936 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2937 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm2
2938 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2939 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2940 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero
2941 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
2942 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2943 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2944 ; AVX512BW-NEXT: vzeroupper
2945 ; AVX512BW-NEXT: retq
2946 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2947 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2948 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2949 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 0, i32 55, i32 56, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 64, i32 65, i32 0, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 0, i32 79, i32 80, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 88, i32 89, i32 0, i32 91, i32 92, i32 93, i32 94, i32 95>
2950 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2951 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2952 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2953 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2954 ret void
2955 }
2957 define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2958 ; SSE2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2960 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2961 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
2962 ; SSE2-NEXT: paddb (%rsi), %xmm0
2963 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
2964 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2965 ; SSE2-NEXT: pand %xmm2, %xmm1
2966 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
2967 ; SSE2-NEXT: pandn %xmm0, %xmm2
2968 ; SSE2-NEXT: por %xmm1, %xmm2
2969 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2970 ; SSE2-NEXT: paddb (%rdx), %xmm2
2971 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
2972 ; SSE2-NEXT: paddb %xmm0, %xmm1
2973 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
2974 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
2975 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
2976 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
2979 ; SSE42-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2981 ; SSE42-NEXT: movdqa (%rdi), %xmm1
2982 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
2983 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
2984 ; SSE42-NEXT: paddb (%rsi), %xmm1
2985 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
2986 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2987 ; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3
2988 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
2989 ; SSE42-NEXT: paddb (%rdx), %xmm3
2990 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0
2991 ; SSE42-NEXT: paddb %xmm1, %xmm0
2992 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
2993 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
2994 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
2995 ; SSE42-NEXT: movdqa %xmm3, (%rcx)
2998 ; AVX-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
3000 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3001 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
3002 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3003 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3004 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
3005 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
3006 ; AVX-NEXT: # xmm3 = mem[0,0]
3007 ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
3008 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
3009 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3010 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
3011 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
3012 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
3013 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
3014 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3017 ; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
3019 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
3020 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
3021 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
3022 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3023 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3024 ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u,16],zero,zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,zero
3025 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
3026 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
3027 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
3028 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
3029 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3030 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3031 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
3032 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
3033 ; AVX2-NEXT: vzeroupper
3036 ; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
3038 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
3039 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
3040 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3041 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
3042 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
3043 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3044 ; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm3
3045 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
3046 ; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
3047 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
3048 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
3049 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3050 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
3051 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
3052 ; AVX512F-NEXT: vzeroupper
3053 ; AVX512F-NEXT: retq
3055 ; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
3056 ; AVX512DQ: # %bb.0:
3057 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
3058 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
3059 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3060 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
3061 ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
3062 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3063 ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm3
3064 ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2
3065 ; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
3066 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
3067 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
3068 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3069 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
3070 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
3071 ; AVX512DQ-NEXT: vzeroupper
3072 ; AVX512DQ-NEXT: retq
3074 ; AVX512BW-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
3075 ; AVX512BW: # %bb.0:
3076 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3077 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3078 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3079 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3080 ; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2
3081 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
3082 ; AVX512BW-NEXT: kmovd %eax, %k1
3083 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
3084 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3085 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
3086 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3087 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3088 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3089 ; AVX512BW-NEXT: vzeroupper
3090 ; AVX512BW-NEXT: retq
3091 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3092 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3093 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3094 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3095 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3096 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3097 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}
3102 define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; SSE2: # %bb.0:
3105 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3106 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
3107 ; SSE2-NEXT: paddb (%rsi), %xmm0
3108 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
3109 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
3110 ; SSE2-NEXT: pand %xmm2, %xmm1
3111 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3112 ; SSE2-NEXT: pandn %xmm3, %xmm2
3113 ; SSE2-NEXT: por %xmm1, %xmm2
3114 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
3115 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3116 ; SSE2-NEXT: movdqa %xmm0, %xmm1
3117 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
3118 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
3119 ; SSE2-NEXT: paddb (%rdx), %xmm2
3120 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
3121 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
3122 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
3123 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; SSE42: # %bb.0:
3129 ; SSE42-NEXT: movdqa (%rdi), %xmm1
3130 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
3131 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
3132 ; SSE42-NEXT: paddb (%rsi), %xmm1
3133 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
3134 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
3135 ; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3
3136 ; SSE42-NEXT: movdqa %xmm1, %xmm0
3137 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3138 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
3139 ; SSE42-NEXT: paddb (%rdx), %xmm3
3140 ; SSE42-NEXT: paddb 16(%rdx), %xmm1
3141 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
3142 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
3143 ; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
; SSE42-NEXT: movdqa %xmm3, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; AVX: # %bb.0:
3149 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3150 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
3151 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3152 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3153 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
3154 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
3155 ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
3156 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
3157 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3158 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3159 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
3160 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
3161 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
3162 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; AVX2: # %bb.0:
3168 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
3169 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
3170 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
3171 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
3172 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm2
3173 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
3174 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
3175 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
3176 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
3177 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
3178 ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
3179 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3180 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3181 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
3182 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
3183 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
; AVX512F: # %bb.0:
3189 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
3190 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
3191 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3192 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3193 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3194 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3195 ; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm2
3196 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
3197 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3198 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3199 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3200 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3201 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
3202 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
3203 ; AVX512F-NEXT: vzeroupper
3204 ; AVX512F-NEXT: retq
3206 ; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3207 ; AVX512DQ: # %bb.0:
3208 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
3209 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
3210 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3211 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3212 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3213 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3214 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm2
3215 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
3216 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3217 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3218 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3219 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3220 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
3221 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
3222 ; AVX512DQ-NEXT: vzeroupper
3223 ; AVX512DQ-NEXT: retq
3225 ; AVX512BW-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3226 ; AVX512BW: # %bb.0:
3227 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3228 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3229 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3230 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3231 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3232 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3233 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm2
3234 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
3235 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3236 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3237 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3238 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3239 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3240 ; AVX512BW-NEXT: vzeroupper
3241 ; AVX512BW-NEXT: retq
3242 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3243 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3244 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3245 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3246 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3247 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3248 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}
3253 define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
; SSE2: # %bb.0:
3256 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3257 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
3258 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
3259 ; SSE2-NEXT: paddb (%rsi), %xmm0
3260 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3261 ; SSE2-NEXT: pand %xmm2, %xmm1
3262 ; SSE2-NEXT: pandn %xmm0, %xmm2
3263 ; SSE2-NEXT: por %xmm1, %xmm2
3264 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3265 ; SSE2-NEXT: paddb (%rdx), %xmm2
3266 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
3267 ; SSE2-NEXT: paddb %xmm0, %xmm1
3268 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
3269 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
3270 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
; SSE42: # %bb.0:
3276 ; SSE42-NEXT: movdqa (%rdi), %xmm1
3277 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
3278 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
3279 ; SSE42-NEXT: paddb (%rsi), %xmm1
3280 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3281 ; SSE42-NEXT: movdqa %xmm1, %xmm3
3282 ; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3
3283 ; SSE42-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
3284 ; SSE42-NEXT: paddb (%rdx), %xmm3
3285 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0
3286 ; SSE42-NEXT: paddb %xmm1, %xmm0
3287 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
3288 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
3289 ; SSE42-NEXT: movdqa %xmm3, (%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
3295 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3296 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
3297 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3298 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3299 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3300 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1
3301 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3302 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3303 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
3304 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
3305 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3306 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
3312 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
3313 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
3314 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
3315 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3316 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3317 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
3318 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
3319 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3320 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
3321 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
3322 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3323 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3324 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3325 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
3326 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
; AVX512F: # %bb.0:
3332 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
3333 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3334 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
3335 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3336 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3337 ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
3338 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
3339 ; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,1]
3340 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
3341 ; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
3342 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3343 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
3344 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3345 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
3346 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
3347 ; AVX512F-NEXT: vzeroupper
3348 ; AVX512F-NEXT: retq
3350 ; AVX512DQ-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3351 ; AVX512DQ: # %bb.0:
3352 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
3353 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3354 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
3355 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3356 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3357 ; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
3358 ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
3359 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,1]
3360 ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2
3361 ; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
3362 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3363 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
3364 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3365 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
3366 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
3367 ; AVX512DQ-NEXT: vzeroupper
3368 ; AVX512DQ-NEXT: retq
3370 ; AVX512BW-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3371 ; AVX512BW: # %bb.0:
3372 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3373 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3374 ; AVX512BW-NEXT: movw $1, %ax
3375 ; AVX512BW-NEXT: kmovd %eax, %k1
3376 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} {z}
3377 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2
3378 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
3379 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
3380 ; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001
3381 ; AVX512BW-NEXT: kmovd %eax, %k1
3382 ; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1}
3383 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
3384 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
3385 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3386 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3387 ; AVX512BW-NEXT: vzeroupper
3388 ; AVX512BW-NEXT: retq
3389 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3390 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3391 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3392 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3393 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3394 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3395 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}
3400 define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
; SSE2: # %bb.0:
3403 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3404 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
3405 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
3406 ; SSE2-NEXT: paddb (%rsi), %xmm0
3407 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3408 ; SSE2-NEXT: pand %xmm2, %xmm1
3409 ; SSE2-NEXT: pandn %xmm0, %xmm2
3410 ; SSE2-NEXT: por %xmm1, %xmm2
3411 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
3412 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3413 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
3414 ; SSE2-NEXT: movaps 32(%rdx), %xmm1
3415 ; SSE2-NEXT: paddb (%rdx), %xmm2
3416 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
3417 ; SSE2-NEXT: movaps %xmm1, 32(%rcx)
3418 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
3424 ; SSE42-NEXT: movdqa (%rdi), %xmm1
3425 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
3426 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
3427 ; SSE42-NEXT: paddb (%rsi), %xmm1
3428 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3429 ; SSE42-NEXT: movdqa %xmm1, %xmm3
3430 ; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3
3431 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
3432 ; SSE42-NEXT: movaps 32(%rdx), %xmm0
3433 ; SSE42-NEXT: paddb (%rdx), %xmm3
3434 ; SSE42-NEXT: paddb 16(%rdx), %xmm1
3435 ; SSE42-NEXT: movaps %xmm0, 32(%rcx)
3436 ; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
; SSE42-NEXT: movdqa %xmm3, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
; AVX: # %bb.0:
3442 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3443 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
3444 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3445 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3446 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3447 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1
3448 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
3449 ; AVX-NEXT: vmovaps 32(%rdx), %ymm2
3450 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3451 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
3452 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx)
3453 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3454 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
; AVX2: # %bb.0:
3460 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
3461 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
3462 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
3463 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
3464 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm2
3465 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
3466 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
3467 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
3468 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
3469 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3470 ; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
3471 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3472 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
3473 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3474 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
3475 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
; AVX512F: # %bb.0:
3481 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
3482 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
3483 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3484 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3485 ; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
3486 ; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm0
3487 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3488 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3489 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3490 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
3491 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
3492 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
3493 ; AVX512F-NEXT: vzeroupper
3494 ; AVX512F-NEXT: retq
3496 ; AVX512DQ-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3497 ; AVX512DQ: # %bb.0:
3498 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
3499 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
3500 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3501 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3502 ; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
3503 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm0
3504 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3505 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3506 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3507 ; AVX512DQ-NEXT: vmovaps 32(%rdx), %ymm1
3508 ; AVX512DQ-NEXT: vmovaps %ymm1, 32(%rcx)
3509 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
3510 ; AVX512DQ-NEXT: vzeroupper
3511 ; AVX512DQ-NEXT: retq
3513 ; AVX512BW-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3514 ; AVX512BW: # %bb.0:
3515 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3516 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3517 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3518 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3519 ; AVX512BW-NEXT: movw $1, %ax
3520 ; AVX512BW-NEXT: kmovd %eax, %k1
3521 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
3522 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0
3523 ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3524 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3525 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3526 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3527 ; AVX512BW-NEXT: vzeroupper
3528 ; AVX512BW-NEXT: retq
3529 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3530 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3531 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3532 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3533 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3534 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3535 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}
3540 define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; SSE2: # %bb.0:
3543 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3544 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
3545 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
3546 ; SSE2-NEXT: paddb (%rsi), %xmm0
3547 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3548 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
3549 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
3550 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3551 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
3552 ; SSE2-NEXT: movdqa %xmm0, %xmm2
3553 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3554 ; SSE2-NEXT: pxor %xmm1, %xmm1
3555 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3556 ; SSE2-NEXT: paddb (%rdx), %xmm2
3557 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
3558 ; SSE2-NEXT: paddb %xmm0, %xmm1
3559 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
3560 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
3561 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; SSE42: # %bb.0:
3567 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3568 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
3569 ; SSE42-NEXT: paddb (%rsi), %xmm0
3570 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
3571 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
3572 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3573 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3574 ; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3575 ; SSE42-NEXT: paddb (%rdx), %xmm0
3576 ; SSE42-NEXT: movdqa 16(%rdx), %xmm1
3577 ; SSE42-NEXT: paddb %xmm2, %xmm1
3578 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
3579 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
3580 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX: # %bb.0:
3586 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3587 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
3588 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3589 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3590 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
3591 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
3592 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3593 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3594 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
3595 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3596 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
3597 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
3598 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
3599 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3600 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX2: # %bb.0:
3606 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
3607 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
3608 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3609 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3610 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
3611 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3612 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3613 ; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3614 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3615 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3616 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
3617 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX512F: # %bb.0:
3623 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
3624 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
3625 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3626 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3627 ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
3628 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3629 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3630 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3631 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3632 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3633 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
3634 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
3635 ; AVX512F-NEXT: vzeroupper
3636 ; AVX512F-NEXT: retq
3638 ; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3639 ; AVX512DQ: # %bb.0:
3640 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
3641 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
3642 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3643 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3644 ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
3645 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3646 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3647 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3648 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3649 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3650 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
3651 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
3652 ; AVX512DQ-NEXT: vzeroupper
3653 ; AVX512DQ-NEXT: retq
3655 ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3656 ; AVX512BW-SLOW: # %bb.0:
3657 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
3658 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47]
3659 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3660 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
3661 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
3662 ; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0
3663 ; AVX512BW-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3664 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
3665 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3666 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
3667 ; AVX512BW-SLOW-NEXT: vzeroupper
3668 ; AVX512BW-SLOW-NEXT: retq
3670 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3671 ; AVX512BW-FAST: # %bb.0:
3672 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
3673 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47]
3674 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3675 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
3676 ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
3677 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
3678 ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3679 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3680 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
3681 ; AVX512BW-FAST-NEXT: vzeroupper
3682 ; AVX512BW-FAST-NEXT: retq
3683 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3684 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3685 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3686 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
3687 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31, i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47>
3688 %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8>
3689 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3690 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3691 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}
3696 define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; SSE2: # %bb.0:
3699 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3700 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
3701 ; SSE2-NEXT: paddb (%rsi), %xmm0
3702 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
3703 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535]
3704 ; SSE2-NEXT: pand %xmm2, %xmm1
3705 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3706 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3707 ; SSE2-NEXT: pandn %xmm0, %xmm2
3708 ; SSE2-NEXT: por %xmm1, %xmm2
3709 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65535,0,0,65535,0,0]
3710 ; SSE2-NEXT: pand %xmm0, %xmm1
3711 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3712 ; SSE2-NEXT: paddb (%rdx), %xmm2
3713 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
3714 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
3715 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
3716 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; SSE42: # %bb.0:
3722 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3723 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
3724 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
3725 ; SSE42-NEXT: paddb (%rsi), %xmm0
3726 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3727 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3728 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
3729 ; SSE42-NEXT: pxor %xmm2, %xmm2
3730 ; SSE42-NEXT: pxor %xmm3, %xmm3
3731 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7]
3732 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
3733 ; SSE42-NEXT: paddb (%rdx), %xmm1
3734 ; SSE42-NEXT: paddb 16(%rdx), %xmm2
3735 ; SSE42-NEXT: paddb 32(%rdx), %xmm3
3736 ; SSE42-NEXT: movdqa %xmm3, 32(%rcx)
3737 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX: # %bb.0:
3743 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3744 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
3745 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3746 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3747 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3748 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3749 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
3750 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
3751 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
3752 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3753 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
3754 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
3755 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
3756 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3757 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX2: # %bb.0:
3763 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
3764 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
3765 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3766 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3767 ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
3768 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
3769 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3770 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3771 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
3772 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
3773 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3774 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3775 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
3776 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
3780 ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3781 ; AVX512F-SLOW: # %bb.0:
3782 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
3783 ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
3784 ; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3785 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3786 ; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
3787 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
3788 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
3789 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15]
3790 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3791 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
3792 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
3793 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3794 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3795 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
3796 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
3797 ; AVX512F-SLOW-NEXT: vzeroupper
3798 ; AVX512F-SLOW-NEXT: retq
3800 ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3801 ; AVX512F-FAST: # %bb.0:
3802 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
3803 ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
3804 ; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3805 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3806 ; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm2
3807 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
3808 ; AVX512F-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3
3809 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15]
3810 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3811 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero
3812 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3813 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3814 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
3815 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx)
3816 ; AVX512F-FAST-NEXT: vzeroupper
3817 ; AVX512F-FAST-NEXT: retq
3819 ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3820 ; AVX512DQ-SLOW: # %bb.0:
3821 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0
3822 ; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
3823 ; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3824 ; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3825 ; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
3826 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
3827 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
3828 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15]
3829 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3830 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
3831 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
3832 ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3833 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3834 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
3835 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
3836 ; AVX512DQ-SLOW-NEXT: vzeroupper
3837 ; AVX512DQ-SLOW-NEXT: retq
3839 ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3840 ; AVX512DQ-FAST: # %bb.0:
3841 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0
3842 ; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
3843 ; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3844 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3845 ; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2
3846 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
3847 ; AVX512DQ-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3
3848 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15]
3849 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3850 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero
3851 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3852 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3853 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
3854 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx)
3855 ; AVX512DQ-FAST-NEXT: vzeroupper
3856 ; AVX512DQ-FAST-NEXT: retq
3858 ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3859 ; AVX512BW-SLOW: # %bb.0:
3860 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
3861 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0]
3862 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3863 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
3864 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
3865 ; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0
3866 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
3867 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
3868 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3869 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3870 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
3871 ; AVX512BW-SLOW-NEXT: vzeroupper
3872 ; AVX512BW-SLOW-NEXT: retq
3874 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3875 ; AVX512BW-FAST: # %bb.0:
3876 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
3877 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0]
3878 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3879 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
3880 ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
3881 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero
3882 ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3883 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3884 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
3885 ; AVX512BW-FAST-NEXT: vzeroupper
3886 ; AVX512BW-FAST-NEXT: retq
3887 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3888 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3889 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3890 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
3891 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 0, i32 28, i32 29, i32 0, i32 31, i32 32, i32 0, i32 34, i32 35, i32 0, i32 37, i32 38, i32 0, i32 40, i32 41, i32 0, i32 43, i32 44, i32 0, i32 46, i32 47>
3892 %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8>
3893 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3894 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3895 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}
3900 define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; SSE2: # %bb.0:
3903 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3904 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
3905 ; SSE2-NEXT: paddb (%rsi), %xmm0
3906 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
3907 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
3908 ; SSE2-NEXT: pand %xmm2, %xmm1
3909 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3910 ; SSE2-NEXT: pandn %xmm0, %xmm2
3911 ; SSE2-NEXT: por %xmm1, %xmm2
3912 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3913 ; SSE2-NEXT: paddb (%rdx), %xmm2
3914 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
3915 ; SSE2-NEXT: paddb %xmm0, %xmm1
3916 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
3917 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
3918 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; SSE42: # %bb.0:
3924 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3925 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
3926 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
3927 ; SSE42-NEXT: paddb (%rsi), %xmm0
3928 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3929 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
3930 ; SSE42-NEXT: pxor %xmm2, %xmm2
3931 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
3932 ; SSE42-NEXT: paddb (%rdx), %xmm1
3933 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0
3934 ; SSE42-NEXT: paddb %xmm2, %xmm0
3935 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
3936 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
3937 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX: # %bb.0:
3943 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3944 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
3945 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3946 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3947 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3948 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
3949 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
3950 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
3951 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3952 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
3953 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
3954 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
3955 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3956 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT: retq
;
3960 ; AVX2-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3961 ; AVX2-SLOW: # %bb.0:
3962 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
3963 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
3964 ; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3965 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3966 ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2
3967 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
3968 ; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3969 ; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0
3970 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
3971 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
3972 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3973 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3974 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
3975 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
3976 ; AVX2-SLOW-NEXT: vzeroupper
3977 ; AVX2-SLOW-NEXT: retq
3979 ; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3980 ; AVX2-FAST-PERLANE: # %bb.0:
3981 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
3982 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1
3983 ; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3984 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3985 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm2
3986 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
3987 ; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3988 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
3989 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3990 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3991 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
3992 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
3993 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
3994 ; AVX2-FAST-PERLANE-NEXT: retq
3996 ; AVX2-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3997 ; AVX2-FAST: # %bb.0:
3998 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
3999 ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
4000 ; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4001 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4002 ; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm2
4003 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
4004 ; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
4005 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
4006 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4007 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4008 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
4009 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
4010 ; AVX2-FAST-NEXT: vzeroupper
4011 ; AVX2-FAST-NEXT: retq
4013 ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
4014 ; AVX512F-SLOW: # %bb.0:
4015 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
4016 ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
4017 ; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4018 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4019 ; AVX512F-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2
4020 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
4021 ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
4022 ; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0
4023 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4024 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
4025 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4026 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4027 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
4028 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
4029 ; AVX512F-SLOW-NEXT: vzeroupper
4030 ; AVX512F-SLOW-NEXT: retq
4032 ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
4033 ; AVX512F-FAST: # %bb.0:
4034 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0
4035 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4036 ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
4037 ; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4038 ; AVX512F-FAST-NEXT: vpbroadcastq %xmm0, %ymm2
4039 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
4040 ; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
4041 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
4042 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4043 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4044 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
4045 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx)
4046 ; AVX512F-FAST-NEXT: vzeroupper
4047 ; AVX512F-FAST-NEXT: retq
4049 ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
4050 ; AVX512DQ-SLOW: # %bb.0:
4051 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0
4052 ; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
4053 ; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4054 ; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4055 ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2
4056 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
4057 ; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
4058 ; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0
4059 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4060 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
4061 ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4062 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4063 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
4064 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
4065 ; AVX512DQ-SLOW-NEXT: vzeroupper
4066 ; AVX512DQ-SLOW-NEXT: retq
4068 ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
4069 ; AVX512DQ-FAST: # %bb.0:
4070 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0
4071 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4072 ; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
4073 ; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4074 ; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm2
4075 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
4076 ; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
4077 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
4078 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4079 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4080 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
4081 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx)
4082 ; AVX512DQ-FAST-NEXT: vzeroupper
4083 ; AVX512DQ-FAST-NEXT: retq
4085 ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
4086 ; AVX512BW-SLOW: # %bb.0:
4087 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
4088 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47]
4089 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4090 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4091 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
4092 ; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0
4093 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4094 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
4095 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4096 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4097 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
4098 ; AVX512BW-SLOW-NEXT: vzeroupper
4099 ; AVX512BW-SLOW-NEXT: retq
4101 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
4102 ; AVX512BW-FAST: # %bb.0:
4103 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
4104 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47]
4105 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4106 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
4107 ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
4108 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
4109 ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4110 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4111 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4112 ; AVX512BW-FAST-NEXT: vzeroupper
4113 ; AVX512BW-FAST-NEXT: retq
4114 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4115 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4116 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4117 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
4118 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31, i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47>
4119 %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8>
4120 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4121 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4122 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4123 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4124 ret void
4125 }
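; Broadcast of i16 element 0 at an i96 (6 x i16) stride into 4 positions of the 384-bit result; the remaining lanes come from the upper input words or are zero.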
4127 define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4128 ; SSE2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
4129 ; SSE2: # %bb.0:
4130 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4131 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
4132 ; SSE2-NEXT: paddb (%rsi), %xmm0
4133 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
4134 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535]
4135 ; SSE2-NEXT: pand %xmm2, %xmm1
4136 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
4137 ; SSE2-NEXT: pandn %xmm3, %xmm2
4138 ; SSE2-NEXT: por %xmm1, %xmm2
4139 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
4140 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4141 ; SSE2-NEXT: movdqa %xmm0, %xmm1
4142 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
4143 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
4144 ; SSE2-NEXT: paddb (%rdx), %xmm2
4145 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
4146 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
4147 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
4148 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
4149 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
4150 ; SSE2-NEXT: retq
4151 ;
4152 ; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
4153 ; SSE42: # %bb.0:
4154 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4155 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
4156 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
4157 ; SSE42-NEXT: paddb (%rsi), %xmm0
4158 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
4159 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
4160 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
4161 ; SSE42-NEXT: pxor %xmm3, %xmm3
4162 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
4163 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4164 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7]
4165 ; SSE42-NEXT: paddb (%rdx), %xmm2
4166 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
4167 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
4168 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
4169 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
4170 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
4171 ; SSE42-NEXT: retq
4172 ;
4173 ; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
4174 ; AVX: # %bb.0:
4175 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4176 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
4177 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4178 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4179 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
4180 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
4181 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
4182 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
4183 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
4184 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7]
4185 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
4186 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
4187 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2
4188 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
4189 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
4190 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
4191 ; AVX-NEXT: movdqa %xmm0, 32(%rcx)
4192 ; AVX-NEXT: retq
4193 ;
4194 ; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
4195 ; AVX2: # %bb.0:
4196 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
4197 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
4198 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4199 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4200 ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
4201 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
4202 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
4203 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
4204 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
4205 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
4206 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4207 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4208 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
4209 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
4210 ; AVX2-NEXT: vzeroupper
4211 ; AVX2-NEXT: retq
4212 ;
4213 ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
4214 ; AVX512F-SLOW: # %bb.0:
4215 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
4216 ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
4217 ; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4218 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4219 ; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
4220 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
4221 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4222 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
4223 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4224 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4225 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
4226 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4227 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4228 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
4229 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
4230 ; AVX512F-SLOW-NEXT: vzeroupper
4231 ; AVX512F-SLOW-NEXT: retq
4233 ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
4234 ; AVX512F-FAST: # %bb.0:
4235 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
4236 ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
4237 ; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4238 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4239 ; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm2
4240 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
4241 ; AVX512F-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3
4242 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15]
4243 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4244 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4245 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4246 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4247 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
4248 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx)
4249 ; AVX512F-FAST-NEXT: vzeroupper
4250 ; AVX512F-FAST-NEXT: retq
4252 ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
4253 ; AVX512DQ-SLOW: # %bb.0:
4254 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0
4255 ; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
4256 ; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4257 ; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4258 ; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
4259 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
4260 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4261 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
4262 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4263 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4264 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
4265 ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4266 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4267 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
4268 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
4269 ; AVX512DQ-SLOW-NEXT: vzeroupper
4270 ; AVX512DQ-SLOW-NEXT: retq
4272 ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
4273 ; AVX512DQ-FAST: # %bb.0:
4274 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0
4275 ; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
4276 ; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4277 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4278 ; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2
4279 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
4280 ; AVX512DQ-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3
4281 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15]
4282 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4283 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4284 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4285 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4286 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
4287 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx)
4288 ; AVX512DQ-FAST-NEXT: vzeroupper
4289 ; AVX512DQ-FAST-NEXT: retq
4291 ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
4292 ; AVX512BW-SLOW: # %bb.0:
4293 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
4294 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47]
4295 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4296 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4297 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
4298 ; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0
4299 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4300 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
4301 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4302 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4303 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
4304 ; AVX512BW-SLOW-NEXT: vzeroupper
4305 ; AVX512BW-SLOW-NEXT: retq
4307 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
4308 ; AVX512BW-FAST: # %bb.0:
4309 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
4310 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47]
4311 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4312 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
4313 ; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
4314 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4315 ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4316 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4317 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4318 ; AVX512BW-FAST-NEXT: vzeroupper
4319 ; AVX512BW-FAST-NEXT: retq
4320 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4321 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4322 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4323 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
4324 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 0, i32 31, i32 32, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 40, i32 41, i32 0, i32 43, i32 44, i32 45, i32 46, i32 47>
4325 %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8>
4326 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4327 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4328 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4329 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4330 ret void
4331 }
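; Broadcast of i16 element 0 at an i128 (8 x i16) stride into 3 positions of the 384-bit result; the remaining lanes come from the upper input words or are zero.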
4333 define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4334 ; SSE2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
4335 ; SSE2: # %bb.0:
4336 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4337 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
4338 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
4339 ; SSE2-NEXT: paddb (%rsi), %xmm0
4340 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
4341 ; SSE2-NEXT: pand %xmm2, %xmm1
4342 ; SSE2-NEXT: pandn %xmm0, %xmm2
4343 ; SSE2-NEXT: por %xmm1, %xmm2
4344 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4345 ; SSE2-NEXT: paddb (%rdx), %xmm2
4346 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
4347 ; SSE2-NEXT: paddb %xmm0, %xmm1
4348 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
4349 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
4350 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
4351 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
4352 ; SSE2-NEXT: retq
4353 ;
4354 ; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
4355 ; SSE42: # %bb.0:
4356 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4357 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
4358 ; SSE42-NEXT: paddb (%rsi), %xmm0
4359 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
4360 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4361 ; SSE42-NEXT: pxor %xmm2, %xmm2
4362 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
4363 ; SSE42-NEXT: paddb (%rdx), %xmm1
4364 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0
4365 ; SSE42-NEXT: paddb %xmm2, %xmm0
4366 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
4367 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
4368 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
4369 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
4370 ; SSE42-NEXT: retq
4371 ;
4372 ; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
4373 ; AVX: # %bb.0:
4374 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4375 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
4376 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4377 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4378 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4379 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
4380 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
4381 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
4382 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
4383 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
4384 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
4385 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
4386 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
4387 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
4388 ; AVX-NEXT: retq
4389 ;
4390 ; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
4391 ; AVX2: # %bb.0:
4392 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
4393 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4394 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
4395 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4396 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
4397 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
4398 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
4399 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
4400 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
4401 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4402 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4403 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
4404 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
4405 ; AVX2-NEXT: vzeroupper
4406 ; AVX2-NEXT: retq
4407 ;
4408 ; AVX512F-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
4409 ; AVX512F: # %bb.0:
4410 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
4411 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4412 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
4413 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4414 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
4415 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
4416 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
4417 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
4418 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
4419 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4420 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4421 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
4422 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
4423 ; AVX512F-NEXT: vzeroupper
4424 ; AVX512F-NEXT: retq
4426 ; AVX512DQ-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
4427 ; AVX512DQ: # %bb.0:
4428 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
4429 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4430 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
4431 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4432 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
4433 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
4434 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
4435 ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
4436 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
4437 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4438 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4439 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
4440 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
4441 ; AVX512DQ-NEXT: vzeroupper
4442 ; AVX512DQ-NEXT: retq
4444 ; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
4445 ; AVX512BW: # %bb.0:
4446 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
4447 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47]
4448 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4449 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4450 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
4451 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4452 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
4453 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4454 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4455 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
4456 ; AVX512BW-NEXT: vzeroupper
4457 ; AVX512BW-NEXT: retq
4458 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4459 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4460 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4461 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
4462 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4463 %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8>
4464 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4465 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4466 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4467 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4468 ret void
4469 }
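; Broadcast of i16 element 0 at an i192 (12 x i16) stride into 2 positions of the 384-bit result; the remaining lanes come from the upper input words or are zero.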
4471 define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4472 ; SSE2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4473 ; SSE2: # %bb.0:
4474 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4475 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
4476 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
4477 ; SSE2-NEXT: paddb (%rsi), %xmm0
4478 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
4479 ; SSE2-NEXT: pand %xmm2, %xmm1
4480 ; SSE2-NEXT: pandn %xmm0, %xmm2
4481 ; SSE2-NEXT: por %xmm1, %xmm2
4482 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
4483 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4484 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
4485 ; SSE2-NEXT: movaps 32(%rdx), %xmm1
4486 ; SSE2-NEXT: paddb (%rdx), %xmm2
4487 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
4488 ; SSE2-NEXT: movaps %xmm1, 32(%rcx)
4489 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
4490 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
4491 ; SSE2-NEXT: retq
4492 ;
4493 ; SSE42-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4494 ; SSE42: # %bb.0:
4495 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4496 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
4497 ; SSE42-NEXT: paddb (%rsi), %xmm0
4498 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
4499 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4500 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4501 ; SSE42-NEXT: pxor %xmm2, %xmm2
4502 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7]
4503 ; SSE42-NEXT: movaps 32(%rdx), %xmm0
4504 ; SSE42-NEXT: paddb (%rdx), %xmm1
4505 ; SSE42-NEXT: paddb 16(%rdx), %xmm2
4506 ; SSE42-NEXT: movaps %xmm0, 32(%rcx)
4507 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
4508 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
4509 ; SSE42-NEXT: retq
4510 ;
4511 ; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4512 ; AVX: # %bb.0:
4513 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4514 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
4515 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4516 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4517 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4518 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4519 ; AVX-NEXT: vmovaps 32(%rdx), %ymm2
4520 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
4521 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
4522 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
4523 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx)
4524 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
4525 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
4526 ; AVX-NEXT: vzeroupper
4527 ; AVX-NEXT: retq
4528 ;
4529 ; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4530 ; AVX2: # %bb.0:
4531 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
4532 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
4533 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4534 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4535 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4536 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
4537 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
4538 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
4539 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4540 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
4541 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4542 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
4543 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
4544 ; AVX2-NEXT: vzeroupper
4545 ; AVX2-NEXT: retq
4546 ;
4547 ; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4548 ; AVX512F: # %bb.0:
4549 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
4550 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4551 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
4552 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4553 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4554 ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
4555 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
4556 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
4557 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4558 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4559 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
4560 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
4561 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
4562 ; AVX512F-NEXT: vzeroupper
4563 ; AVX512F-NEXT: retq
4565 ; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4566 ; AVX512DQ: # %bb.0:
4567 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
4568 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4569 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
4570 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4571 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4572 ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
4573 ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
4574 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
4575 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4576 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4577 ; AVX512DQ-NEXT: vmovaps 32(%rdx), %ymm1
4578 ; AVX512DQ-NEXT: vmovaps %ymm1, 32(%rcx)
4579 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
4580 ; AVX512DQ-NEXT: vzeroupper
4581 ; AVX512DQ-NEXT: retq
4583 ; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4584 ; AVX512BW: # %bb.0:
4585 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
4586 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,40,41,42,43,0,45,46,47]
4587 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4588 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4589 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm1, %zmm0
4590 ; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
4591 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4592 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
4593 ; AVX512BW-NEXT: vzeroupper
4594 ; AVX512BW-NEXT: retq
4595 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4596 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4597 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4598 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
4599 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4600 %out.bytevec = bitcast <24 x i16> %broadcast.of.zextinreg to <48 x i8>
4601 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4602 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4603 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4604 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4605 ret void
4606 }
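; Broadcast of i32 element 0 at an i64 (2 x i32) stride into 6 positions of the 384-bit result; the remaining lanes come from the upper input dwords or are zero.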
4608 define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4609 ; SSE2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4610 ; SSE2: # %bb.0:
4611 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4612 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
4613 ; SSE2-NEXT: paddb (%rsi), %xmm0
4614 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
4615 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
4616 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
4617 ; SSE2-NEXT: movdqa %xmm0, %xmm2
4618 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4619 ; SSE2-NEXT: pxor %xmm1, %xmm1
4620 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4621 ; SSE2-NEXT: paddb (%rdx), %xmm2
4622 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
4623 ; SSE2-NEXT: paddb %xmm0, %xmm1
4624 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
4625 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
4626 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
4627 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
4628 ; SSE2-NEXT: retq
4629 ;
4630 ; SSE42-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4631 ; SSE42: # %bb.0:
4632 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4633 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
4634 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
4635 ; SSE42-NEXT: paddb (%rsi), %xmm0
4636 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4637 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
4638 ; SSE42-NEXT: pxor %xmm2, %xmm2
4639 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
4640 ; SSE42-NEXT: paddb (%rdx), %xmm1
4641 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0
4642 ; SSE42-NEXT: paddb %xmm2, %xmm0
4643 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
4644 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
4645 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
4646 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
4647 ; SSE42-NEXT: retq
4648 ;
4649 ; AVX-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4650 ; AVX: # %bb.0:
4651 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4652 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
4653 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4654 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4655 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2
4656 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7]
4657 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,1,3]
4658 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
4659 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
4660 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4661 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
4662 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
4663 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
4664 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
4665 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
4666 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
4667 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
4668 ; AVX-NEXT: vmovdqa %xmm2, (%rcx)
4669 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
4670 ; AVX-NEXT: vzeroupper
4671 ; AVX-NEXT: retq
4672 ;
4673 ; AVX2-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4674 ; AVX2-SLOW: # %bb.0:
4675 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
4676 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
4677 ; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4678 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4679 ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2
4680 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7]
4681 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4682 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
4683 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
4684 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4685 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
4686 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4687 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4688 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
4689 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
4690 ; AVX2-SLOW-NEXT: vzeroupper
4691 ; AVX2-SLOW-NEXT: retq
4693 ; AVX2-FAST-PERLANE-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4694 ; AVX2-FAST-PERLANE: # %bb.0:
4695 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
4696 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1
4697 ; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4698 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4699 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm2
4700 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7]
4701 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2
4702 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
4703 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
4704 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4705 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4706 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
4707 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
4708 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
4709 ; AVX2-FAST-PERLANE-NEXT: retq
4711 ; AVX2-FAST-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4712 ; AVX2-FAST: # %bb.0:
4713 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
4714 ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
4715 ; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4716 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4717 ; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm2
4718 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7]
4719 ; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
4720 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
4721 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
4722 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4723 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4724 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
4725 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
4726 ; AVX2-FAST-NEXT: vzeroupper
4727 ; AVX2-FAST-NEXT: retq
4729 ; AVX512F-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4730 ; AVX512F: # %bb.0:
4731 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
4732 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
4733 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4734 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4735 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4736 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
4737 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u>
4738 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
4739 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
4740 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4741 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
4742 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
4743 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
4744 ; AVX512F-NEXT: vzeroupper
4745 ; AVX512F-NEXT: retq
4747 ; AVX512DQ-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4748 ; AVX512DQ: # %bb.0:
4749 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
4750 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
4751 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4752 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4753 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4754 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
4755 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u>
4756 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
4757 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
4758 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4759 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
4760 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
4761 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
4762 ; AVX512DQ-NEXT: vzeroupper
4763 ; AVX512DQ-NEXT: retq
4765 ; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4766 ; AVX512BW-SLOW: # %bb.0:
4767 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
4768 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4769 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15]
4770 ; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
4771 ; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
4772 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4773 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
4774 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4775 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
4776 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4777 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4778 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
4779 ; AVX512BW-SLOW-NEXT: vzeroupper
4780 ; AVX512BW-SLOW-NEXT: retq
4782 ; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4783 ; AVX512BW-FAST: # %bb.0:
4784 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
4785 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4786 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15]
4787 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
4788 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
4789 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
4790 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
4791 ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4792 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4793 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4794 ; AVX512BW-FAST-NEXT: vzeroupper
4795 ; AVX512BW-FAST-NEXT: retq
4796 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4797 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4798 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4799 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
4800 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 0, i32 15, i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23>
4801 %out.bytevec = bitcast <12 x i32> %broadcast.of.zextinreg to <48 x i8>
4802 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4803 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4804 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4805 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4806 ret void
4807 }
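; Broadcast of i32 element 0 at an i96 (3 x i32) stride into 4 positions of the 384-bit result; the remaining lanes come from the upper input dwords or are zero.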
4809 define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4810 ; SSE2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4811 ; SSE2: # %bb.0:
4812 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4813 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
4814 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
4815 ; SSE2-NEXT: paddb (%rsi), %xmm0
4816 ; SSE2-NEXT: xorps %xmm2, %xmm2
4817 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
4818 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,2]
4819 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
4820 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,1,1]
4821 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
4822 ; SSE2-NEXT: paddb (%rdx), %xmm0
4823 ; SSE2-NEXT: paddb 16(%rdx), %xmm2
4824 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
4825 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
4826 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
4827 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
4828 ; SSE2-NEXT: retq
4829 ;
4830 ; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4831 ; SSE42: # %bb.0:
4832 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4833 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
4834 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
4835 ; SSE42-NEXT: paddb (%rsi), %xmm0
4836 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
4837 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
4838 ; SSE42-NEXT: pxor %xmm1, %xmm1
4839 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
4840 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
4841 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1]
4842 ; SSE42-NEXT: paddb (%rdx), %xmm2
4843 ; SSE42-NEXT: paddb 16(%rdx), %xmm1
4844 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
4845 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
4846 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
4847 ; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
4848 ; SSE42-NEXT: retq
4849 ;
4850 ; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4851 ; AVX: # %bb.0:
4852 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4853 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
4854 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4855 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4856 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
4857 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4858 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
4859 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
4860 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,1,1]
4861 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4862 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4863 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
4864 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6],ymm3[7]
4865 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
4866 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
4867 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
4868 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
4869 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
4870 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
4871 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
4872 ; AVX-NEXT: vzeroupper
4873 ; AVX-NEXT: retq
4874 ;
4875 ; AVX2-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4876 ; AVX2-SLOW: # %bb.0:
4877 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
4878 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
4879 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4880 ; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4881 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4882 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0]
4883 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm1
4884 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4885 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
4886 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4887 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
4888 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
4889 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4890 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4891 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
4892 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
4893 ; AVX2-SLOW-NEXT: vzeroupper
4894 ; AVX2-SLOW-NEXT: retq
4896 ; AVX2-FAST-PERLANE-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4897 ; AVX2-FAST-PERLANE: # %bb.0:
4898 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
4899 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
4900 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4901 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4902 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4903 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0]
4904 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm1
4905 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2
4906 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
4907 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
4908 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4909 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4910 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
4911 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
4912 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
4913 ; AVX2-FAST-PERLANE-NEXT: retq
4915 ; AVX2-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4916 ; AVX2-FAST: # %bb.0:
4917 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
4918 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
4919 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4920 ; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4921 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4922 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0]
4923 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
4924 ; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
4925 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
4926 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
4927 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4928 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4929 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
4930 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
4931 ; AVX2-FAST-NEXT: vzeroupper
4932 ; AVX2-FAST-NEXT: retq
4934 ; AVX512F-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4935 ; AVX512F: # %bb.0:
4936 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
4937 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
4938 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4939 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4940 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4941 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
4942 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u>
4943 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
4944 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
4945 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4946 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
4947 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
4948 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
4949 ; AVX512F-NEXT: vzeroupper
4950 ; AVX512F-NEXT: retq
4952 ; AVX512DQ-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4953 ; AVX512DQ: # %bb.0:
4954 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
4955 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
4956 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4957 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4958 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4959 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
4960 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u>
4961 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
4962 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
4963 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4964 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
4965 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
4966 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
4967 ; AVX512DQ-NEXT: vzeroupper
4968 ; AVX512DQ-NEXT: retq
4970 ; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4971 ; AVX512BW-SLOW: # %bb.0:
4972 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
4973 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4974 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0]
4975 ; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
4976 ; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
4977 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4978 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
4979 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4980 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
4981 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4982 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4983 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
4984 ; AVX512BW-SLOW-NEXT: vzeroupper
4985 ; AVX512BW-SLOW-NEXT: retq
4987 ; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4988 ; AVX512BW-FAST: # %bb.0:
4989 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
4990 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4991 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0]
4992 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
4993 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
4994 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
4995 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
4996 ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4997 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4998 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4999 ; AVX512BW-FAST-NEXT: vzeroupper
5000 ; AVX512BW-FAST-NEXT: retq
5001 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5002 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5003 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5004 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
5005 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 0, i32 16, i32 17, i32 0, i32 19, i32 20, i32 0, i32 22, i32 23>
5006 %out.bytevec = bitcast <12 x i32> %broadcast.of.zextinreg to <48 x i8>
5007 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5008 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5009 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
5010 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5011 ret void
5012 }
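; Broadcast of i32 element 0 at an i128 (4 x i32) stride into 3 positions of the 384-bit result; the remaining lanes come from the upper input dwords or are zero.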
5014 define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5015 ; SSE2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
5016 ; SSE2: # %bb.0:
5017 ; SSE2-NEXT: movdqa (%rdi), %xmm0
5018 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
5019 ; SSE2-NEXT: paddb (%rsi), %xmm0
5020 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
5021 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
5022 ; SSE2-NEXT: xorps %xmm2, %xmm2
5023 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
5024 ; SSE2-NEXT: paddb (%rdx), %xmm1
5025 ; SSE2-NEXT: movdqa 16(%rdx), %xmm0
5026 ; SSE2-NEXT: paddb %xmm2, %xmm0
5027 ; SSE2-NEXT: paddb 32(%rdx), %xmm2
5028 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
5029 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
5030 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
5031 ; SSE2-NEXT: retq
5032 ;
5033 ; SSE42-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
5034 ; SSE42: # %bb.0:
5035 ; SSE42-NEXT: movdqa (%rdi), %xmm0
5036 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
5037 ; SSE42-NEXT: paddb (%rsi), %xmm0
5038 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
5039 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
5040 ; SSE42-NEXT: pxor %xmm2, %xmm2
5041 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
5042 ; SSE42-NEXT: paddb (%rdx), %xmm1
5043 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0
5044 ; SSE42-NEXT: paddb %xmm2, %xmm0
5045 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
5046 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
5047 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
5048 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
5049 ; SSE42-NEXT: retq
5050 ;
5051 ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
5052 ; AVX: # %bb.0:
5053 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5054 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
5055 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5056 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
5057 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
5058 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
5059 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
5060 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5061 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
5062 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
5063 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
5064 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
5065 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
5066 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
5067 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
5068 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
5069 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
5070 ; AVX-NEXT: vzeroupper
5071 ; AVX-NEXT: retq
5072 ;
5073 ; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
5074 ; AVX2: # %bb.0:
5075 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
5076 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5077 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
5078 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
5079 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
5080 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
5081 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
5082 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
5083 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
5084 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
5085 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
5086 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
5087 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
5088 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
5089 ; AVX2-NEXT: vzeroupper
5090 ; AVX2-NEXT: retq
5091 ;
5092 ; AVX512F-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
5093 ; AVX512F: # %bb.0:
5094 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5095 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
5096 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
5097 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5098 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
5099 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
5100 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u>
5101 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
5102 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
5103 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
5104 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
5105 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
5106 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
5107 ; AVX512F-NEXT: vzeroupper
5108 ; AVX512F-NEXT: retq
5110 ; AVX512DQ-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
5111 ; AVX512DQ: # %bb.0:
5112 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
5113 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
5114 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
5115 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5116 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
5117 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
5118 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u>
5119 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
5120 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
5121 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
5122 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
5123 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
5124 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
5125 ; AVX512DQ-NEXT: vzeroupper
5126 ; AVX512DQ-NEXT: retq
5128 ; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
5129 ; AVX512BW: # %bb.0:
5130 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
5131 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
5132 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5133 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
5134 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
5135 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
5136 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
5137 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
5138 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
5139 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5140 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5141 ; AVX512BW-NEXT: vzeroupper
5142 ; AVX512BW-NEXT: retq
5143 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5144 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5145 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5146 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
5147 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23>
5148 %out.bytevec = bitcast <12 x i32> %broadcast.of.zextinreg to <48 x i8>
5149 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5150 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5151 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

5156 define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE2: # %bb.0:
5159 ; SSE2-NEXT: movdqa (%rdi), %xmm0
5160 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
5161 ; SSE2-NEXT: paddb (%rsi), %xmm0
5162 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
5163 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
5164 ; SSE2-NEXT: xorps %xmm2, %xmm2
5165 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
5166 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1]
5167 ; SSE2-NEXT: movaps 32(%rdx), %xmm2
5168 ; SSE2-NEXT: paddb (%rdx), %xmm1
5169 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
5170 ; SSE2-NEXT: movaps %xmm2, 32(%rcx)
5171 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
5177 ; SSE42-NEXT: movdqa (%rdi), %xmm0
5178 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
5179 ; SSE42-NEXT: paddb (%rsi), %xmm0
5180 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
5181 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
5182 ; SSE42-NEXT: pxor %xmm2, %xmm2
5183 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
5184 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1]
5185 ; SSE42-NEXT: movaps 32(%rdx), %xmm2
5186 ; SSE42-NEXT: paddb (%rdx), %xmm1
5187 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
5188 ; SSE42-NEXT: movaps %xmm2, 32(%rcx)
5189 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX: # %bb.0:
5195 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5196 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
5197 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5198 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
5199 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
5200 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
5201 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5202 ; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
5203 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7]
5204 ; AVX-NEXT: vmovaps 32(%rdx), %ymm2
5205 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
5206 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
5207 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
5208 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx)
5209 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
5210 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX2: # %bb.0:
5216 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
5217 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
5218 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5219 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
5220 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5221 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,6,7]
5222 ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
5223 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
5224 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7]
5225 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
5226 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5227 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
5228 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512F-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX512F: # %bb.0:
5234 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5235 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
5236 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
5237 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5238 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
5239 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7]
5240 ; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
5241 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
5242 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
5243 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
5244 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
5245 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
5246 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5247 ; AVX512F-NEXT: vzeroupper
5248 ; AVX512F-NEXT: retq
5250 ; AVX512DQ-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
5251 ; AVX512DQ: # %bb.0:
5252 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
5253 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
5254 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
5255 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5256 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
5257 ; AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7]
5258 ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
5259 ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
5260 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
5261 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
5262 ; AVX512DQ-NEXT: vmovaps 32(%rdx), %ymm1
5263 ; AVX512DQ-NEXT: vmovaps %ymm1, 32(%rcx)
5264 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
5265 ; AVX512DQ-NEXT: vzeroupper
5266 ; AVX512DQ-NEXT: retq
5268 ; AVX512BW-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
5269 ; AVX512BW: # %bb.0:
5270 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
5271 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5272 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,15]
5273 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
5274 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
5275 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7]
5276 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5277 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5278 ; AVX512BW-NEXT: vzeroupper
5279 ; AVX512BW-NEXT: retq
5280 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5281 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5282 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5283 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
5284 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 16, i32 17, i32 0, i32 19, i32 20, i32 21, i32 22, i32 23>
5285 %out.bytevec = bitcast <12 x i32> %broadcast.of.zextinreg to <48 x i8>
5286 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5287 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5288 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

5293 define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; SSE2: # %bb.0:
5296 ; SSE2-NEXT: movdqa (%rdi), %xmm0
5297 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
5298 ; SSE2-NEXT: paddb (%rsi), %xmm0
5299 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
5300 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5301 ; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
5302 ; SSE2-NEXT: paddb (%rdx), %xmm1
5303 ; SSE2-NEXT: movdqa 16(%rdx), %xmm2
5304 ; SSE2-NEXT: paddb %xmm0, %xmm2
5305 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
5306 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
5307 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
; SSE2-NEXT: retq
; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; SSE42: # %bb.0:
5313 ; SSE42-NEXT: movdqa (%rdi), %xmm0
5314 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
5315 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
5316 ; SSE42-NEXT: paddb (%rsi), %xmm0
5317 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
5318 ; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
5319 ; SSE42-NEXT: paddb (%rdx), %xmm1
5320 ; SSE42-NEXT: movdqa 16(%rdx), %xmm2
5321 ; SSE42-NEXT: paddb %xmm0, %xmm2
5322 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
5323 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
5324 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
; SSE42-NEXT: retq
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
5330 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5331 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
5332 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
5333 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5334 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
5335 ; AVX-NEXT: vmovq {{.*#+}} xmm2 = xmm0[0],zero
5336 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5337 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
5338 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
5339 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
5340 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
5341 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
5342 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
5343 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
5344 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
5345 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX2: # %bb.0:
5351 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
5352 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
5353 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
5354 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5355 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
5356 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,0,3]
5357 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
5358 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
5359 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5360 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
5361 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
5362 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
5363 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512F-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX512F: # %bb.0:
5369 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5370 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
5371 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
5372 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5373 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
5374 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
5375 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,0,11,0,13,u,u>
5376 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
5377 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
5378 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
5379 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
5380 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
5381 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
5382 ; AVX512F-NEXT: vzeroupper
5383 ; AVX512F-NEXT: retq
5385 ; AVX512DQ-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
5386 ; AVX512DQ: # %bb.0:
5387 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
5388 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
5389 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
5390 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5391 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
5392 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
5393 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,0,11,0,13,u,u>
5394 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
5395 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
5396 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
5397 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
5398 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
5399 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
5400 ; AVX512DQ-NEXT: vzeroupper
5401 ; AVX512DQ-NEXT: retq
5403 ; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
5404 ; AVX512BW-SLOW: # %bb.0:
5405 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
5406 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,0,11]
5407 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5408 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
5409 ; AVX512BW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
5410 ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5411 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
5412 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5413 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
5414 ; AVX512BW-SLOW-NEXT: vzeroupper
5415 ; AVX512BW-SLOW-NEXT: retq
5417 ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
5418 ; AVX512BW-FAST: # %bb.0:
5419 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
5420 ; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,7,0,7]
5421 ; AVX512BW-FAST-NEXT: # ymm1 = mem[0,1,0,1]
5422 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5423 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm1
5424 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
5425 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
5426 ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5427 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
5428 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5429 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
5430 ; AVX512BW-FAST-NEXT: vzeroupper
5431 ; AVX512BW-FAST-NEXT: retq
5432 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5433 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5434 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5435 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
5436 %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <6 x i32> <i32 0, i32 7, i32 0, i32 9, i32 0, i32 11>
5437 %out.bytevec = bitcast <6 x i64> %broadcast.of.zextinreg to <48 x i8>
5438 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5439 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5440 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

5445 define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; SSE2: # %bb.0:
5448 ; SSE2-NEXT: movdqa (%rdi), %xmm0
5449 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
5450 ; SSE2-NEXT: paddb (%rsi), %xmm0
5451 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
5452 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5453 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
5454 ; SSE2-NEXT: movaps 32(%rdx), %xmm2
5455 ; SSE2-NEXT: paddb (%rdx), %xmm1
5456 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
5457 ; SSE2-NEXT: movaps %xmm2, 32(%rcx)
5458 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: movdqa %xmm1, (%rcx)
; SSE2-NEXT: retq
; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; SSE42: # %bb.0:
5464 ; SSE42-NEXT: movdqa (%rdi), %xmm0
5465 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
5466 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
5467 ; SSE42-NEXT: paddb (%rsi), %xmm0
5468 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
5469 ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
5470 ; SSE42-NEXT: movaps 32(%rdx), %xmm2
5471 ; SSE42-NEXT: paddb (%rdx), %xmm1
5472 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
5473 ; SSE42-NEXT: movaps %xmm2, 32(%rcx)
5474 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: retq
; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX: # %bb.0:
5480 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5481 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
5482 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
5483 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5484 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
5485 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5486 ; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
5487 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
5488 ; AVX-NEXT: vmovaps 32(%rdx), %ymm2
5489 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
5490 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
5491 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
5492 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx)
5493 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
5494 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX2: # %bb.0:
5500 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
5501 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
5502 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
5503 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5504 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
5505 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,0]
5506 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
5507 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
5508 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
5509 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5510 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
5511 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512F-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
; AVX512F: # %bb.0:
5517 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5518 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
5519 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
5520 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5521 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
5522 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
5523 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0]
5524 ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
5525 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
5526 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
5527 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
5528 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
5529 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5530 ; AVX512F-NEXT: vzeroupper
5531 ; AVX512F-NEXT: retq
5533 ; AVX512DQ-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
5534 ; AVX512DQ: # %bb.0:
5535 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
5536 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
5537 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
5538 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5539 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
5540 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
5541 ; AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0]
5542 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
5543 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
5544 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
5545 ; AVX512DQ-NEXT: vmovaps 32(%rdx), %ymm1
5546 ; AVX512DQ-NEXT: vmovaps %ymm1, 32(%rcx)
5547 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
5548 ; AVX512DQ-NEXT: vzeroupper
5549 ; AVX512DQ-NEXT: retq
5551 ; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
5552 ; AVX512BW-SLOW: # %bb.0:
5553 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
5554 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,10,0]
5555 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5556 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
5557 ; AVX512BW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm0
5558 ; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, %ymm0
5559 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5560 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
5561 ; AVX512BW-SLOW-NEXT: vzeroupper
5562 ; AVX512BW-SLOW-NEXT: retq
5564 ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
5565 ; AVX512BW-FAST: # %bb.0:
5566 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
5567 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,2,0]
5568 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5569 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0
5570 ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
5571 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
5572 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5573 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
5574 ; AVX512BW-FAST-NEXT: vzeroupper
5575 ; AVX512BW-FAST-NEXT: retq
5576 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5577 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5578 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5579 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
5580 %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <6 x i32> <i32 0, i32 7, i32 8, i32 0, i32 10, i32 11>
5581 %out.bytevec = bitcast <6 x i64> %broadcast.of.zextinreg to <48 x i8>
5582 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5583 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5584 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

5589 define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; SSE2: # %bb.0:
5592 ; SSE2-NEXT: movdqa (%rdi), %xmm0
5593 ; SSE2-NEXT: paddb (%rsi), %xmm0
5594 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5595 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5596 ; SSE2-NEXT: pxor %xmm1, %xmm1
5597 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5598 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
5599 ; SSE2-NEXT: paddb %xmm0, %xmm1
5600 ; SSE2-NEXT: movdqa (%rdx), %xmm2
5601 ; SSE2-NEXT: paddb %xmm0, %xmm2
5602 ; SSE2-NEXT: movdqa 48(%rdx), %xmm3
5603 ; SSE2-NEXT: paddb %xmm0, %xmm3
5604 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
5605 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
5606 ; SSE2-NEXT: movdqa %xmm3, 48(%rcx)
5607 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
; SSE2-NEXT: retq
; SSE42-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; SSE42: # %bb.0:
5613 ; SSE42-NEXT: movdqa (%rdi), %xmm0
5614 ; SSE42-NEXT: paddb (%rsi), %xmm0
5615 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
5616 ; SSE42-NEXT: movdqa 16(%rdx), %xmm1
5617 ; SSE42-NEXT: paddb %xmm0, %xmm1
5618 ; SSE42-NEXT: movdqa (%rdx), %xmm2
5619 ; SSE42-NEXT: paddb %xmm0, %xmm2
5620 ; SSE42-NEXT: movdqa 48(%rdx), %xmm3
5621 ; SSE42-NEXT: paddb %xmm0, %xmm3
5622 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
5623 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
5624 ; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
5625 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
; SSE42-NEXT: retq
; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; AVX: # %bb.0:
5631 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5632 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5633 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
5634 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1
5635 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
5636 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3
5637 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
5638 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
5639 ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
5640 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: retq
; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; AVX2: # %bb.0:
5646 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
5647 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5648 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
5649 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero
5650 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5651 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5652 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
5653 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512F-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; AVX512F: # %bb.0:
5659 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5660 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5661 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
5662 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero
5663 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5664 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5665 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5666 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
5667 ; AVX512F-NEXT: vzeroupper
5668 ; AVX512F-NEXT: retq
5670 ; AVX512DQ-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
5671 ; AVX512DQ: # %bb.0:
5672 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
5673 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5674 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
5675 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero
5676 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5677 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5678 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
5679 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
5680 ; AVX512DQ-NEXT: vzeroupper
5681 ; AVX512DQ-NEXT: retq
5683 ; AVX512BW-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
5684 ; AVX512BW: # %bb.0:
5685 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
5686 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5687 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
5688 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
5689 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero
5690 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5691 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5692 ; AVX512BW-NEXT: vzeroupper
5693 ; AVX512BW-NEXT: retq
5694 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5695 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5696 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5697 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 0, i32 67, i32 0, i32 69, i32 0, i32 71, i32 0, i32 73, i32 0, i32 75, i32 0, i32 77, i32 0, i32 79, i32 0, i32 81, i32 0, i32 83, i32 0, i32 85, i32 0, i32 87, i32 0, i32 89, i32 0, i32 91, i32 0, i32 93, i32 0, i32 95, i32 0, i32 97, i32 0, i32 99, i32 0, i32 101, i32 0, i32 103, i32 0, i32 105, i32 0, i32 107, i32 0, i32 109, i32 0, i32 111, i32 0, i32 113, i32 0, i32 115, i32 0, i32 117, i32 0, i32 119, i32 0, i32 121, i32 0, i32 123, i32 0, i32 125, i32 0, i32 127>
5698 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5699 %out.vec = add <64 x i8> %broadcast.of.zextinreg, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

5704 define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; SSE2: # %bb.0:
5707 ; SSE2-NEXT: movdqa (%rdi), %xmm0
5708 ; SSE2-NEXT: paddb (%rsi), %xmm0
5709 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5710 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
5711 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
5712 ; SSE2-NEXT: paddb %xmm0, %xmm1
5713 ; SSE2-NEXT: movdqa (%rdx), %xmm2
5714 ; SSE2-NEXT: paddb %xmm0, %xmm2
5715 ; SSE2-NEXT: movdqa 48(%rdx), %xmm3
5716 ; SSE2-NEXT: paddb %xmm0, %xmm3
5717 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
5718 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
5719 ; SSE2-NEXT: movdqa %xmm3, 48(%rcx)
5720 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
; SSE2-NEXT: retq
; SSE42-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; SSE42: # %bb.0:
5726 ; SSE42-NEXT: movdqa (%rdi), %xmm0
5727 ; SSE42-NEXT: paddb (%rsi), %xmm0
5728 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
5729 ; SSE42-NEXT: movdqa 16(%rdx), %xmm1
5730 ; SSE42-NEXT: paddb %xmm0, %xmm1
5731 ; SSE42-NEXT: movdqa (%rdx), %xmm2
5732 ; SSE42-NEXT: paddb %xmm0, %xmm2
5733 ; SSE42-NEXT: movdqa 48(%rdx), %xmm3
5734 ; SSE42-NEXT: paddb %xmm0, %xmm3
5735 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
5736 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
5737 ; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
5738 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
; SSE42-NEXT: retq
; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; AVX: # %bb.0:
5744 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5745 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5746 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
5747 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1
5748 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
5749 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3
5750 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
5751 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
5752 ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
5753 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: retq
; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; AVX2: # %bb.0:
5759 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
5760 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5761 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
5762 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero
5763 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5764 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5765 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
5766 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512F-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; AVX512F: # %bb.0:
5772 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5773 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5774 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
5775 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero
5776 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5777 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5778 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5779 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
5780 ; AVX512F-NEXT: vzeroupper
5781 ; AVX512F-NEXT: retq
5783 ; AVX512DQ-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
5784 ; AVX512DQ: # %bb.0:
5785 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
5786 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5787 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
5788 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero
5789 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5790 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5791 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
5792 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
5793 ; AVX512DQ-NEXT: vzeroupper
5794 ; AVX512DQ-NEXT: retq
5796 ; AVX512BW-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
5797 ; AVX512BW: # %bb.0:
5798 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
5799 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5800 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
5801 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
5802 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero
5803 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5804 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5805 ; AVX512BW-NEXT: vzeroupper
5806 ; AVX512BW-NEXT: retq
5807 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5808 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5809 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5810 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 0, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 0, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 0, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 0, i32 101, i32 102, i32 103, i32 0, i32 105, i32 106, i32 107, i32 0, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 0, i32 117, i32 118, i32 119, i32 0, i32 121, i32 122, i32 123, i32 0, i32 125, i32 126, i32 127>
5811 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5812 %out.vec = add <64 x i8> %broadcast.of.zextinreg, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

5817 define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; SSE2: # %bb.0:
5820 ; SSE2-NEXT: movdqa (%rdi), %xmm0
5821 ; SSE2-NEXT: paddb (%rsi), %xmm0
5822 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
5823 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
5824 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
5825 ; SSE2-NEXT: paddb %xmm0, %xmm1
5826 ; SSE2-NEXT: movdqa (%rdx), %xmm2
5827 ; SSE2-NEXT: paddb %xmm0, %xmm2
5828 ; SSE2-NEXT: movdqa 48(%rdx), %xmm3
5829 ; SSE2-NEXT: paddb %xmm0, %xmm3
5830 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
5831 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
5832 ; SSE2-NEXT: movdqa %xmm3, 48(%rcx)
5833 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
; SSE2-NEXT: retq
; SSE42-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; SSE42: # %bb.0:
5839 ; SSE42-NEXT: movdqa (%rdi), %xmm0
5840 ; SSE42-NEXT: paddb (%rsi), %xmm0
5841 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
5842 ; SSE42-NEXT: movdqa 16(%rdx), %xmm1
5843 ; SSE42-NEXT: paddb %xmm0, %xmm1
5844 ; SSE42-NEXT: movdqa (%rdx), %xmm2
5845 ; SSE42-NEXT: paddb %xmm0, %xmm2
5846 ; SSE42-NEXT: movdqa 48(%rdx), %xmm3
5847 ; SSE42-NEXT: paddb %xmm0, %xmm3
5848 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
5849 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
5850 ; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
5851 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
; SSE42-NEXT: retq
; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; AVX: # %bb.0:
5857 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5858 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5859 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero
5860 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1
5861 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
5862 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3
5863 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
5864 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
5865 ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
5866 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: retq
; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; AVX2: # %bb.0:
5872 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
5873 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5874 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
5875 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero
5876 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5877 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5878 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
5879 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512F-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; AVX512F: # %bb.0:
5885 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5886 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5887 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
5888 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero
5889 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5890 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5891 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5892 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
5893 ; AVX512F-NEXT: vzeroupper
5894 ; AVX512F-NEXT: retq
5896 ; AVX512DQ-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
5897 ; AVX512DQ: # %bb.0:
5898 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
5899 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5900 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
5901 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero
5902 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5903 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5904 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
5905 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
5906 ; AVX512DQ-NEXT: vzeroupper
5907 ; AVX512DQ-NEXT: retq
5909 ; AVX512BW-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
5910 ; AVX512BW: # %bb.0:
5911 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
5912 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5913 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
5914 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
5915 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero
5916 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5917 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5918 ; AVX512BW-NEXT: vzeroupper
5919 ; AVX512BW-NEXT: retq
5920 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5921 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5922 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5923 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 0, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 0, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
5924 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5925 %out.vec = add <64 x i8> %broadcast.of.zextinreg, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

5930 define void @vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
; SSE: # %bb.0:
5933 ; SSE-NEXT: movdqa (%rdi), %xmm0
5934 ; SSE-NEXT: paddb (%rsi), %xmm0
5935 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
5936 ; SSE-NEXT: movdqa 16(%rdx), %xmm1
5937 ; SSE-NEXT: paddb %xmm0, %xmm1
5938 ; SSE-NEXT: movdqa (%rdx), %xmm2
5939 ; SSE-NEXT: paddb %xmm0, %xmm2
5940 ; SSE-NEXT: movdqa 48(%rdx), %xmm3
5941 ; SSE-NEXT: paddb %xmm0, %xmm3
5942 ; SSE-NEXT: paddb 32(%rdx), %xmm0
5943 ; SSE-NEXT: movdqa %xmm0, 32(%rcx)
5944 ; SSE-NEXT: movdqa %xmm3, 48(%rcx)
5945 ; SSE-NEXT: movdqa %xmm2, (%rcx)
; SSE-NEXT: movdqa %xmm1, 16(%rcx)
; SSE-NEXT: retq
; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
; AVX: # %bb.0:
5951 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5952 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5953 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
5954 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1
5955 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
5956 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3
5957 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
5958 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
5959 ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
5960 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: retq
; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
; AVX2: # %bb.0:
5966 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
5967 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5968 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
5969 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
5970 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5971 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5972 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
5973 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512F-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
; AVX512F: # %bb.0:
5979 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5980 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5981 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
5982 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
5983 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5984 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5985 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5986 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
5987 ; AVX512F-NEXT: vzeroupper
5988 ; AVX512F-NEXT: retq
5990 ; AVX512DQ-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
5991 ; AVX512DQ: # %bb.0:
5992 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
5993 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5994 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
5995 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
5996 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5997 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5998 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
5999 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
6000 ; AVX512DQ-NEXT: vzeroupper
6001 ; AVX512DQ-NEXT: retq
6003 ; AVX512BW-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
6004 ; AVX512BW: # %bb.0:
6005 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
6006 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6007 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
6008 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
6009 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
6010 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6011 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6012 ; AVX512BW-NEXT: vzeroupper
6013 ; AVX512BW-NEXT: retq
6014 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6015 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6016 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6017 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
6018 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6019 %out.vec = add <64 x i8> %broadcast.of.zextinreg, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

6024 define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; SSE: # %bb.0:
6027 ; SSE-NEXT: movdqa (%rdi), %xmm0
6028 ; SSE-NEXT: paddb (%rsi), %xmm0
6029 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6030 ; SSE-NEXT: movaps 16(%rdx), %xmm1
6031 ; SSE-NEXT: movaps 48(%rdx), %xmm2
6032 ; SSE-NEXT: movdqa (%rdx), %xmm3
6033 ; SSE-NEXT: paddb %xmm0, %xmm3
6034 ; SSE-NEXT: paddb 32(%rdx), %xmm0
6035 ; SSE-NEXT: movaps %xmm2, 48(%rcx)
6036 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
6037 ; SSE-NEXT: movdqa %xmm0, 32(%rcx)
; SSE-NEXT: movdqa %xmm3, (%rcx)
; SSE-NEXT: retq
; AVX-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX: # %bb.0:
6043 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6044 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6045 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
6046 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1
6047 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
6048 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
6049 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3
6050 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
6051 ; AVX-NEXT: vmovaps %xmm3, 48(%rcx)
6052 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT: retq
; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
6058 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
6059 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6060 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
6061 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
6062 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6063 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6064 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
6065 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512F-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
; AVX512F: # %bb.0:
6071 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
6072 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6073 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
6074 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
6075 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6076 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6077 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
6078 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
6079 ; AVX512F-NEXT: vzeroupper
6080 ; AVX512F-NEXT: retq
6082 ; AVX512DQ-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
6083 ; AVX512DQ: # %bb.0:
6084 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
6085 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6086 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
6087 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
6088 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6089 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6090 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
6091 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
6092 ; AVX512DQ-NEXT: vzeroupper
6093 ; AVX512DQ-NEXT: retq
6095 ; AVX512BW-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
6096 ; AVX512BW: # %bb.0:
6097 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
6098 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6099 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
6100 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
6101 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6102 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6103 ; AVX512BW-NEXT: vzeroupper
6104 ; AVX512BW-NEXT: retq
6105 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6106 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6107 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6108 %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
6109 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6110 %out.vec = add <64 x i8> %broadcast.of.zextinreg, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

6115 define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; SSE2: # %bb.0:
6118 ; SSE2-NEXT: movdqa (%rdi), %xmm0
6119 ; SSE2-NEXT: paddb (%rsi), %xmm0
6120 ; SSE2-NEXT: pxor %xmm1, %xmm1
6121 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
6122 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
6123 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
6124 ; SSE2-NEXT: paddb %xmm0, %xmm1
6125 ; SSE2-NEXT: movdqa (%rdx), %xmm2
6126 ; SSE2-NEXT: paddb %xmm0, %xmm2
6127 ; SSE2-NEXT: movdqa 48(%rdx), %xmm3
6128 ; SSE2-NEXT: paddb %xmm0, %xmm3
6129 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
6130 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
6131 ; SSE2-NEXT: movdqa %xmm3, 48(%rcx)
6132 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
; SSE2-NEXT: retq
; SSE42-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; SSE42: # %bb.0:
6138 ; SSE42-NEXT: movdqa (%rdi), %xmm0
6139 ; SSE42-NEXT: paddb (%rsi), %xmm0
6140 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
6141 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
6142 ; SSE42-NEXT: movdqa 16(%rdx), %xmm1
6143 ; SSE42-NEXT: paddb %xmm0, %xmm1
6144 ; SSE42-NEXT: movdqa (%rdx), %xmm2
6145 ; SSE42-NEXT: paddb %xmm0, %xmm2
6146 ; SSE42-NEXT: movdqa 48(%rdx), %xmm3
6147 ; SSE42-NEXT: paddb %xmm0, %xmm3
6148 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
6149 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
6150 ; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
6151 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
; SSE42-NEXT: retq
; AVX-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; AVX: # %bb.0:
6157 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6158 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6159 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
6160 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
6161 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1
6162 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
6163 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3
6164 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
6165 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
6166 ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
6167 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: retq
; AVX2-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; AVX2: # %bb.0:
6173 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
6174 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6175 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
6176 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero
6177 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6178 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6179 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
6180 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512F-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; AVX512F: # %bb.0:
6186 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
6187 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6188 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
6189 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero
6190 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6191 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6192 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
6193 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
6194 ; AVX512F-NEXT: vzeroupper
6195 ; AVX512F-NEXT: retq
6197 ; AVX512DQ-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
6198 ; AVX512DQ: # %bb.0:
6199 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
6200 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6201 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
6202 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero
6203 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6204 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6205 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
6206 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
6207 ; AVX512DQ-NEXT: vzeroupper
6208 ; AVX512DQ-NEXT: retq
6210 ; AVX512BW-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
6211 ; AVX512BW: # %bb.0:
6212 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
6213 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6214 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
6215 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
6216 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero
6217 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6218 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6219 ; AVX512BW-NEXT: vzeroupper
6220 ; AVX512BW-NEXT: retq
6221 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6222 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6223 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6224 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
6225 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47, i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63>
6226 %out.bytevec = bitcast <32 x i16> %broadcast.of.zextinreg to <64 x i8>
6227 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6228 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

6233 define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
; SSE2: # %bb.0:
6236 ; SSE2-NEXT: movdqa (%rdi), %xmm0
6237 ; SSE2-NEXT: paddb (%rsi), %xmm0
6238 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
6239 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6240 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
6241 ; SSE2-NEXT: paddb %xmm0, %xmm1
6242 ; SSE2-NEXT: movdqa (%rdx), %xmm2
6243 ; SSE2-NEXT: paddb %xmm0, %xmm2
6244 ; SSE2-NEXT: movdqa 48(%rdx), %xmm3
6245 ; SSE2-NEXT: paddb %xmm0, %xmm3
6246 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
6247 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
6248 ; SSE2-NEXT: movdqa %xmm3, 48(%rcx)
6249 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
6250 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
6251 ; SSE2-NEXT: retq
6253 ; SSE42-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
6254 ; SSE42: # %bb.0:
6255 ; SSE42-NEXT: movdqa (%rdi), %xmm0
6256 ; SSE42-NEXT: paddb (%rsi), %xmm0
6257 ; SSE42-NEXT: pxor %xmm1, %xmm1
6258 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
6259 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
6260 ; SSE42-NEXT: movdqa 16(%rdx), %xmm1
6261 ; SSE42-NEXT: paddb %xmm0, %xmm1
6262 ; SSE42-NEXT: movdqa (%rdx), %xmm2
6263 ; SSE42-NEXT: paddb %xmm0, %xmm2
6264 ; SSE42-NEXT: movdqa 48(%rdx), %xmm3
6265 ; SSE42-NEXT: paddb %xmm0, %xmm3
6266 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
6267 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
6268 ; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
6269 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
6270 ; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
6271 ; SSE42-NEXT: retq
6273 ; AVX-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
6274 ; AVX: # %bb.0:
6275 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6276 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6277 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
6278 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
6279 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
6280 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1
6281 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
6282 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3
6283 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
6284 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
6285 ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
6286 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
6287 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
6288 ; AVX-NEXT: retq
6290 ; AVX2-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
6291 ; AVX2: # %bb.0:
6292 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
6293 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6294 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
6295 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero
6296 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6297 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6298 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
6299 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
6300 ; AVX2-NEXT: vzeroupper
6301 ; AVX2-NEXT: retq
6303 ; AVX512F-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
6304 ; AVX512F: # %bb.0:
6305 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
6306 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6307 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
6308 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero
6309 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6310 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6311 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
6312 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
6313 ; AVX512F-NEXT: vzeroupper
6314 ; AVX512F-NEXT: retq
6316 ; AVX512DQ-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
6317 ; AVX512DQ: # %bb.0:
6318 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
6319 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6320 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
6321 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero
6322 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6323 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6324 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
6325 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
6326 ; AVX512DQ-NEXT: vzeroupper
6327 ; AVX512DQ-NEXT: retq
6329 ; AVX512BW-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
6330 ; AVX512BW: # %bb.0:
6331 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
6332 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6333 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
6334 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
6335 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,1],zero,zero,zero,zero,zero,zero,zmm0[0,1],zero,zero,zero,zero,zero,zero,zmm0[16,17],zero,zero,zero,zero,zero,zero,zmm0[16,17],zero,zero,zero,zero,zero,zero,zmm0[32,33],zero,zero,zero,zero,zero,zero,zmm0[32,33],zero,zero,zero,zero,zero,zero,zmm0[48,49],zero,zero,zero,zero,zero,zero,zmm0[48,49],zero,zero,zero,zero,zero,zero
6336 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6337 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6338 ; AVX512BW-NEXT: vzeroupper
6339 ; AVX512BW-NEXT: retq
6340 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6341 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6342 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6343 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
6344 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63>
6345 %out.bytevec = bitcast <32 x i16> %broadcast.of.zextinreg to <64 x i8>
6346 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6347 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
6348 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
6349 ret void
6350 }
6352 define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6353 ; SSE2-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
6354 ; SSE2: # %bb.0:
6355 ; SSE2-NEXT: movdqa (%rdi), %xmm0
6356 ; SSE2-NEXT: paddb (%rsi), %xmm0
6357 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6358 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
6359 ; SSE2-NEXT: paddb %xmm0, %xmm1
6360 ; SSE2-NEXT: movdqa (%rdx), %xmm2
6361 ; SSE2-NEXT: paddb %xmm0, %xmm2
6362 ; SSE2-NEXT: movdqa 48(%rdx), %xmm3
6363 ; SSE2-NEXT: paddb %xmm0, %xmm3
6364 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
6365 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
6366 ; SSE2-NEXT: movdqa %xmm3, 48(%rcx)
6367 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
6368 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
6369 ; SSE2-NEXT: retq
6371 ; SSE42-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
6372 ; SSE42: # %bb.0:
6373 ; SSE42-NEXT: movdqa (%rdi), %xmm0
6374 ; SSE42-NEXT: paddb (%rsi), %xmm0
6375 ; SSE42-NEXT: pxor %xmm1, %xmm1
6376 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
6377 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0
6378 ; SSE42-NEXT: paddb %xmm1, %xmm0
6379 ; SSE42-NEXT: movdqa (%rdx), %xmm2
6380 ; SSE42-NEXT: paddb %xmm1, %xmm2
6381 ; SSE42-NEXT: movdqa 48(%rdx), %xmm3
6382 ; SSE42-NEXT: paddb %xmm1, %xmm3
6383 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
6384 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
6385 ; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
6386 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
6387 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
6388 ; SSE42-NEXT: retq
6390 ; AVX-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
6391 ; AVX: # %bb.0:
6392 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6393 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6394 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
6395 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
6396 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1
6397 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
6398 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3
6399 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
6400 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
6401 ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
6402 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
6403 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
6404 ; AVX-NEXT: retq
6406 ; AVX2-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
6407 ; AVX2: # %bb.0:
6408 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
6409 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6410 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
6411 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
6412 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
6413 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6414 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6415 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
6416 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
6417 ; AVX2-NEXT: vzeroupper
6418 ; AVX2-NEXT: retq
6420 ; AVX512F-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
6421 ; AVX512F: # %bb.0:
6422 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
6423 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6424 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
6425 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
6426 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
6427 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6428 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6429 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
6430 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
6431 ; AVX512F-NEXT: vzeroupper
6432 ; AVX512F-NEXT: retq
6434 ; AVX512DQ-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
6435 ; AVX512DQ: # %bb.0:
6436 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
6437 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6438 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
6439 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
6440 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
6441 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6442 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6443 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
6444 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
6445 ; AVX512DQ-NEXT: vzeroupper
6446 ; AVX512DQ-NEXT: retq
6448 ; AVX512BW-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
6449 ; AVX512BW: # %bb.0:
6450 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
6451 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6452 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
6453 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
6454 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
6455 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6456 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6457 ; AVX512BW-NEXT: vzeroupper
6458 ; AVX512BW-NEXT: retq
6459 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6460 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6461 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6462 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
6463 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
6464 %out.bytevec = bitcast <32 x i16> %broadcast.of.zextinreg to <64 x i8>
6465 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6466 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
6467 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
6468 ret void
6469 }
6471 define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6472 ; SSE2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
6473 ; SSE2: # %bb.0:
6474 ; SSE2-NEXT: movdqa (%rdi), %xmm0
6475 ; SSE2-NEXT: paddb (%rsi), %xmm0
6476 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6477 ; SSE2-NEXT: movaps 16(%rdx), %xmm1
6478 ; SSE2-NEXT: movaps 48(%rdx), %xmm2
6479 ; SSE2-NEXT: movdqa (%rdx), %xmm3
6480 ; SSE2-NEXT: paddb %xmm0, %xmm3
6481 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
6482 ; SSE2-NEXT: movaps %xmm2, 48(%rcx)
6483 ; SSE2-NEXT: movaps %xmm1, 16(%rcx)
6484 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
6485 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
6486 ; SSE2-NEXT: retq
6488 ; SSE42-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
6489 ; SSE42: # %bb.0:
6490 ; SSE42-NEXT: movdqa (%rdi), %xmm0
6491 ; SSE42-NEXT: paddb (%rsi), %xmm0
6492 ; SSE42-NEXT: pxor %xmm1, %xmm1
6493 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
6494 ; SSE42-NEXT: movaps 16(%rdx), %xmm0
6495 ; SSE42-NEXT: movaps 48(%rdx), %xmm2
6496 ; SSE42-NEXT: movdqa (%rdx), %xmm3
6497 ; SSE42-NEXT: paddb %xmm1, %xmm3
6498 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
6499 ; SSE42-NEXT: movaps %xmm2, 48(%rcx)
6500 ; SSE42-NEXT: movaps %xmm0, 16(%rcx)
6501 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
6502 ; SSE42-NEXT: movdqa %xmm3, (%rcx)
6503 ; SSE42-NEXT: retq
6505 ; AVX-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
6506 ; AVX: # %bb.0:
6507 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6508 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6509 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
6510 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
6511 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1
6512 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
6513 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
6514 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3
6515 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
6516 ; AVX-NEXT: vmovaps %xmm3, 48(%rcx)
6517 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
6518 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
6519 ; AVX-NEXT: retq
6521 ; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
6522 ; AVX2: # %bb.0:
6523 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
6524 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6525 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
6526 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
6527 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6528 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6529 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
6530 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
6531 ; AVX2-NEXT: vzeroupper
6532 ; AVX2-NEXT: retq
6534 ; AVX512F-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
6535 ; AVX512F: # %bb.0:
6536 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
6537 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6538 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
6539 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
6540 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6541 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6542 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
6543 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
6544 ; AVX512F-NEXT: vzeroupper
6545 ; AVX512F-NEXT: retq
6547 ; AVX512DQ-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
6548 ; AVX512DQ: # %bb.0:
6549 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
6550 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6551 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
6552 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
6553 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6554 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6555 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
6556 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
6557 ; AVX512DQ-NEXT: vzeroupper
6558 ; AVX512DQ-NEXT: retq
6560 ; AVX512BW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
6561 ; AVX512BW: # %bb.0:
6562 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
6563 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6564 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
6565 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
6566 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6567 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6568 ; AVX512BW-NEXT: vzeroupper
6569 ; AVX512BW-NEXT: retq
6570 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6571 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6572 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6573 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
6574 %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
6575 %out.bytevec = bitcast <32 x i16> %broadcast.of.zextinreg to <64 x i8>
6576 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6577 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
6578 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
6579 ret void
6580 }
6582 define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6583 ; SSE2-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
6584 ; SSE2: # %bb.0:
6585 ; SSE2-NEXT: movdqa (%rdi), %xmm0
6586 ; SSE2-NEXT: paddb (%rsi), %xmm0
6587 ; SSE2-NEXT: pxor %xmm1, %xmm1
6588 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
6589 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
6590 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
6591 ; SSE2-NEXT: paddb %xmm0, %xmm1
6592 ; SSE2-NEXT: movdqa (%rdx), %xmm2
6593 ; SSE2-NEXT: paddb %xmm0, %xmm2
6594 ; SSE2-NEXT: movdqa 48(%rdx), %xmm3
6595 ; SSE2-NEXT: paddb %xmm0, %xmm3
6596 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
6597 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
6598 ; SSE2-NEXT: movdqa %xmm3, 48(%rcx)
6599 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
6600 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
6601 ; SSE2-NEXT: retq
6603 ; SSE42-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
6604 ; SSE42: # %bb.0:
6605 ; SSE42-NEXT: movdqa (%rdi), %xmm0
6606 ; SSE42-NEXT: paddb (%rsi), %xmm0
6607 ; SSE42-NEXT: pxor %xmm1, %xmm1
6608 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
6609 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
6610 ; SSE42-NEXT: movdqa 16(%rdx), %xmm1
6611 ; SSE42-NEXT: paddb %xmm0, %xmm1
6612 ; SSE42-NEXT: movdqa (%rdx), %xmm2
6613 ; SSE42-NEXT: paddb %xmm0, %xmm2
6614 ; SSE42-NEXT: movdqa 48(%rdx), %xmm3
6615 ; SSE42-NEXT: paddb %xmm0, %xmm3
6616 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
6617 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
6618 ; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
6619 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
6620 ; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
6621 ; SSE42-NEXT: retq
6623 ; AVX-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
6624 ; AVX: # %bb.0:
6625 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6626 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6627 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
6628 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
6629 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
6630 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
6631 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
6632 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2
6633 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
6634 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
6635 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
6636 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
6637 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
6638 ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
6639 ; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
6640 ; AVX-NEXT: vzeroupper
6641 ; AVX-NEXT: retq
6643 ; AVX2-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
6644 ; AVX2: # %bb.0:
6645 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
6646 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
6647 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
6648 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
6649 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
6650 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6651 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6652 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
6653 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
6654 ; AVX2-NEXT: vzeroupper
6655 ; AVX2-NEXT: retq
6657 ; AVX512F-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
6658 ; AVX512F: # %bb.0:
6659 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
6660 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6661 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
6662 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
6663 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
6664 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
6665 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6666 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
6667 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
6668 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
6669 ; AVX512F-NEXT: vzeroupper
6670 ; AVX512F-NEXT: retq
6672 ; AVX512DQ-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
6673 ; AVX512DQ: # %bb.0:
6674 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
6675 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6676 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
6677 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
6678 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
6679 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
6680 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6681 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
6682 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
6683 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
6684 ; AVX512DQ-NEXT: vzeroupper
6685 ; AVX512DQ-NEXT: retq
6687 ; AVX512BW-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
6688 ; AVX512BW: # %bb.0:
6689 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
6690 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
6691 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
6692 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
6693 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
6694 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
6695 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6696 ; AVX512BW-NEXT: vzeroupper
6697 ; AVX512BW-NEXT: retq
6698 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6699 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6700 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6701 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
6702 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31>
6703 %out.bytevec = bitcast <16 x i32> %broadcast.of.zextinreg to <64 x i8>
6704 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6705 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
6706 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
6707 ret void
6708 }
6710 define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6711 ; SSE2-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
6712 ; SSE2: # %bb.0:
6713 ; SSE2-NEXT: movdqa (%rdi), %xmm0
6714 ; SSE2-NEXT: paddb (%rsi), %xmm0
6715 ; SSE2-NEXT: xorps %xmm1, %xmm1
6716 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
6717 ; SSE2-NEXT: movdqa 16(%rdx), %xmm0
6718 ; SSE2-NEXT: paddb %xmm1, %xmm0
6719 ; SSE2-NEXT: movdqa (%rdx), %xmm2
6720 ; SSE2-NEXT: paddb %xmm1, %xmm2
6721 ; SSE2-NEXT: movdqa 48(%rdx), %xmm3
6722 ; SSE2-NEXT: paddb %xmm1, %xmm3
6723 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
6724 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
6725 ; SSE2-NEXT: movdqa %xmm3, 48(%rcx)
6726 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
6727 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
6728 ; SSE2-NEXT: retq
6730 ; SSE42-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
6731 ; SSE42: # %bb.0:
6732 ; SSE42-NEXT: movdqa (%rdi), %xmm0
6733 ; SSE42-NEXT: paddb (%rsi), %xmm0
6734 ; SSE42-NEXT: pxor %xmm1, %xmm1
6735 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
6736 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0
6737 ; SSE42-NEXT: paddb %xmm1, %xmm0
6738 ; SSE42-NEXT: movdqa (%rdx), %xmm2
6739 ; SSE42-NEXT: paddb %xmm1, %xmm2
6740 ; SSE42-NEXT: movdqa 48(%rdx), %xmm3
6741 ; SSE42-NEXT: paddb %xmm1, %xmm3
6742 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
6743 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
6744 ; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
6745 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
6746 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
6747 ; SSE42-NEXT: retq
6749 ; AVX-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
6750 ; AVX: # %bb.0:
6751 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6752 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6753 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
6754 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
6755 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
6756 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
6757 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2
6758 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
6759 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
6760 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
6761 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
6762 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
6763 ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
6764 ; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
6765 ; AVX-NEXT: vzeroupper
6766 ; AVX-NEXT: retq
6768 ; AVX2-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
6769 ; AVX2: # %bb.0:
6770 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
6771 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6772 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
6773 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
6774 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
6775 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6776 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6777 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
6778 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
6779 ; AVX2-NEXT: vzeroupper
6780 ; AVX2-NEXT: retq
6782 ; AVX512F-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
6783 ; AVX512F: # %bb.0:
6784 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
6785 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6786 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
6787 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
6788 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
6789 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
6790 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6791 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
6792 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
6793 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
6794 ; AVX512F-NEXT: vzeroupper
6795 ; AVX512F-NEXT: retq
6797 ; AVX512DQ-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
6798 ; AVX512DQ: # %bb.0:
6799 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
6800 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6801 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
6802 ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
6803 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
6804 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
6805 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6806 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
6807 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
6808 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
6809 ; AVX512DQ-NEXT: vzeroupper
6810 ; AVX512DQ-NEXT: retq
6812 ; AVX512BW-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
6813 ; AVX512BW: # %bb.0:
6814 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
6815 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
6816 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
6817 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31]
6818 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
6819 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
6820 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6821 ; AVX512BW-NEXT: vzeroupper
6822 ; AVX512BW-NEXT: retq
6823 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6824 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6825 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6826 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
6827 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31>
6828 %out.bytevec = bitcast <16 x i32> %broadcast.of.zextinreg to <64 x i8>
6829 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6830 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
6831 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
6832 ret void
6833 }
6835 define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6836 ; SSE2-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
6837 ; SSE2: # %bb.0:
6838 ; SSE2-NEXT: movdqa (%rdi), %xmm0
6839 ; SSE2-NEXT: paddb (%rsi), %xmm0
6840 ; SSE2-NEXT: xorps %xmm1, %xmm1
6841 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
6842 ; SSE2-NEXT: movaps 16(%rdx), %xmm0
6843 ; SSE2-NEXT: movaps 48(%rdx), %xmm2
6844 ; SSE2-NEXT: movdqa (%rdx), %xmm3
6845 ; SSE2-NEXT: paddb %xmm1, %xmm3
6846 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
6847 ; SSE2-NEXT: movaps %xmm2, 48(%rcx)
6848 ; SSE2-NEXT: movaps %xmm0, 16(%rcx)
6849 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
6850 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
6851 ; SSE2-NEXT: retq
6853 ; SSE42-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
6854 ; SSE42: # %bb.0:
6855 ; SSE42-NEXT: movdqa (%rdi), %xmm0
6856 ; SSE42-NEXT: paddb (%rsi), %xmm0
6857 ; SSE42-NEXT: pxor %xmm1, %xmm1
6858 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
6859 ; SSE42-NEXT: movaps 16(%rdx), %xmm0
6860 ; SSE42-NEXT: movaps 48(%rdx), %xmm2
6861 ; SSE42-NEXT: movdqa (%rdx), %xmm3
6862 ; SSE42-NEXT: paddb %xmm1, %xmm3
6863 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
6864 ; SSE42-NEXT: movaps %xmm2, 48(%rcx)
6865 ; SSE42-NEXT: movaps %xmm0, 16(%rcx)
6866 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
6867 ; SSE42-NEXT: movdqa %xmm3, (%rcx)
6868 ; SSE42-NEXT: retq
6870 ; AVX-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
6871 ; AVX: # %bb.0:
6872 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6873 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6874 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
6875 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
6876 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1
6877 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
6878 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
6879 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3
6880 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
6881 ; AVX-NEXT: vmovaps %xmm3, 48(%rcx)
6882 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
6883 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
6884 ; AVX-NEXT: retq
6886 ; AVX2-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
6887 ; AVX2: # %bb.0:
6888 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
6889 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6890 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
6891 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
6892 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6893 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6894 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
6895 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
6896 ; AVX2-NEXT: vzeroupper
6897 ; AVX2-NEXT: retq
6899 ; AVX512F-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
6900 ; AVX512F: # %bb.0:
6901 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
6902 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6903 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
6904 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
6905 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
6906 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
6907 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6908 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
6909 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
6910 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
6911 ; AVX512F-NEXT: vzeroupper
6912 ; AVX512F-NEXT: retq
6914 ; AVX512DQ-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
6915 ; AVX512DQ: # %bb.0:
6916 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
6917 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6918 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
6919 ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
6920 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
6921 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
6922 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6923 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
6924 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
6925 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
6926 ; AVX512DQ-NEXT: vzeroupper
6927 ; AVX512DQ-NEXT: retq
6929 ; AVX512BW-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
6930 ; AVX512BW: # %bb.0:
6931 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
6932 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6933 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
6934 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
6935 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6936 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6937 ; AVX512BW-NEXT: vzeroupper
6938 ; AVX512BW-NEXT: retq
6939 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6940 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6941 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6942 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
6943 %broadcast.of.zextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
6944 %out.bytevec = bitcast <16 x i32> %broadcast.of.zextinreg to <64 x i8>
6945 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6946 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
6947 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
6948 ret void
6949 }
6951 define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6952 ; SSE-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
6953 ; SSE: # %bb.0:
6954 ; SSE-NEXT: movdqa (%rdi), %xmm0
6955 ; SSE-NEXT: paddb (%rsi), %xmm0
6956 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
6957 ; SSE-NEXT: movdqa 16(%rdx), %xmm1
6958 ; SSE-NEXT: paddb %xmm0, %xmm1
6959 ; SSE-NEXT: movdqa (%rdx), %xmm2
6960 ; SSE-NEXT: paddb %xmm0, %xmm2
6961 ; SSE-NEXT: movdqa 48(%rdx), %xmm3
6962 ; SSE-NEXT: paddb %xmm0, %xmm3
6963 ; SSE-NEXT: paddb 32(%rdx), %xmm0
6964 ; SSE-NEXT: movdqa %xmm0, 32(%rcx)
6965 ; SSE-NEXT: movdqa %xmm3, 48(%rcx)
6966 ; SSE-NEXT: movdqa %xmm2, (%rcx)
6967 ; SSE-NEXT: movdqa %xmm1, 16(%rcx)
6968 ; SSE-NEXT: retq
6970 ; AVX-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
6971 ; AVX: # %bb.0:
6972 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6973 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6974 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
6975 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
6976 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
6977 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
6978 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2
6979 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
6980 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
6981 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
6982 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
6983 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
6984 ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
6985 ; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
6986 ; AVX-NEXT: vzeroupper
6987 ; AVX-NEXT: retq
6989 ; AVX2-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
6990 ; AVX2: # %bb.0:
6991 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
6992 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
6993 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
6994 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
6995 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
6996 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
6997 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6998 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
6999 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
7000 ; AVX2-NEXT: vzeroupper
7001 ; AVX2-NEXT: retq
7003 ; AVX512F-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
7004 ; AVX512F: # %bb.0:
7005 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
7006 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7007 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
7008 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15]
7009 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
7010 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
7011 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
7012 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
7013 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
7014 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
7015 ; AVX512F-NEXT: vzeroupper
7016 ; AVX512F-NEXT: retq
7018 ; AVX512DQ-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
7019 ; AVX512DQ: # %bb.0:
7020 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
7021 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7022 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
7023 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15]
7024 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
7025 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
7026 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
7027 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
7028 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
7029 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
7030 ; AVX512DQ-NEXT: vzeroupper
7031 ; AVX512DQ-NEXT: retq
7033 ; AVX512BW-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
7034 ; AVX512BW: # %bb.0:
7035 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
7036 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
7037 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
7038 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15]
7039 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
7040 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
7041 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
7042 ; AVX512BW-NEXT: vzeroupper
7043 ; AVX512BW-NEXT: retq
7044 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7045 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7046 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7047 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
7048 %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
7049 %out.bytevec = bitcast <8 x i64> %broadcast.of.zextinreg to <64 x i8>
7050 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7051 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
7052 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
7053 ret void
7054 }
7056 define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
7057 ; SSE-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
7058 ; SSE: # %bb.0:
7059 ; SSE-NEXT: movdqa (%rdi), %xmm0
7060 ; SSE-NEXT: paddb (%rsi), %xmm0
7061 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
7062 ; SSE-NEXT: movaps 16(%rdx), %xmm1
7063 ; SSE-NEXT: movaps 48(%rdx), %xmm2
7064 ; SSE-NEXT: movdqa (%rdx), %xmm3
7065 ; SSE-NEXT: paddb %xmm0, %xmm3
7066 ; SSE-NEXT: paddb 32(%rdx), %xmm0
7067 ; SSE-NEXT: movaps %xmm2, 48(%rcx)
7068 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
7069 ; SSE-NEXT: movdqa %xmm0, 32(%rcx)
7070 ; SSE-NEXT: movdqa %xmm3, (%rcx)
7071 ; SSE-NEXT: retq
7073 ; AVX-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
7074 ; AVX: # %bb.0:
7075 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
7076 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7077 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
7078 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1
7079 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
7080 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
7081 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3
7082 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
7083 ; AVX-NEXT: vmovaps %xmm3, 48(%rcx)
7084 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
7085 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
7086 ; AVX-NEXT: retq
7088 ; AVX2-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
7089 ; AVX2: # %bb.0:
7090 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
7091 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7092 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
7093 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
7094 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7095 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
7096 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
7097 ; AVX2-NEXT: vzeroupper
7098 ; AVX2-NEXT: retq
7100 ; AVX512F-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
7101 ; AVX512F: # %bb.0:
7102 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
7103 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7104 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7]
7105 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
7106 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
7107 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
7108 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
7109 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
7110 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
7111 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
7112 ; AVX512F-NEXT: vzeroupper
7113 ; AVX512F-NEXT: retq
7115 ; AVX512DQ-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
7116 ; AVX512DQ: # %bb.0:
7117 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
7118 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7119 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7]
7120 ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
7121 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
7122 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
7123 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
7124 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
7125 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
7126 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
7127 ; AVX512DQ-NEXT: vzeroupper
7128 ; AVX512DQ-NEXT: retq
7130 ; AVX512BW-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
7131 ; AVX512BW: # %bb.0:
7132 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
7133 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7134 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
7135 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
7136 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
7137 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
7138 ; AVX512BW-NEXT: vzeroupper
7139 ; AVX512BW-NEXT: retq
7140 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7141 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7142 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7143 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
7144 %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
7145 %out.bytevec = bitcast <8 x i64> %broadcast.of.zextinreg to <64 x i8>
7146 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7147 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
7148 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
7149 ret void
7150 }
7152 define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
7153 ; SSE-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
7154 ; SSE: # %bb.0:
7155 ; SSE-NEXT: movdqa (%rdi), %xmm0
7156 ; SSE-NEXT: paddb (%rsi), %xmm0
7157 ; SSE-NEXT: movaps 16(%rdx), %xmm1
7158 ; SSE-NEXT: movaps 48(%rdx), %xmm2
7159 ; SSE-NEXT: movdqa (%rdx), %xmm3
7160 ; SSE-NEXT: paddb %xmm0, %xmm3
7161 ; SSE-NEXT: paddb 32(%rdx), %xmm0
7162 ; SSE-NEXT: movaps %xmm2, 48(%rcx)
7163 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
7164 ; SSE-NEXT: movdqa %xmm0, 32(%rcx)
7165 ; SSE-NEXT: movdqa %xmm3, (%rcx)
7166 ; SSE-NEXT: retq
7168 ; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
7169 ; AVX: # %bb.0:
7170 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
7171 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7172 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1
7173 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
7174 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
7175 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3
7176 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
7177 ; AVX-NEXT: vmovaps %xmm3, 48(%rcx)
7178 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
7179 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
7180 ; AVX-NEXT: retq
7182 ; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
7183 ; AVX2: # %bb.0:
7184 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
7185 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7186 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
7187 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7188 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
7189 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
7190 ; AVX2-NEXT: vzeroupper
7191 ; AVX2-NEXT: retq
7193 ; AVX512F-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
7194 ; AVX512F: # %bb.0:
7195 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
7196 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7197 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
7198 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15]
7199 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
7200 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
7201 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
7202 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
7203 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
7204 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
7205 ; AVX512F-NEXT: vzeroupper
7206 ; AVX512F-NEXT: retq
7208 ; AVX512DQ-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
7209 ; AVX512DQ: # %bb.0:
7210 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
7211 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7212 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
7213 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15]
7214 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
7215 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
7216 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
7217 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
7218 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
7219 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
7220 ; AVX512DQ-NEXT: vzeroupper
7221 ; AVX512DQ-NEXT: retq
7223 ; AVX512BW-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
7224 ; AVX512BW: # %bb.0:
7225 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
7226 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7227 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
7228 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
7229 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
7230 ; AVX512BW-NEXT: vzeroupper
7231 ; AVX512BW-NEXT: retq
7232 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7233 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7234 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7235 %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128>
7236 %broadcast.of.zextinreg = shufflevector <4 x i128> %in.vec.cast, <4 x i128> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
7237 %out.bytevec = bitcast <4 x i128> %broadcast.of.zextinreg to <64 x i8>
7238 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7239 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
7240 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
7241 ret void
7242 }
7243 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
7247 ; FALLBACK10: {{.*}}
7248 ; FALLBACK11: {{.*}}
7249 ; FALLBACK12: {{.*}}
7250 ; FALLBACK13: {{.*}}