; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-SLOW,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ,AVX512DQ-FAST,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK13
17 define void @vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
18 ; SSE2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
20 ; SSE2-NEXT: movdqa (%rdi), %xmm0
21 ; SSE2-NEXT: paddb (%rsi), %xmm0
22 ; SSE2-NEXT: pxor %xmm1, %xmm1
23 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
24 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
25 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
26 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,2,4,5,6,7]
27 ; SSE2-NEXT: packuswb %xmm0, %xmm0
28 ; SSE2-NEXT: paddb (%rdx), %xmm0
29 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
32 ; SSE42-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
34 ; SSE42-NEXT: movdqa (%rdi), %xmm0
35 ; SSE42-NEXT: paddb (%rsi), %xmm0
36 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
37 ; SSE42-NEXT: paddb (%rdx), %xmm0
38 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
41 ; AVX-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
43 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
44 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
45 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
46 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
47 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
50 ; AVX2-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
52 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
53 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
54 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
55 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
56 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
57 ; AVX2-NEXT: vzeroupper
60 ; AVX512F-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
62 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
63 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
64 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
65 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
66 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
67 ; AVX512F-NEXT: vzeroupper
70 ; AVX512DQ-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
72 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
73 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
74 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
75 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
76 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
77 ; AVX512DQ-NEXT: vzeroupper
80 ; AVX512BW-LABEL: vec32_i8_widen_to_i16_factor2_broadcast_to_v2i16_factor2:
82 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
83 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
84 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
85 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
86 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
87 ; AVX512BW-NEXT: vzeroupper
89 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
90 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
91 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
92 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
93 %out.bytevec.padded = shufflevector <4 x i8> %broadcast.of.aextinreg, <4 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
94 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
95 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

100 define void @vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
101 ; SSE2-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
103 ; SSE2-NEXT: movdqa (%rdi), %xmm0
104 ; SSE2-NEXT: paddb (%rsi), %xmm0
105 ; SSE2-NEXT: pxor %xmm1, %xmm1
106 ; SSE2-NEXT: movdqa %xmm0, %xmm2
107 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
108 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
109 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
110 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
111 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
112 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
113 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
114 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
115 ; SSE2-NEXT: packuswb %xmm2, %xmm2
116 ; SSE2-NEXT: paddb (%rdx), %xmm2
117 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
120 ; SSE42-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
122 ; SSE42-NEXT: movdqa (%rdi), %xmm0
123 ; SSE42-NEXT: paddb (%rsi), %xmm0
124 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
125 ; SSE42-NEXT: paddb (%rdx), %xmm0
126 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
129 ; AVX-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
131 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
132 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
133 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
134 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
135 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
138 ; AVX2-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
140 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
141 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
142 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
143 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
144 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
145 ; AVX2-NEXT: vzeroupper
148 ; AVX512F-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
150 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
151 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
152 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
153 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
154 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
155 ; AVX512F-NEXT: vzeroupper
158 ; AVX512DQ-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
160 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
161 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
162 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
163 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
164 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
165 ; AVX512DQ-NEXT: vzeroupper
166 ; AVX512DQ-NEXT: retq
168 ; AVX512BW-LABEL: vec64_i8_widen_to_i16_factor2_broadcast_to_v4i16_factor4:
170 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
171 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
172 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,0,11,0,13,0,15,u,u,u,u,u,u,u,u]
173 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
174 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
175 ; AVX512BW-NEXT: vzeroupper
176 ; AVX512BW-NEXT: retq
177 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
178 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
179 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
180 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
181 %out.bytevec.padded = shufflevector <8 x i8> %broadcast.of.aextinreg, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
182 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
183 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

188 define void @vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
189 ; SSE2-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
191 ; SSE2-NEXT: movdqa (%rdi), %xmm0
192 ; SSE2-NEXT: paddb (%rsi), %xmm0
193 ; SSE2-NEXT: pxor %xmm1, %xmm1
194 ; SSE2-NEXT: movdqa %xmm0, %xmm2
195 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
196 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535]
197 ; SSE2-NEXT: pand %xmm3, %xmm2
198 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
199 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
200 ; SSE2-NEXT: pandn %xmm0, %xmm3
201 ; SSE2-NEXT: por %xmm2, %xmm3
202 ; SSE2-NEXT: packuswb %xmm3, %xmm3
203 ; SSE2-NEXT: paddb (%rdx), %xmm3
204 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
207 ; SSE42-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
209 ; SSE42-NEXT: movdqa (%rdi), %xmm0
210 ; SSE42-NEXT: paddb (%rsi), %xmm0
211 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
212 ; SSE42-NEXT: paddb (%rdx), %xmm0
213 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
216 ; AVX-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
218 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
219 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
220 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
221 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
222 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
225 ; AVX2-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
227 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
228 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
229 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
230 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
231 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
232 ; AVX2-NEXT: vzeroupper
235 ; AVX512F-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
237 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
238 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
239 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
240 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
241 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
242 ; AVX512F-NEXT: vzeroupper
245 ; AVX512DQ-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
247 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
248 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
249 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
250 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
251 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
252 ; AVX512DQ-NEXT: vzeroupper
253 ; AVX512DQ-NEXT: retq
255 ; AVX512BW-LABEL: vec64_i8_widen_to_i32_factor4_broadcast_to_v2i32_factor2:
257 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
258 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
259 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,9,10,11,0,13,14,15,u,u,u,u,u,u,u,u]
260 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
261 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
262 ; AVX512BW-NEXT: vzeroupper
263 ; AVX512BW-NEXT: retq
264 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
265 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
266 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
267 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
268 %out.bytevec.padded = shufflevector <8 x i8> %broadcast.of.aextinreg, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
269 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
270 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

275 define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
276 ; SSE2-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
278 ; SSE2-NEXT: movdqa (%rdi), %xmm0
279 ; SSE2-NEXT: paddb (%rsi), %xmm0
280 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
281 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
282 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,2,4,5,6,7]
283 ; SSE2-NEXT: paddb (%rdx), %xmm0
284 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
287 ; SSE42-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
289 ; SSE42-NEXT: movdqa (%rdi), %xmm0
290 ; SSE42-NEXT: paddb (%rsi), %xmm0
291 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
292 ; SSE42-NEXT: paddb (%rdx), %xmm0
293 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
296 ; AVX-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
298 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
299 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
300 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
301 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
302 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
305 ; AVX2-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
307 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
308 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
309 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,14,15,10,11,12,13,14,15]
310 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
311 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
312 ; AVX2-NEXT: vzeroupper
315 ; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
317 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
318 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
319 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
320 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
321 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
322 ; AVX512F-NEXT: vzeroupper
325 ; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
327 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
328 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
329 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
330 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
331 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
332 ; AVX512DQ-NEXT: vzeroupper
333 ; AVX512DQ-NEXT: retq
335 ; AVX512BW-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
337 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
338 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
339 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
340 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
341 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
342 ; AVX512BW-NEXT: vzeroupper
343 ; AVX512BW-NEXT: retq
344 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
345 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
346 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
347 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
348 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
349 %out.bytevec = bitcast <4 x i16> %broadcast.of.aextinreg to <8 x i8>
350 %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
351 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
352 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

357 define void @vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
358 ; SSE2-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
360 ; SSE2-NEXT: movdqa (%rdi), %xmm0
361 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
362 ; SSE2-NEXT: paddb (%rsi), %xmm0
363 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
364 ; SSE2-NEXT: psrlw $8, %xmm1
365 ; SSE2-NEXT: packuswb %xmm1, %xmm1
366 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
367 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
368 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
369 ; SSE2-NEXT: paddb (%rdx), %xmm0
370 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
373 ; SSE42-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
375 ; SSE42-NEXT: movdqa (%rdi), %xmm0
376 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
377 ; SSE42-NEXT: paddb (%rsi), %xmm0
378 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
379 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
380 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
381 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
382 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
383 ; SSE42-NEXT: paddb (%rdx), %xmm0
384 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
387 ; AVX-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
389 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
390 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
391 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
392 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
393 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
394 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
395 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
396 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
397 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
398 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
401 ; AVX2-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
403 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
404 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
405 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
406 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
407 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
408 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
409 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
410 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
411 ; AVX2-NEXT: vzeroupper
414 ; AVX512F-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
416 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
417 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
418 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
419 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
420 ; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm0
421 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
422 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
423 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
424 ; AVX512F-NEXT: vzeroupper
427 ; AVX512DQ-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
429 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
430 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
431 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
432 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
433 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm0
434 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
435 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
436 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
437 ; AVX512DQ-NEXT: vzeroupper
438 ; AVX512DQ-NEXT: retq
440 ; AVX512BW-LABEL: vec128_i8_widen_to_i16_factor2_broadcast_to_v8i16_factor8:
442 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
443 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
444 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
445 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
446 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0
447 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
448 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
449 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
450 ; AVX512BW-NEXT: vzeroupper
451 ; AVX512BW-NEXT: retq
452 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
453 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
454 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
455 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31>
456 %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.aextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
457 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
458 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

463 define void @vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
464 ; SSE2-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
466 ; SSE2-NEXT: movdqa (%rdi), %xmm0
467 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
468 ; SSE2-NEXT: paddb (%rsi), %xmm0
469 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
470 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
471 ; SSE2-NEXT: pand %xmm2, %xmm1
472 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
473 ; SSE2-NEXT: pandn %xmm0, %xmm2
474 ; SSE2-NEXT: por %xmm1, %xmm2
475 ; SSE2-NEXT: paddb (%rdx), %xmm2
476 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
479 ; SSE42-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
481 ; SSE42-NEXT: movdqa (%rdi), %xmm0
482 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
483 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
484 ; SSE42-NEXT: paddb (%rsi), %xmm0
485 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
486 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
487 ; SSE42-NEXT: paddb (%rdx), %xmm0
488 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
491 ; AVX-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
493 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
494 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
495 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
496 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
497 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
498 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
499 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
500 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
503 ; AVX2-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
505 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
506 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
507 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
508 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
509 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
510 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
511 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
512 ; AVX2-NEXT: vzeroupper
515 ; AVX512F-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
517 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
518 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
519 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
520 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
521 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
522 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
523 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
524 ; AVX512F-NEXT: vzeroupper
527 ; AVX512DQ-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
529 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
530 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
531 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
532 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
533 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
534 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
535 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
536 ; AVX512DQ-NEXT: vzeroupper
537 ; AVX512DQ-NEXT: retq
539 ; AVX512BW-LABEL: vec128_i8_widen_to_i32_factor4_broadcast_to_v4i32_factor4:
541 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
542 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
543 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
544 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
545 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
546 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
547 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
548 ; AVX512BW-NEXT: vzeroupper
549 ; AVX512BW-NEXT: retq
550 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
551 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
552 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
553 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31>
554 %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.aextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
555 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
556 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

561 define void @vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
562 ; SSE2-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
564 ; SSE2-NEXT: movdqa (%rdi), %xmm0
565 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
566 ; SSE2-NEXT: paddb (%rsi), %xmm0
567 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
568 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
569 ; SSE2-NEXT: pand %xmm2, %xmm1
570 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
571 ; SSE2-NEXT: pandn %xmm0, %xmm2
572 ; SSE2-NEXT: por %xmm1, %xmm2
573 ; SSE2-NEXT: paddb (%rdx), %xmm2
574 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
577 ; SSE42-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
579 ; SSE42-NEXT: movdqa (%rdi), %xmm0
580 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
581 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
582 ; SSE42-NEXT: paddb (%rsi), %xmm0
583 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
584 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
585 ; SSE42-NEXT: paddb (%rdx), %xmm0
586 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
589 ; AVX-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
591 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
592 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
593 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
594 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
595 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
596 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
597 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
598 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
601 ; AVX2-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
603 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
604 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
605 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
606 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
607 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
608 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
609 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
610 ; AVX2-NEXT: vzeroupper
613 ; AVX512F-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
615 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
616 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
617 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
618 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
619 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
620 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
621 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
622 ; AVX512F-NEXT: vzeroupper
625 ; AVX512DQ-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
627 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
628 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
629 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
630 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
631 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
632 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
633 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
634 ; AVX512DQ-NEXT: vzeroupper
635 ; AVX512DQ-NEXT: retq
637 ; AVX512BW-LABEL: vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2:
639 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
640 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
641 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
642 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
643 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
644 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
645 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
646 ; AVX512BW-NEXT: vzeroupper
647 ; AVX512BW-NEXT: retq
648 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
649 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
650 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
651 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
652 %out.bytevec.padded = shufflevector <16 x i8> %broadcast.of.aextinreg, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
653 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
654 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

659 define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
660 ; SSE2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
662 ; SSE2-NEXT: movdqa (%rdi), %xmm0
663 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
664 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
665 ; SSE2-NEXT: paddb (%rsi), %xmm0
666 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
667 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
668 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
669 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
670 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
671 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
672 ; SSE2-NEXT: paddb (%rdx), %xmm0
673 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
676 ; SSE42-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
678 ; SSE42-NEXT: movdqa (%rdi), %xmm0
679 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
680 ; SSE42-NEXT: paddb (%rsi), %xmm0
681 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
682 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
683 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
684 ; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
685 ; SSE42-NEXT: paddb (%rdx), %xmm0
686 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
689 ; AVX-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
691 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
692 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
693 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
694 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
695 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
696 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
697 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
698 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
699 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
702 ; AVX2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
704 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
705 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
706 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
707 ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
708 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
709 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
710 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
711 ; AVX2-NEXT: vzeroupper
714 ; AVX512F-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
716 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
717 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
718 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
719 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
720 ; AVX512F-NEXT: vmovd %xmm0, %eax
721 ; AVX512F-NEXT: vpinsrw $2, %eax, %xmm2, %xmm0
722 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
723 ; AVX512F-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
724 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
725 ; AVX512F-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
726 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
727 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
728 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
729 ; AVX512F-NEXT: vzeroupper
732 ; AVX512DQ-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
734 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
735 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
736 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
737 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
738 ; AVX512DQ-NEXT: vmovd %xmm0, %eax
739 ; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm2, %xmm0
740 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
741 ; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
742 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
743 ; AVX512DQ-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
744 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
745 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
746 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
747 ; AVX512DQ-NEXT: vzeroupper
748 ; AVX512DQ-NEXT: retq
750 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
751 ; AVX512BW-SLOW: # %bb.0:
752 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
753 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
754 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
755 ; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0
756 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
757 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
758 ; AVX512BW-SLOW-NEXT: vzeroupper
759 ; AVX512BW-SLOW-NEXT: retq
761 ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
762 ; AVX512BW-FAST: # %bb.0:
763 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
764 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
765 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
766 ; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
767 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax
768 ; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
769 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
770 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
771 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
772 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
773 ; AVX512BW-FAST-NEXT: vzeroupper
774 ; AVX512BW-FAST-NEXT: retq
775 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
776 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
777 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
778 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
779 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
780 %out.bytevec = bitcast <8 x i16> %broadcast.of.aextinreg to <16 x i8>
781 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
782 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
783 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

788 define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
789 ; SSE2-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
791 ; SSE2-NEXT: movdqa (%rdi), %xmm0
792 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
793 ; SSE2-NEXT: paddb (%rsi), %xmm0
794 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
795 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
796 ; SSE2-NEXT: pand %xmm2, %xmm1
797 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
798 ; SSE2-NEXT: pandn %xmm0, %xmm2
799 ; SSE2-NEXT: por %xmm1, %xmm2
800 ; SSE2-NEXT: paddb (%rdx), %xmm2
801 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
804 ; SSE42-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
806 ; SSE42-NEXT: movdqa (%rdi), %xmm0
807 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
808 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
809 ; SSE42-NEXT: paddb (%rsi), %xmm0
810 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
811 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
812 ; SSE42-NEXT: paddb (%rdx), %xmm0
813 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
816 ; AVX-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
818 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
819 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
820 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
821 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
822 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
823 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
824 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
825 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
828 ; AVX2-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
830 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
831 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
832 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
833 ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
834 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
835 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
836 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
837 ; AVX2-NEXT: vzeroupper
840 ; AVX512F-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
842 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
843 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
844 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
845 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
846 ; AVX512F-NEXT: vmovd %xmm0, %eax
847 ; AVX512F-NEXT: vpinsrw $4, %eax, %xmm2, %xmm0
848 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
849 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
850 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
851 ; AVX512F-NEXT: vzeroupper
854 ; AVX512DQ-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
856 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
857 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
858 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
859 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
860 ; AVX512DQ-NEXT: vmovd %xmm0, %eax
861 ; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm2, %xmm0
862 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
863 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
864 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
865 ; AVX512DQ-NEXT: vzeroupper
866 ; AVX512DQ-NEXT: retq
868 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
869 ; AVX512BW-SLOW: # %bb.0:
870 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
871 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
872 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
873 ; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
874 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
875 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
876 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
877 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
878 ; AVX512BW-SLOW-NEXT: vzeroupper
879 ; AVX512BW-SLOW-NEXT: retq
881 ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
882 ; AVX512BW-FAST: # %bb.0:
883 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
884 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
885 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
886 ; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
887 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
888 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
889 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
890 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
891 ; AVX512BW-FAST-NEXT: vzeroupper
892 ; AVX512BW-FAST-NEXT: retq
893 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
894 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
895 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
896 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
897 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
898 %out.bytevec = bitcast <8 x i16> %broadcast.of.aextinreg to <16 x i8>
899 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
900 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
901 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

906 define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
907 ; SSE2-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
909 ; SSE2-NEXT: movdqa (%rdi), %xmm0
910 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
911 ; SSE2-NEXT: paddb (%rsi), %xmm0
912 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
913 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
914 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
915 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
916 ; SSE2-NEXT: paddb (%rdx), %xmm0
917 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
920 ; SSE42-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
922 ; SSE42-NEXT: movdqa (%rdi), %xmm0
923 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
924 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
925 ; SSE42-NEXT: paddb (%rsi), %xmm0
926 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
927 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
928 ; SSE42-NEXT: paddb (%rdx), %xmm0
929 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
932 ; AVX-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
934 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
935 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
936 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
937 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
938 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
939 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
940 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
941 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
944 ; AVX2-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
945 ; AVX2-SLOW: # %bb.0:
946 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
947 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
948 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
949 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
950 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
951 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
952 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
953 ; AVX2-SLOW-NEXT: vzeroupper
954 ; AVX2-SLOW-NEXT: retq
956 ; AVX2-FAST-PERLANE-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
957 ; AVX2-FAST-PERLANE: # %bb.0:
958 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
959 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
960 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
961 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %xmm0
962 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
963 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
964 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
965 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
966 ; AVX2-FAST-PERLANE-NEXT: retq
968 ; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
969 ; AVX2-FAST: # %bb.0:
970 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
971 ; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
972 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
973 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
974 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
975 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
976 ; AVX2-FAST-NEXT: vzeroupper
977 ; AVX2-FAST-NEXT: retq
979 ; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
981 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
982 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
983 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
984 ; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0
985 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
986 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
987 ; AVX512F-NEXT: vzeroupper
990 ; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
992 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
993 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
994 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
995 ; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0
996 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
997 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
998 ; AVX512DQ-NEXT: vzeroupper
999 ; AVX512DQ-NEXT: retq
1001 ; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
1002 ; AVX512BW: # %bb.0:
1003 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1004 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
1005 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1006 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
1007 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1008 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1009 ; AVX512BW-NEXT: vzeroupper
1010 ; AVX512BW-NEXT: retq
1011 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1012 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1013 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1014 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
1015 %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
1016 %out.bytevec = bitcast <4 x i32> %broadcast.of.aextinreg to <16 x i8>
1017 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1018 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1019 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

1024 define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1025 ; SSE2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1027 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1028 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1029 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1030 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1031 ; SSE2-NEXT: paddb (%rsi), %xmm0
1032 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1033 ; SSE2-NEXT: psrlw $8, %xmm1
1034 ; SSE2-NEXT: packuswb %xmm1, %xmm1
1035 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1036 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1037 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1038 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1039 ; SSE2-NEXT: psrlw $8, %xmm2
1040 ; SSE2-NEXT: packuswb %xmm2, %xmm2
1041 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1042 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
1043 ; SSE2-NEXT: paddb (%rdx), %xmm3
1044 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
1045 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
1048 ; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1050 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1051 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
1052 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
1053 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
1054 ; SSE42-NEXT: paddb (%rsi), %xmm0
1055 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
1056 ; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
1057 ; SSE42-NEXT: pshufb %xmm3, %xmm1
1058 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1059 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1060 ; SSE42-NEXT: movdqa %xmm0, %xmm4
1061 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
1062 ; SSE42-NEXT: pshufb %xmm3, %xmm2
1063 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1064 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1065 ; SSE42-NEXT: paddb (%rdx), %xmm4
1066 ; SSE42-NEXT: movdqa %xmm4, (%rcx)
1067 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1070 ; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1072 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1073 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1074 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1075 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1076 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1077 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1078 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
1079 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1080 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1081 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1082 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1083 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1084 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1085 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1086 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1087 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1088 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1091 ; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1093 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
1094 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1095 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1096 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1097 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1098 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
1099 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1100 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1101 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1102 ; AVX2-NEXT: vzeroupper
1105 ; AVX512F-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1107 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
1108 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1109 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1110 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1111 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1112 ; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
1113 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1114 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1115 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1116 ; AVX512F-NEXT: vzeroupper
1117 ; AVX512F-NEXT: retq
1119 ; AVX512DQ-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1120 ; AVX512DQ: # %bb.0:
1121 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
1122 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1123 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1124 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
1125 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1126 ; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
1127 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1128 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1129 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
1130 ; AVX512DQ-NEXT: vzeroupper
1131 ; AVX512DQ-NEXT: retq
1133 ; AVX512BW-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16:
1134 ; AVX512BW: # %bb.0:
1135 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1136 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1137 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1138 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
1139 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
1140 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
1141 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1142 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1143 ; AVX512BW-NEXT: vzeroupper
1144 ; AVX512BW-NEXT: retq
1145 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1146 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1147 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1148 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47, i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63>
1149 %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.aextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1150 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1151 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1152 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1156 define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1157 ; SSE2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1159 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1160 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1161 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1162 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1163 ; SSE2-NEXT: paddb (%rsi), %xmm0
1164 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1165 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1166 ; SSE2-NEXT: pand %xmm3, %xmm1
1167 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1168 ; SSE2-NEXT: pand %xmm3, %xmm2
1169 ; SSE2-NEXT: pandn %xmm0, %xmm3
1170 ; SSE2-NEXT: por %xmm3, %xmm1
1171 ; SSE2-NEXT: por %xmm2, %xmm3
1172 ; SSE2-NEXT: paddb 16(%rdx), %xmm3
1173 ; SSE2-NEXT: paddb (%rdx), %xmm1
1174 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
1175 ; SSE2-NEXT: movdqa %xmm3, 16(%rcx)
1178 ; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1180 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1181 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
1182 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
1183 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
1184 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
1185 ; SSE42-NEXT: paddb (%rsi), %xmm0
1186 ; SSE42-NEXT: movdqa %xmm0, %xmm3
1187 ; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0]
1188 ; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
1189 ; SSE42-NEXT: pshufb %xmm1, %xmm3
1190 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1191 ; SSE42-NEXT: pshufb %xmm1, %xmm0
1192 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1193 ; SSE42-NEXT: paddb (%rdx), %xmm3
1194 ; SSE42-NEXT: movdqa %xmm3, (%rcx)
1195 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1198 ; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1200 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1201 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1202 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1203 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1204 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1205 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1206 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1207 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
1208 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1209 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1210 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1211 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1212 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1213 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1214 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1217 ; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1219 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
1220 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1221 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1222 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1223 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
1224 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1225 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1226 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1227 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1228 ; AVX2-NEXT: vzeroupper
1231 ; AVX512F-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1233 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
1234 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1235 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1236 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1237 ; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
1238 ; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
1239 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
1240 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1241 ; AVX512F-NEXT: vzeroupper
1242 ; AVX512F-NEXT: retq
1244 ; AVX512DQ-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1245 ; AVX512DQ: # %bb.0:
1246 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
1247 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1248 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
1249 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1250 ; AVX512DQ-NEXT: vpbroadcastd %xmm1, %ymm1
1251 ; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
1252 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
1253 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
1254 ; AVX512DQ-NEXT: vzeroupper
1255 ; AVX512DQ-NEXT: retq
1257 ; AVX512BW-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8:
1258 ; AVX512BW: # %bb.0:
1259 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1260 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1261 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1262 ; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0
1263 ; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
1264 ; AVX512BW-NEXT: kmovd %eax, %k1
1265 ; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
1266 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
1267 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1268 ; AVX512BW-NEXT: vzeroupper
1269 ; AVX512BW-NEXT: retq
1270 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1271 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1272 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1273 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63>
1274 %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.aextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1275 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1276 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1277 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1278 ret void
1279 }
1281 define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1282 ; SSE2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1284 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1285 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1286 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1287 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1288 ; SSE2-NEXT: paddb (%rsi), %xmm0
1289 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1290 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
1291 ; SSE2-NEXT: pand %xmm3, %xmm1
1292 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1293 ; SSE2-NEXT: pand %xmm3, %xmm2
1294 ; SSE2-NEXT: pandn %xmm0, %xmm3
1295 ; SSE2-NEXT: por %xmm3, %xmm1
1296 ; SSE2-NEXT: por %xmm2, %xmm3
1297 ; SSE2-NEXT: paddb 16(%rdx), %xmm3
1298 ; SSE2-NEXT: paddb (%rdx), %xmm1
1299 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
1300 ; SSE2-NEXT: movdqa %xmm3, 16(%rcx)
1303 ; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1305 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1306 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
1307 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
1308 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
1309 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
1310 ; SSE42-NEXT: paddb (%rsi), %xmm0
1311 ; SSE42-NEXT: movdqa %xmm0, %xmm3
1312 ; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0]
1313 ; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
1314 ; SSE42-NEXT: pshufb %xmm1, %xmm3
1315 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1316 ; SSE42-NEXT: pshufb %xmm1, %xmm0
1317 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1318 ; SSE42-NEXT: paddb (%rdx), %xmm3
1319 ; SSE42-NEXT: movdqa %xmm3, (%rcx)
1320 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1323 ; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1325 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1326 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1327 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1328 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1329 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1330 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1331 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1332 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
1333 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1334 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1335 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1336 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1337 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1338 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1339 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1342 ; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1344 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
1345 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1346 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1347 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1348 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
1349 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
1350 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1351 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1352 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1353 ; AVX2-NEXT: vzeroupper
1356 ; AVX512F-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1358 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
1359 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1360 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1361 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1362 ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
1363 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1
1364 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
1365 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1366 ; AVX512F-NEXT: vzeroupper
1367 ; AVX512F-NEXT: retq
1369 ; AVX512DQ-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1370 ; AVX512DQ: # %bb.0:
1371 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
1372 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1373 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
1374 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1375 ; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
1376 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1
1377 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
1378 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
1379 ; AVX512DQ-NEXT: vzeroupper
1380 ; AVX512DQ-NEXT: retq
1382 ; AVX512BW-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4:
1383 ; AVX512BW: # %bb.0:
1384 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1385 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1386 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1387 ; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm0
1388 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
1389 ; AVX512BW-NEXT: kmovd %eax, %k1
1390 ; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
1391 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
1392 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1393 ; AVX512BW-NEXT: vzeroupper
1394 ; AVX512BW-NEXT: retq
1395 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1396 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1397 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1398 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1399 %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.aextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1400 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1401 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1402 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1403 ret void
1404 }
1406 define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1407 ; SSE2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1409 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1410 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1411 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1412 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1413 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1414 ; SSE2-NEXT: paddb (%rsi), %xmm0
1415 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1416 ; SSE2-NEXT: pand %xmm3, %xmm1
1417 ; SSE2-NEXT: pand %xmm3, %xmm2
1418 ; SSE2-NEXT: pandn %xmm0, %xmm3
1419 ; SSE2-NEXT: por %xmm3, %xmm1
1420 ; SSE2-NEXT: por %xmm3, %xmm2
1421 ; SSE2-NEXT: paddb 16(%rdx), %xmm2
1422 ; SSE2-NEXT: paddb (%rdx), %xmm1
1423 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
1424 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
1427 ; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1429 ; SSE42-NEXT: movdqa (%rdi), %xmm1
1430 ; SSE42-NEXT: movdqa 32(%rdi), %xmm2
1431 ; SSE42-NEXT: movdqa 48(%rdi), %xmm3
1432 ; SSE42-NEXT: paddb 48(%rsi), %xmm3
1433 ; SSE42-NEXT: paddb 32(%rsi), %xmm2
1434 ; SSE42-NEXT: paddb (%rsi), %xmm1
1435 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1436 ; SSE42-NEXT: movdqa %xmm1, %xmm4
1437 ; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm4
1438 ; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1
1439 ; SSE42-NEXT: paddb 16(%rdx), %xmm1
1440 ; SSE42-NEXT: paddb (%rdx), %xmm4
1441 ; SSE42-NEXT: movdqa %xmm4, (%rcx)
1442 ; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
1445 ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1447 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1448 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1449 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1450 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1451 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1452 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1453 ; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551615]
1454 ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm1
1455 ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
1456 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1457 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1458 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1459 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1462 ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1464 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1465 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
1466 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1467 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1468 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1469 ; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615]
1470 ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
1471 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1472 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1473 ; AVX2-NEXT: vzeroupper
1476 ; AVX512F-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1478 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
1479 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
1480 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1481 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1482 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1483 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1484 ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
1485 ; AVX512F-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm2
1486 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1487 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1488 ; AVX512F-NEXT: vzeroupper
1489 ; AVX512F-NEXT: retq
1491 ; AVX512DQ-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1492 ; AVX512DQ: # %bb.0:
1493 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
1494 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
1495 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1496 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1497 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1498 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1499 ; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
1500 ; AVX512DQ-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm2
1501 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1502 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
1503 ; AVX512DQ-NEXT: vzeroupper
1504 ; AVX512DQ-NEXT: retq
1506 ; AVX512BW-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
1507 ; AVX512BW: # %bb.0:
1508 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1509 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1510 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1511 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1512 ; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001
1513 ; AVX512BW-NEXT: kmovd %eax, %k1
1514 ; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
1515 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
1516 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1517 ; AVX512BW-NEXT: vzeroupper
1518 ; AVX512BW-NEXT: retq
1519 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1520 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1521 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1522 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1523 %out.bytevec.padded = shufflevector <32 x i8> %broadcast.of.aextinreg, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1524 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1525 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1526 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1527 ret void
1528 }
1530 define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1531 ; SSE2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1533 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1534 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1535 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1536 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1537 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1538 ; SSE2-NEXT: paddb (%rsi), %xmm0
1539 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1540 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
1541 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
1542 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1543 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1544 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1545 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
1546 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7]
1547 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
1548 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1549 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1550 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1551 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
1552 ; SSE2-NEXT: paddb (%rdx), %xmm3
1553 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
1554 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
1557 ; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1559 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1560 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
1561 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
1562 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
1563 ; SSE42-NEXT: paddb (%rsi), %xmm0
1564 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
1565 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
1566 ; SSE42-NEXT: pshufb %xmm3, %xmm1
1567 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1568 ; SSE42-NEXT: movdqa %xmm0, %xmm4
1569 ; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
1570 ; SSE42-NEXT: pshufb %xmm3, %xmm2
1571 ; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1572 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1573 ; SSE42-NEXT: paddb (%rdx), %xmm4
1574 ; SSE42-NEXT: movdqa %xmm4, (%rcx)
1575 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1578 ; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1580 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1581 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1582 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1583 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1584 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1585 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1586 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
1587 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1588 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1589 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1590 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1591 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1592 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1593 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1594 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1595 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1598 ; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1600 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
1601 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1602 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1603 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1604 ; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
1605 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1606 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1607 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1608 ; AVX2-NEXT: vzeroupper
1611 ; AVX512F-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1613 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
1614 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1615 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1616 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1617 ; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1
1618 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1619 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1620 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1621 ; AVX512F-NEXT: vzeroupper
1622 ; AVX512F-NEXT: retq
1624 ; AVX512DQ-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1625 ; AVX512DQ: # %bb.0:
1626 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
1627 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1628 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
1629 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1630 ; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1
1631 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1632 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1633 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
1634 ; AVX512DQ-NEXT: vzeroupper
1635 ; AVX512DQ-NEXT: retq
1637 ; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
1638 ; AVX512BW: # %bb.0:
1639 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1640 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1641 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
1642 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
1643 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1644 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1645 ; AVX512BW-NEXT: vzeroupper
1646 ; AVX512BW-NEXT: retq
1647 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1648 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1649 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1650 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
1651 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31>
1652 %out.bytevec = bitcast <16 x i16> %broadcast.of.aextinreg to <32 x i8>
1653 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1654 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1655 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1656 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1657 ret void
1658 }
1660 define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1661 ; SSE2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1663 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1664 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1665 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1666 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1667 ; SSE2-NEXT: paddb (%rsi), %xmm0
1668 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1669 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535]
1670 ; SSE2-NEXT: pand %xmm3, %xmm1
1671 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1672 ; SSE2-NEXT: pand %xmm3, %xmm2
1673 ; SSE2-NEXT: pandn %xmm0, %xmm3
1674 ; SSE2-NEXT: por %xmm3, %xmm1
1675 ; SSE2-NEXT: por %xmm2, %xmm3
1676 ; SSE2-NEXT: paddb 16(%rdx), %xmm3
1677 ; SSE2-NEXT: paddb (%rdx), %xmm1
1678 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
1679 ; SSE2-NEXT: movdqa %xmm3, 16(%rcx)
1682 ; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1684 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1685 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
1686 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
1687 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
1688 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
1689 ; SSE42-NEXT: paddb (%rsi), %xmm0
1690 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1691 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
1692 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
1693 ; SSE42-NEXT: paddb 16(%rdx), %xmm2
1694 ; SSE42-NEXT: paddb (%rdx), %xmm1
1695 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
1696 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
1699 ; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1701 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1702 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1703 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1704 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1705 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1706 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1707 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1708 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
1709 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
1710 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1711 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1712 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1713 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1716 ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1718 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
1719 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1720 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1721 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1722 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
1723 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
1724 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1725 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1726 ; AVX2-NEXT: vzeroupper
1729 ; AVX512F-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1731 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
1732 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1733 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1734 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1735 ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
1736 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
1737 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1738 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1739 ; AVX512F-NEXT: vzeroupper
1740 ; AVX512F-NEXT: retq
1742 ; AVX512DQ-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1743 ; AVX512DQ: # %bb.0:
1744 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
1745 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1746 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
1747 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1748 ; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
1749 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
1750 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1751 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
1752 ; AVX512DQ-NEXT: vzeroupper
1753 ; AVX512DQ-NEXT: retq
1755 ; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
1756 ; AVX512BW: # %bb.0:
1757 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1758 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1759 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1760 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
1761 ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
1762 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
1763 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1764 ; AVX512BW-NEXT: vzeroupper
1765 ; AVX512BW-NEXT: retq
1766 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1767 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1768 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1769 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
1770 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31>
1771 %out.bytevec = bitcast <16 x i16> %broadcast.of.aextinreg to <32 x i8>
1772 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1773 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1774 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1775 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1776 ret void
1777 }
1779 define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1780 ; SSE2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1782 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1783 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1784 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1785 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1786 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1787 ; SSE2-NEXT: paddb (%rsi), %xmm0
1788 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535]
1789 ; SSE2-NEXT: pand %xmm3, %xmm1
1790 ; SSE2-NEXT: pand %xmm3, %xmm2
1791 ; SSE2-NEXT: pandn %xmm0, %xmm3
1792 ; SSE2-NEXT: por %xmm3, %xmm1
1793 ; SSE2-NEXT: por %xmm3, %xmm2
1794 ; SSE2-NEXT: paddb 16(%rdx), %xmm2
1795 ; SSE2-NEXT: paddb (%rdx), %xmm1
1796 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
1797 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
1800 ; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1802 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1803 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
1804 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
1805 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
1806 ; SSE42-NEXT: paddb (%rsi), %xmm0
1807 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
1808 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1809 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
1810 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1811 ; SSE42-NEXT: paddb (%rdx), %xmm1
1812 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
1813 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1816 ; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1818 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1819 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1820 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1821 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1822 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1823 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1824 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1825 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
1826 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1827 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1828 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1829 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1832 ; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1834 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1835 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
1836 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1837 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1838 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1839 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1840 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1841 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1842 ; AVX2-NEXT: vzeroupper
1845 ; AVX512F-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1847 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
1848 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
1849 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1850 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1851 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1852 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1853 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1854 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1855 ; AVX512F-NEXT: vzeroupper
1856 ; AVX512F-NEXT: retq
1858 ; AVX512DQ-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1859 ; AVX512DQ: # %bb.0:
1860 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
1861 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
1862 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
1863 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1864 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1865 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
1866 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1867 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
1868 ; AVX512DQ-NEXT: vzeroupper
1869 ; AVX512DQ-NEXT: retq
1871 ; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
1872 ; AVX512BW: # %bb.0:
1873 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1874 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
1875 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1876 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
1877 ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
1878 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
1879 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1880 ; AVX512BW-NEXT: vzeroupper
1881 ; AVX512BW-NEXT: retq
1882 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1883 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1884 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1885 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
1886 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1887 %out.bytevec = bitcast <16 x i16> %broadcast.of.aextinreg to <32 x i8>
1888 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1889 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1890 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1891 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1892 ret void
1893 }
1895 define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1896 ; SSE2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1898 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1899 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
1900 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
1901 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
1902 ; SSE2-NEXT: paddb (%rsi), %xmm0
1903 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
1904 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
1905 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
1906 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1907 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1908 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
1909 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1910 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
1911 ; SSE2-NEXT: paddb (%rdx), %xmm3
1912 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
1913 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
1916 ; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1918 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1919 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
1920 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
1921 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
1922 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
1923 ; SSE42-NEXT: paddb (%rsi), %xmm0
1924 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1925 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1926 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1927 ; SSE42-NEXT: paddb 16(%rdx), %xmm2
1928 ; SSE42-NEXT: paddb (%rdx), %xmm1
1929 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
1930 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
1933 ; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1935 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1936 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1937 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1938 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
1939 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1940 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1941 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1942 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1943 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
1944 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
1945 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
1946 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
1947 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
1948 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
1949 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
1950 ; AVX-NEXT: vzeroupper
1953 ; AVX2-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1955 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
1956 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1957 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1958 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1959 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
1960 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
1961 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1962 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1963 ; AVX2-NEXT: vzeroupper
1966 ; AVX512F-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1967 ; AVX512F-SLOW: # %bb.0:
1968 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0
1969 ; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1970 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1
1971 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1972 ; AVX512F-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
1973 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
1974 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1975 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
1976 ; AVX512F-SLOW-NEXT: vzeroupper
1977 ; AVX512F-SLOW-NEXT: retq
1979 ; AVX512F-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1980 ; AVX512F-FAST: # %bb.0:
1981 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
1982 ; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1983 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1
1984 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1985 ; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15]
1986 ; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
1987 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1988 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
1989 ; AVX512F-FAST-NEXT: vzeroupper
1990 ; AVX512F-FAST-NEXT: retq
1992 ; AVX512DQ-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
1993 ; AVX512DQ-SLOW: # %bb.0:
1994 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0
1995 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
1996 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1
1997 ; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1998 ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
1999 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
2000 ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2001 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
2002 ; AVX512DQ-SLOW-NEXT: vzeroupper
2003 ; AVX512DQ-SLOW-NEXT: retq
2005 ; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
2006 ; AVX512DQ-FAST: # %bb.0:
2007 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
2008 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2009 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1
2010 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1
2011 ; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15]
2012 ; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
2013 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0
2014 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx)
2015 ; AVX512DQ-FAST-NEXT: vzeroupper
2016 ; AVX512DQ-FAST-NEXT: retq
2018 ; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
2019 ; AVX512BW-SLOW: # %bb.0:
2020 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
2021 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2022 ; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2023 ; AVX512BW-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0
2024 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
2025 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2026 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
2027 ; AVX512BW-SLOW-NEXT: vzeroupper
2028 ; AVX512BW-SLOW-NEXT: retq
2030 ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
2031 ; AVX512BW-FAST: # %bb.0:
2032 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
2033 ; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31]
2034 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2035 ; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm0
2036 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2037 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
2038 ; AVX512BW-FAST-NEXT: vzeroupper
2039 ; AVX512BW-FAST-NEXT: retq
2040 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2041 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2042 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2043 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
2044 %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
2045 %out.bytevec = bitcast <8 x i32> %broadcast.of.aextinreg to <32 x i8>
2046 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2047 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2048 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2049 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2050 ret void
2051 }
2053 define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2054 ; SSE2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2056 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2057 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
2058 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
2059 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
2060 ; SSE2-NEXT: paddb (%rsi), %xmm0
2061 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
2062 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2063 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
2064 ; SSE2-NEXT: paddb 16(%rdx), %xmm2
2065 ; SSE2-NEXT: paddb (%rdx), %xmm1
2066 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
2067 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
2070 ; SSE42-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2072 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2073 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
2074 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
2075 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
2076 ; SSE42-NEXT: paddb (%rsi), %xmm0
2077 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
2078 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2079 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
2080 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
2081 ; SSE42-NEXT: paddb (%rdx), %xmm1
2082 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
2083 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
2086 ; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2088 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2089 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
2090 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
2091 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
2092 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
2093 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2094 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2095 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2096 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2097 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
2098 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
2099 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
2100 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
2101 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
2102 ; AVX-NEXT: vzeroupper
2105 ; AVX2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2107 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2108 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
2109 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2110 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2111 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
2112 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2113 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2114 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
2115 ; AVX2-NEXT: vzeroupper
2118 ; AVX512F-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2119 ; AVX512F-SLOW: # %bb.0:
2120 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0
2121 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
2122 ; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2123 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2124 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
2125 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2126 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2127 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
2128 ; AVX512F-SLOW-NEXT: vzeroupper
2129 ; AVX512F-SLOW-NEXT: retq
2131 ; AVX512F-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2132 ; AVX512F-FAST: # %bb.0:
2133 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0
2134 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
2135 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2136 ; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2137 ; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7]
2138 ; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
2139 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0
2140 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
2141 ; AVX512F-FAST-NEXT: vzeroupper
2142 ; AVX512F-FAST-NEXT: retq
2144 ; AVX512DQ-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2145 ; AVX512DQ-SLOW: # %bb.0:
2146 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0
2147 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
2148 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2149 ; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2150 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
2151 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2152 ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2153 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
2154 ; AVX512DQ-SLOW-NEXT: vzeroupper
2155 ; AVX512DQ-SLOW-NEXT: retq
2157 ; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2158 ; AVX512DQ-FAST: # %bb.0:
2159 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0
2160 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
2161 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2162 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2163 ; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7]
2164 ; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
2165 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0
2166 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx)
2167 ; AVX512DQ-FAST-NEXT: vzeroupper
2168 ; AVX512DQ-FAST-NEXT: retq
2170 ; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2171 ; AVX512BW-SLOW: # %bb.0:
2172 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
2173 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2174 ; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2175 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
2176 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2177 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2178 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
2179 ; AVX512BW-SLOW-NEXT: vzeroupper
2180 ; AVX512BW-SLOW-NEXT: retq
2182 ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
2183 ; AVX512BW-FAST: # %bb.0:
2184 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
2185 ; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15]
2186 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2187 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
2188 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2189 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
2190 ; AVX512BW-FAST-NEXT: vzeroupper
2191 ; AVX512BW-FAST-NEXT: retq
2192 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2193 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2194 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2195 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
2196 %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
2197 %out.bytevec = bitcast <8 x i32> %broadcast.of.aextinreg to <32 x i8>
2198 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2199 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2200 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2201 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2202 ret void
2203 }
2205 define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2206 ; SSE2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2208 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2209 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
2210 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
2211 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
2212 ; SSE2-NEXT: paddb (%rsi), %xmm0
2213 ; SSE2-NEXT: paddb 32(%rsi), %xmm1
2214 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2215 ; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
2216 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
2217 ; SSE2-NEXT: paddb (%rdx), %xmm1
2218 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
2219 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
2222 ; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2224 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2225 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1
2226 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
2227 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
2228 ; SSE42-NEXT: paddb 32(%rsi), %xmm1
2229 ; SSE42-NEXT: paddb (%rsi), %xmm0
2230 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2231 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
2232 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
2233 ; SSE42-NEXT: paddb (%rdx), %xmm1
2234 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
2235 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
2238 ; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2240 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2241 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
2242 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
2243 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
2244 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
2245 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2246 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2247 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2248 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2249 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
2250 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
2251 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
2252 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
2253 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
2254 ; AVX-NEXT: vzeroupper
2257 ; AVX2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2259 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
2260 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2261 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
2262 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
2263 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
2264 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2265 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2266 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
2267 ; AVX2-NEXT: vzeroupper
2270 ; AVX512F-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2271 ; AVX512F-SLOW: # %bb.0:
2272 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0
2273 ; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2274 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1
2275 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
2276 ; AVX512F-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
2277 ; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2278 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2279 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
2280 ; AVX512F-SLOW-NEXT: vzeroupper
2281 ; AVX512F-SLOW-NEXT: retq
2283 ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2284 ; AVX512F-FAST: # %bb.0:
2285 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0
2286 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
2287 ; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2288 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2289 ; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7]
2290 ; AVX512F-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
2291 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0
2292 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
2293 ; AVX512F-FAST-NEXT: vzeroupper
2294 ; AVX512F-FAST-NEXT: retq
2296 ; AVX512DQ-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2297 ; AVX512DQ-SLOW: # %bb.0:
2298 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0
2299 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
2300 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1
2301 ; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
2302 ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
2303 ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
2304 ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2305 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
2306 ; AVX512DQ-SLOW-NEXT: vzeroupper
2307 ; AVX512DQ-SLOW-NEXT: retq
2309 ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2310 ; AVX512DQ-FAST: # %bb.0:
2311 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0
2312 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
2313 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
2314 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2315 ; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7]
2316 ; AVX512DQ-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
2317 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0
2318 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx)
2319 ; AVX512DQ-FAST-NEXT: vzeroupper
2320 ; AVX512DQ-FAST-NEXT: retq
2322 ; AVX512BW-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2323 ; AVX512BW-SLOW: # %bb.0:
2324 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
2325 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2326 ; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2327 ; AVX512BW-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0
2328 ; AVX512BW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2329 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2330 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
2331 ; AVX512BW-SLOW-NEXT: vzeroupper
2332 ; AVX512BW-SLOW-NEXT: retq
2334 ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
2335 ; AVX512BW-FAST: # %bb.0:
2336 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
2337 ; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
2338 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2339 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0
2340 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2341 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
2342 ; AVX512BW-FAST-NEXT: vzeroupper
2343 ; AVX512BW-FAST-NEXT: retq
2344 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2345 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2346 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2347 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
2348 %broadcast.of.aextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
2349 %out.bytevec = bitcast <4 x i64> %broadcast.of.aextinreg to <32 x i8>
2350 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2351 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2352 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2353 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2354 ret void
2355 }
2357 define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2358 ; SSE2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2360 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2361 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
2362 ; SSE2-NEXT: paddb (%rsi), %xmm0
2363 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
2364 ; SSE2-NEXT: psrlw $8, %xmm1
2365 ; SSE2-NEXT: packuswb %xmm1, %xmm1
2366 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
2367 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2368 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2369 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2370 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
2371 ; SSE2-NEXT: paddb (%rdx), %xmm0
2372 ; SSE2-NEXT: movdqa 16(%rdx), %xmm2
2373 ; SSE2-NEXT: paddb %xmm1, %xmm2
2374 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
2375 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
2376 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
2377 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
2380 ; SSE42-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2382 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2383 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
2384 ; SSE42-NEXT: paddb (%rsi), %xmm0
2385 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
2386 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2387 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
2388 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2389 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2390 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2391 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
2392 ; SSE42-NEXT: paddb (%rdx), %xmm0
2393 ; SSE42-NEXT: movdqa 16(%rdx), %xmm2
2394 ; SSE42-NEXT: paddb %xmm1, %xmm2
2395 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
2396 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
2397 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
2398 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
2401 ; AVX-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2403 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2404 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
2405 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2406 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2407 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2408 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2409 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
2410 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2411 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2412 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2413 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
2414 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
2415 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
2416 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
2417 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
2418 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
2421 ; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2423 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
2424 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2425 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2426 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2427 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2428 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
2429 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2430 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2431 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2432 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
2433 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
2434 ; AVX2-NEXT: vzeroupper
2437 ; AVX512F-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2439 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2440 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
2441 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2442 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2443 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2444 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
2445 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2446 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2447 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2448 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
2449 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
2450 ; AVX512F-NEXT: vzeroupper
2451 ; AVX512F-NEXT: retq
2453 ; AVX512DQ-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2454 ; AVX512DQ: # %bb.0:
2455 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2456 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
2457 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2458 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2459 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2460 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
2461 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2462 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2463 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2464 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
2465 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
2466 ; AVX512DQ-NEXT: vzeroupper
2467 ; AVX512DQ-NEXT: retq
2469 ; AVX512BW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
2470 ; AVX512BW: # %bb.0:
2471 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2472 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2473 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2474 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2475 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
2476 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
2477 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2478 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
2479 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2480 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2481 ; AVX512BW-NEXT: vzeroupper
2482 ; AVX512BW-NEXT: retq
2483 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2484 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2485 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2486 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63, i32 0, i32 65, i32 0, i32 67, i32 0, i32 69, i32 0, i32 71, i32 0, i32 73, i32 0, i32 75, i32 0, i32 77, i32 0, i32 79, i32 0, i32 81, i32 0, i32 83, i32 0, i32 85, i32 0, i32 87, i32 0, i32 89, i32 0, i32 91, i32 0, i32 93, i32 0, i32 95>
2487 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2488 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2489 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2490 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2491 ret void
2492 }
2494 define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2495 ; SSE2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2497 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2498 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
2499 ; SSE2-NEXT: paddb (%rsi), %xmm0
2500 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
2501 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
2502 ; SSE2-NEXT: pand %xmm2, %xmm1
2503 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2504 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2505 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2506 ; SSE2-NEXT: pandn %xmm0, %xmm2
2507 ; SSE2-NEXT: por %xmm1, %xmm2
2508 ; SSE2-NEXT: paddb (%rdx), %xmm2
2509 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
2510 ; SSE2-NEXT: paddb %xmm0, %xmm1
2511 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
2512 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
2513 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
2514 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
2517 ; SSE42-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2519 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2520 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
2521 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
2522 ; SSE42-NEXT: paddb (%rsi), %xmm0
2523 ; SSE42-NEXT: movdqa %xmm0, %xmm2
2524 ; SSE42-NEXT: palignr {{.*#+}} xmm2 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
2525 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2526 ; SSE42-NEXT: pxor %xmm1, %xmm1
2527 ; SSE42-NEXT: pshufb %xmm1, %xmm0
2528 ; SSE42-NEXT: paddb (%rdx), %xmm2
2529 ; SSE42-NEXT: movdqa 16(%rdx), %xmm1
2530 ; SSE42-NEXT: paddb %xmm0, %xmm1
2531 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
2532 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
2533 ; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
2534 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
2537 ; AVX-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2539 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2540 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
2541 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2542 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2543 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2544 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2545 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
2546 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2547 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
2548 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
2549 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
2550 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
2551 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
2552 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
2555 ; AVX2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2557 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
2558 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2559 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2560 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2561 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2562 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2563 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
2564 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
2565 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2566 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2567 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
2568 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
2569 ; AVX2-NEXT: vzeroupper
2572 ; AVX512F-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2574 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2575 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
2576 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2577 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2578 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2579 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2580 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
2581 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2582 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
2583 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2584 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2585 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
2586 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
2587 ; AVX512F-NEXT: vzeroupper
2588 ; AVX512F-NEXT: retq
2590 ; AVX512DQ-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2591 ; AVX512DQ: # %bb.0:
2592 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2593 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
2594 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2595 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2596 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2597 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2598 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
2599 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2600 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
2601 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2602 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2603 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
2604 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
2605 ; AVX512DQ-NEXT: vzeroupper
2606 ; AVX512DQ-NEXT: retq
2608 ; AVX512BW-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16:
2609 ; AVX512BW: # %bb.0:
2610 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2611 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2612 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2613 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2614 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2615 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15]
2616 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0
2617 ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
2618 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
2619 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2620 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2621 ; AVX512BW-NEXT: vzeroupper
2622 ; AVX512BW-NEXT: retq
2623 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2624 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2625 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2626 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 0, i32 52, i32 53, i32 0, i32 55, i32 56, i32 0, i32 58, i32 59, i32 0, i32 61, i32 62, i32 0, i32 64, i32 65, i32 0, i32 67, i32 68, i32 0, i32 70, i32 71, i32 0, i32 73, i32 74, i32 0, i32 76, i32 77, i32 0, i32 79, i32 80, i32 0, i32 82, i32 83, i32 0, i32 85, i32 86, i32 0, i32 88, i32 89, i32 0, i32 91, i32 92, i32 0, i32 94, i32 95>
2627 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2628 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2629 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2630 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2631 ret void
2632 }
2634 define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2635 ; SSE2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2637 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2638 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
2639 ; SSE2-NEXT: paddb (%rsi), %xmm0
2640 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
2641 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2642 ; SSE2-NEXT: pand %xmm2, %xmm1
2643 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2644 ; SSE2-NEXT: pandn %xmm0, %xmm2
2645 ; SSE2-NEXT: por %xmm1, %xmm2
2646 ; SSE2-NEXT: paddb (%rdx), %xmm2
2647 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
2648 ; SSE2-NEXT: paddb %xmm0, %xmm1
2649 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
2650 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
2651 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
2652 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
2655 ; SSE42-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2657 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2658 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
2659 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
2660 ; SSE42-NEXT: paddb (%rsi), %xmm0
2661 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
2662 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2663 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
2664 ; SSE42-NEXT: paddb (%rdx), %xmm0
2665 ; SSE42-NEXT: movdqa 16(%rdx), %xmm1
2666 ; SSE42-NEXT: paddb %xmm2, %xmm1
2667 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
2668 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
2669 ; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
2670 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
2673 ; AVX-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2675 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2676 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
2677 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2678 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2679 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2680 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14]
2681 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2682 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
2683 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
2684 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
2685 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
2686 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
2687 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
2690 ; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2692 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
2693 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2694 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2695 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2696 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2
2697 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2698 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2699 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
2700 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2701 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2702 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
2703 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
2704 ; AVX2-NEXT: vzeroupper
2707 ; AVX512F-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2709 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2710 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
2711 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2712 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2713 ; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm2
2714 ; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2
2715 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
2716 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2717 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2718 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
2719 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
2720 ; AVX512F-NEXT: vzeroupper
2721 ; AVX512F-NEXT: retq
2723 ; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2724 ; AVX512DQ: # %bb.0:
2725 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2726 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
2727 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2728 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2729 ; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm2
2730 ; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2
2731 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
2732 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2733 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2734 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
2735 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
2736 ; AVX512DQ-NEXT: vzeroupper
2737 ; AVX512DQ-NEXT: retq
2739 ; AVX512BW-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12:
2740 ; AVX512BW: # %bb.0:
2741 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2742 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2743 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2744 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2745 ; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2
2746 ; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
2747 ; AVX512BW-NEXT: kmovd %eax, %k1
2748 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
2749 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
2750 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
2751 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2752 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2753 ; AVX512BW-NEXT: vzeroupper
2754 ; AVX512BW-NEXT: retq
2755 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2756 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2757 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2758 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 0, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 0, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 0, i32 93, i32 94, i32 95>
2759 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2760 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2761 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2762 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2763 ret void
2764 }
2766 define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2767 ; SSE2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2769 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2770 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
2771 ; SSE2-NEXT: paddb (%rsi), %xmm0
2772 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
2773 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
2774 ; SSE2-NEXT: pand %xmm2, %xmm1
2775 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2776 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2777 ; SSE2-NEXT: pandn %xmm0, %xmm2
2778 ; SSE2-NEXT: por %xmm1, %xmm2
2779 ; SSE2-NEXT: paddb (%rdx), %xmm2
2780 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
2781 ; SSE2-NEXT: paddb %xmm0, %xmm1
2782 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
2783 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
2784 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
2785 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
2788 ; SSE42-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2790 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2791 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
2792 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
2793 ; SSE42-NEXT: paddb (%rsi), %xmm0
2794 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
2795 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2796 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2797 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
2798 ; SSE42-NEXT: paddb (%rdx), %xmm0
2799 ; SSE42-NEXT: movdqa 16(%rdx), %xmm2
2800 ; SSE42-NEXT: paddb %xmm1, %xmm2
2801 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
2802 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
2803 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
2804 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
2807 ; AVX-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2809 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2810 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
2811 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2812 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2813 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2814 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2815 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2816 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2817 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
2818 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
2819 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
2820 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
2821 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
2822 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
2825 ; AVX2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2827 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
2828 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2829 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2830 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2831 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2832 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2833 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
2834 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
2835 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2836 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2837 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
2838 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
2839 ; AVX2-NEXT: vzeroupper
2842 ; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2844 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2845 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
2846 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2847 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2848 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2849 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2850 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
2851 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2852 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
2853 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2854 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2855 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
2856 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
2857 ; AVX512F-NEXT: vzeroupper
2858 ; AVX512F-NEXT: retq
2860 ; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2861 ; AVX512DQ: # %bb.0:
2862 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2863 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
2864 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2865 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2866 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2867 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2868 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
2869 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2870 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
2871 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2872 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2873 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
2874 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
2875 ; AVX512DQ-NEXT: vzeroupper
2876 ; AVX512DQ-NEXT: retq
2878 ; AVX512BW-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8:
2879 ; AVX512BW: # %bb.0:
2880 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2881 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
2882 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2883 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2884 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2885 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
2886 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
2887 ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
2888 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
2889 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2890 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2891 ; AVX512BW-NEXT: vzeroupper
2892 ; AVX512BW-NEXT: retq
2893 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2894 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2895 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2896 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 0, i32 55, i32 56, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 64, i32 65, i32 0, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 0, i32 79, i32 80, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 88, i32 89, i32 0, i32 91, i32 92, i32 93, i32 94, i32 95>
2897 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2898 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2899 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2900 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
2901 ret void
2902 }
2904 define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2905 ; SSE2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2907 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2908 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
2909 ; SSE2-NEXT: paddb (%rsi), %xmm0
2910 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
2911 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2912 ; SSE2-NEXT: pand %xmm2, %xmm1
2913 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
2914 ; SSE2-NEXT: pandn %xmm0, %xmm2
2915 ; SSE2-NEXT: por %xmm1, %xmm2
2916 ; SSE2-NEXT: paddb (%rdx), %xmm2
2917 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
2918 ; SSE2-NEXT: paddb %xmm0, %xmm1
2919 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
2920 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
2921 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
2922 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
2925 ; SSE42-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2927 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2928 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
2929 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
2930 ; SSE42-NEXT: paddb (%rsi), %xmm0
2931 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
2932 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2933 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
2934 ; SSE42-NEXT: paddb (%rdx), %xmm0
2935 ; SSE42-NEXT: movdqa 16(%rdx), %xmm1
2936 ; SSE42-NEXT: paddb %xmm2, %xmm1
2937 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
2938 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
2939 ; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
2940 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
2943 ; AVX-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2945 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2946 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
2947 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2948 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2949 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
2950 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14]
2951 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
2952 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
2953 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
2954 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
2955 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
2956 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
2957 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
2960 ; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2962 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
2963 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
2964 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2965 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2966 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2
2967 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2968 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2969 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
2970 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2971 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2972 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
2973 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
2974 ; AVX2-NEXT: vzeroupper
2977 ; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2979 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2980 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
2981 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2982 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2983 ; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
2984 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm2
2985 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
2986 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2987 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2988 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
2989 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
2990 ; AVX512F-NEXT: vzeroupper
2991 ; AVX512F-NEXT: retq
2993 ; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
2994 ; AVX512DQ: # %bb.0:
2995 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2996 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
2997 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
2998 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2999 ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
3000 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm2
3001 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
3002 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
3003 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3004 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
3005 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
3006 ; AVX512DQ-NEXT: vzeroupper
3007 ; AVX512DQ-NEXT: retq
3009 ; AVX512BW-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6:
3010 ; AVX512BW: # %bb.0:
3011 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3012 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3013 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3014 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3015 ; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2
3016 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
3017 ; AVX512BW-NEXT: kmovd %eax, %k1
3018 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
3019 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
3020 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
3021 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3022 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3023 ; AVX512BW-NEXT: vzeroupper
3024 ; AVX512BW-NEXT: retq
3025 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3026 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3027 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3028 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3029 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3030 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3031 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3032 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3033 ret void
3034 }
3036 define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3037 ; SSE2-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3039 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3040 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
3041 ; SSE2-NEXT: paddb (%rsi), %xmm0
3042 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
3043 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
3044 ; SSE2-NEXT: pand %xmm2, %xmm1
3045 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3046 ; SSE2-NEXT: pandn %xmm3, %xmm2
3047 ; SSE2-NEXT: por %xmm1, %xmm2
3048 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
3049 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3050 ; SSE2-NEXT: paddb (%rdx), %xmm2
3051 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
3052 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
3053 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
3054 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
3055 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
3058 ; SSE42-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3060 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3061 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
3062 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
3063 ; SSE42-NEXT: paddb (%rsi), %xmm0
3064 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
3065 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
3066 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3067 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3068 ; SSE42-NEXT: paddb (%rdx), %xmm0
3069 ; SSE42-NEXT: paddb 16(%rdx), %xmm3
3070 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
3071 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
3072 ; SSE42-NEXT: movdqa %xmm3, 16(%rcx)
3073 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
3076 ; AVX-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3078 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3079 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
3080 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3081 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3082 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3083 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3084 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
3085 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
3086 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3087 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
3088 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
3089 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
3090 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
3091 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3094 ; AVX2-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3096 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
3097 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
3098 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3099 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3100 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3101 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3102 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
3103 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3104 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3105 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3106 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
3107 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
3108 ; AVX2-NEXT: vzeroupper
3111 ; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3113 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
3114 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
3115 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3116 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3117 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3118 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3119 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
3120 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
3121 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3122 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3123 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3124 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
3125 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
3126 ; AVX512F-NEXT: vzeroupper
3127 ; AVX512F-NEXT: retq
3129 ; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3130 ; AVX512DQ: # %bb.0:
3131 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
3132 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
3133 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3134 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3135 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3136 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3137 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
3138 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
3139 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3140 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3141 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3142 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
3143 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
3144 ; AVX512DQ-NEXT: vzeroupper
3145 ; AVX512DQ-NEXT: retq
3147 ; AVX512BW-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4:
3148 ; AVX512BW: # %bb.0:
3149 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3150 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3151 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3152 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3153 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
3154 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
3155 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
3156 ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3157 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
3158 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3159 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3160 ; AVX512BW-NEXT: vzeroupper
3161 ; AVX512BW-NEXT: retq
3162 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3163 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3164 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3165 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3166 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3167 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3168 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3169 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3170 ret void
3171 }
3173 define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3174 ; SSE2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3176 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3177 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
3178 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2
3179 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
3180 ; SSE2-NEXT: paddb 48(%rsi), %xmm2
3181 ; SSE2-NEXT: paddb (%rsi), %xmm0
3182 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3183 ; SSE2-NEXT: pand %xmm3, %xmm2
3184 ; SSE2-NEXT: pandn %xmm0, %xmm3
3185 ; SSE2-NEXT: por %xmm2, %xmm3
3186 ; SSE2-NEXT: paddb (%rdx), %xmm3
3187 ; SSE2-NEXT: movdqa 16(%rdx), %xmm2
3188 ; SSE2-NEXT: paddb %xmm0, %xmm2
3189 ; SSE2-NEXT: paddb 48(%rdx), %xmm1
3190 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
3191 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
3192 ; SSE2-NEXT: movdqa %xmm1, 48(%rcx)
3193 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
3194 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
3197 ; SSE42-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3199 ; SSE42-NEXT: movdqa (%rdi), %xmm1
3200 ; SSE42-NEXT: movdqa 16(%rdi), %xmm2
3201 ; SSE42-NEXT: movdqa 48(%rdi), %xmm3
3202 ; SSE42-NEXT: paddb 16(%rsi), %xmm2
3203 ; SSE42-NEXT: paddb 48(%rsi), %xmm3
3204 ; SSE42-NEXT: paddb (%rsi), %xmm1
3205 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3206 ; SSE42-NEXT: movdqa %xmm1, %xmm4
3207 ; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm4
3208 ; SSE42-NEXT: paddb (%rdx), %xmm4
3209 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0
3210 ; SSE42-NEXT: paddb %xmm1, %xmm0
3211 ; SSE42-NEXT: paddb 48(%rdx), %xmm2
3212 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
3213 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
3214 ; SSE42-NEXT: movdqa %xmm2, 48(%rcx)
3215 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
3216 ; SSE42-NEXT: movdqa %xmm4, (%rcx)
3219 ; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3221 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3222 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
3223 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
3224 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
3225 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
3226 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3227 ; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551615]
3228 ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm2
3229 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
3230 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
3231 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
3232 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
3233 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
3234 ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
3235 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
3236 ; AVX-NEXT: vmovdqa %xmm2, (%rcx)
3239 ; AVX2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3241 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
3242 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3243 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
3244 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3245 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3246 ; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm3 = [18446744073709551360,18446744073709551615,0,0]
3247 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
3248 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3249 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3250 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
3251 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
3252 ; AVX2-NEXT: vzeroupper
3255 ; AVX512F-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3257 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
3258 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3259 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
3260 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3261 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3262 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3263 ; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
3264 ; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm3
3265 ; AVX512F-NEXT: vpaddb (%rdx), %ymm3, %ymm1
3266 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3267 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
3268 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
3269 ; AVX512F-NEXT: vzeroupper
3270 ; AVX512F-NEXT: retq
3272 ; AVX512DQ-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3273 ; AVX512DQ: # %bb.0:
3274 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
3275 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3276 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
3277 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3278 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3279 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3280 ; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
3281 ; AVX512DQ-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm3
3282 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm3, %ymm1
3283 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3284 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
3285 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
3286 ; AVX512DQ-NEXT: vzeroupper
3287 ; AVX512DQ-NEXT: retq
3289 ; AVX512BW-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
3290 ; AVX512BW: # %bb.0:
3291 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3292 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3293 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3294 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3295 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3296 ; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001
3297 ; AVX512BW-NEXT: kmovd %eax, %k1
3298 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
3299 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
3300 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3301 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3302 ; AVX512BW-NEXT: vzeroupper
3303 ; AVX512BW-NEXT: retq
3304 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3305 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3306 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3307 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3308 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3309 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3310 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3311 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3312 ret void
3313 }
3315 define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3316 ; SSE2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3318 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3319 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
3320 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
3321 ; SSE2-NEXT: paddb (%rsi), %xmm0
3322 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3323 ; SSE2-NEXT: pand %xmm2, %xmm1
3324 ; SSE2-NEXT: pandn %xmm0, %xmm2
3325 ; SSE2-NEXT: por %xmm1, %xmm2
3326 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3327 ; SSE2-NEXT: paddb (%rdx), %xmm2
3328 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
3329 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
3330 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
3333 ; SSE42-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3335 ; SSE42-NEXT: movdqa (%rdi), %xmm1
3336 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2
3337 ; SSE42-NEXT: paddb 48(%rsi), %xmm2
3338 ; SSE42-NEXT: paddb (%rsi), %xmm1
3339 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3340 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
3341 ; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1
3342 ; SSE42-NEXT: paddb (%rdx), %xmm1
3343 ; SSE42-NEXT: paddb 16(%rdx), %xmm3
3344 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
3345 ; SSE42-NEXT: movdqa %xmm3, 16(%rcx)
3348 ; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3350 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3351 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
3352 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3353 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3354 ; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
3355 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1
3356 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3357 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3358 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
3359 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3360 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
3363 ; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3365 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
3366 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
3367 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3368 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3369 ; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
3370 ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1
3371 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
3372 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3373 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3374 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
3375 ; AVX2-NEXT: vzeroupper
3378 ; AVX512F-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3380 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
3381 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
3382 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3383 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3384 ; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
3385 ; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm0
3386 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3387 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3388 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
3389 ; AVX512F-NEXT: vzeroupper
3390 ; AVX512F-NEXT: retq
3392 ; AVX512DQ-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3393 ; AVX512DQ: # %bb.0:
3394 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
3395 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
3396 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3397 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3398 ; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
3399 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm0
3400 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3401 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3402 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
3403 ; AVX512DQ-NEXT: vzeroupper
3404 ; AVX512DQ-NEXT: retq
3406 ; AVX512BW-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2:
3407 ; AVX512BW: # %bb.0:
3408 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3409 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3410 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3411 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3412 ; AVX512BW-NEXT: movw $1, %ax
3413 ; AVX512BW-NEXT: kmovd %eax, %k1
3414 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
3415 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %xmm0
3416 ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
3417 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3418 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3419 ; AVX512BW-NEXT: vzeroupper
3420 ; AVX512BW-NEXT: retq
3421 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3422 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3423 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3424 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3425 %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.aextinreg, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3426 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3427 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3428 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3429 ret void
3430 }
3432 define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3433 ; SSE2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3435 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3436 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
3437 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
3438 ; SSE2-NEXT: paddb (%rsi), %xmm0
3439 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
3440 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
3441 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
3442 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3443 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
3444 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3445 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3446 ; SSE2-NEXT: paddb (%rdx), %xmm2
3447 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
3448 ; SSE2-NEXT: paddb %xmm0, %xmm1
3449 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
3450 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
3451 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
3452 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
3455 ; SSE42-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3457 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3458 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
3459 ; SSE42-NEXT: paddb (%rsi), %xmm0
3460 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
3461 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
3462 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
3463 ; SSE42-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3464 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3465 ; SSE42-NEXT: paddb (%rdx), %xmm2
3466 ; SSE42-NEXT: movdqa 16(%rdx), %xmm1
3467 ; SSE42-NEXT: paddb %xmm0, %xmm1
3468 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
3469 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
3470 ; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
3471 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
3474 ; AVX-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3476 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3477 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
3478 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3479 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3480 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
3481 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
3482 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3483 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3484 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
3485 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm1
3486 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
3487 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
3488 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
3489 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
3492 ; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3494 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
3495 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
3496 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3497 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3498 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
3499 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3500 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3501 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3502 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
3503 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
3504 ; AVX2-NEXT: vzeroupper
3507 ; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3509 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
3510 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
3511 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3512 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3513 ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
3514 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3515 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3516 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3517 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
3518 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
3519 ; AVX512F-NEXT: vzeroupper
3520 ; AVX512F-NEXT: retq
3522 ; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3523 ; AVX512DQ: # %bb.0:
3524 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
3525 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
3526 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3527 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3528 ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
3529 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3530 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3531 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3532 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
3533 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
3534 ; AVX512DQ-NEXT: vzeroupper
3535 ; AVX512DQ-NEXT: retq
3537 ; AVX512BW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
3538 ; AVX512BW: # %bb.0:
3539 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3540 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3541 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,0,27,0,29,0,31]
3542 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1
3543 ; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0
3544 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
3545 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3546 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3547 ; AVX512BW-NEXT: vzeroupper
3548 ; AVX512BW-NEXT: retq
3549 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3550 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3551 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3552 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
3553 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31, i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47>
3554 %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8>
3555 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3556 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3557 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3558 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3559 ret void
3560 }
3562 define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3563 ; SSE2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3565 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3566 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
3567 ; SSE2-NEXT: paddb (%rsi), %xmm0
3568 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
3569 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535]
3570 ; SSE2-NEXT: pand %xmm2, %xmm1
3571 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3572 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3573 ; SSE2-NEXT: pandn %xmm0, %xmm2
3574 ; SSE2-NEXT: por %xmm1, %xmm2
3575 ; SSE2-NEXT: paddb (%rdx), %xmm2
3576 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
3577 ; SSE2-NEXT: paddb %xmm0, %xmm1
3578 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
3579 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
3580 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
3581 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
3584 ; SSE42-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3586 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3587 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
3588 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
3589 ; SSE42-NEXT: paddb (%rsi), %xmm0
3590 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3591 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3592 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
3593 ; SSE42-NEXT: paddb (%rdx), %xmm1
3594 ; SSE42-NEXT: movdqa 16(%rdx), %xmm2
3595 ; SSE42-NEXT: paddb %xmm0, %xmm2
3596 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
3597 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
3598 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
3599 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
3602 ; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3604 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3605 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
3606 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3607 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3608 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
3609 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3610 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
3611 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3612 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
3613 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
3614 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
3615 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
3616 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3619 ; AVX2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3621 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
3622 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
3623 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3624 ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
3625 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3626 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
3627 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3628 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3629 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3630 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
3631 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
3632 ; AVX2-NEXT: vzeroupper
3635 ; AVX512F-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3637 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
3638 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
3639 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3640 ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
3641 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
3642 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3643 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
3644 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3645 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3646 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3647 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
3648 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
3649 ; AVX512F-NEXT: vzeroupper
3650 ; AVX512F-NEXT: retq
3652 ; AVX512DQ-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3653 ; AVX512DQ: # %bb.0:
3654 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
3655 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
3656 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3657 ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
3658 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
3659 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3660 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
3661 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3662 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3663 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3664 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
3665 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
3666 ; AVX512DQ-NEXT: vzeroupper
3667 ; AVX512DQ-NEXT: retq
3669 ; AVX512BW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
3670 ; AVX512BW: # %bb.0:
3671 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3672 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3673 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,0,28,29,0,31]
3674 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1
3675 ; AVX512BW-NEXT: vpbroadcastw %xmm0, %xmm0
3676 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3677 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3678 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3679 ; AVX512BW-NEXT: vzeroupper
3680 ; AVX512BW-NEXT: retq
3681 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3682 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3683 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3684 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
3685 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 26, i32 0, i32 28, i32 29, i32 0, i32 31, i32 32, i32 0, i32 34, i32 35, i32 0, i32 37, i32 38, i32 0, i32 40, i32 41, i32 0, i32 43, i32 44, i32 0, i32 46, i32 47>
3686 %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8>
3687 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3688 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3689 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3690 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3691 ret void
3692 }
3694 define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3695 ; SSE2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3697 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3698 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
3699 ; SSE2-NEXT: paddb (%rsi), %xmm0
3700 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
3701 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
3702 ; SSE2-NEXT: pand %xmm2, %xmm1
3703 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3704 ; SSE2-NEXT: pandn %xmm0, %xmm2
3705 ; SSE2-NEXT: por %xmm1, %xmm2
3706 ; SSE2-NEXT: paddb (%rdx), %xmm2
3707 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
3708 ; SSE2-NEXT: paddb %xmm0, %xmm1
3709 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
3710 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
3711 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
3712 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
3715 ; SSE42-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3717 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3718 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
3719 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
3720 ; SSE42-NEXT: paddb (%rsi), %xmm0
3721 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3722 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
3723 ; SSE42-NEXT: paddb (%rdx), %xmm1
3724 ; SSE42-NEXT: movdqa 16(%rdx), %xmm2
3725 ; SSE42-NEXT: paddb %xmm0, %xmm2
3726 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
3727 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
3728 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
3729 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
3732 ; AVX-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3734 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3735 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
3736 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3737 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3738 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3739 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
3740 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3741 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
3742 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
3743 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
3744 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
3745 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3748 ; AVX2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3750 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
3751 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
3752 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3753 ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2
3754 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3755 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
3756 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
3757 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3758 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
3759 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
3760 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
3761 ; AVX2-NEXT: vzeroupper
3764 ; AVX512F-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3766 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
3767 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
3768 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3769 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3770 ; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
3771 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
3772 ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
3773 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3774 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3775 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
3776 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
3777 ; AVX512F-NEXT: vzeroupper
3778 ; AVX512F-NEXT: retq
3780 ; AVX512DQ-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3781 ; AVX512DQ: # %bb.0:
3782 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
3783 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
3784 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3785 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3786 ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
3787 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
3788 ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
3789 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3790 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3791 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
3792 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
3793 ; AVX512DQ-NEXT: vzeroupper
3794 ; AVX512DQ-NEXT: retq
3796 ; AVX512BW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
3797 ; AVX512BW: # %bb.0:
3798 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3799 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3800 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3801 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,16,13,14,15,16,9,10,11,16,13,14,15]
3802 ; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
3803 ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
3804 ; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0
3805 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
3806 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3807 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3808 ; AVX512BW-NEXT: vzeroupper
3809 ; AVX512BW-NEXT: retq
3810 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3811 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3812 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3813 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
3814 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31, i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47>
3815 %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8>
3816 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3817 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3818 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3819 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3820 ret void
3821 }
3823 define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3824 ; SSE2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3826 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3827 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
3828 ; SSE2-NEXT: paddb (%rsi), %xmm0
3829 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
3830 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535]
3831 ; SSE2-NEXT: pand %xmm2, %xmm1
3832 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3833 ; SSE2-NEXT: pandn %xmm3, %xmm2
3834 ; SSE2-NEXT: por %xmm1, %xmm2
3835 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
3836 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3837 ; SSE2-NEXT: paddb (%rdx), %xmm2
3838 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
3839 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
3840 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
3841 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
3842 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
3845 ; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3847 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3848 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
3849 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
3850 ; SSE42-NEXT: paddb (%rsi), %xmm0
3851 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
3852 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
3853 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
3854 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3855 ; SSE42-NEXT: paddb (%rdx), %xmm2
3856 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
3857 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
3858 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
3859 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
3860 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
3863 ; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3865 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3866 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
3867 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3868 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3869 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
3870 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
3871 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5],xmm3[6],xmm1[7]
3872 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3873 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3874 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
3875 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
3876 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
3877 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
3878 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3881 ; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3883 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
3884 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
3885 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3886 ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
3887 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3888 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
3889 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3890 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3891 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3892 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
3893 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
3894 ; AVX2-NEXT: vzeroupper
3897 ; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3899 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
3900 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
3901 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3902 ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
3903 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
3904 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3905 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
3906 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3907 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3908 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3909 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
3910 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
3911 ; AVX512F-NEXT: vzeroupper
3912 ; AVX512F-NEXT: retq
3914 ; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3915 ; AVX512DQ: # %bb.0:
3916 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
3917 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
3918 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3919 ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
3920 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
3921 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3922 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
3923 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
3924 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3925 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3926 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
3927 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
3928 ; AVX512DQ-NEXT: vzeroupper
3929 ; AVX512DQ-NEXT: retq
3931 ; AVX512BW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
3932 ; AVX512BW: # %bb.0:
3933 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3934 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3935 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3936 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,16,15,0,0,0,0,16,0,0,0]
3937 ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
3938 ; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0
3939 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
3940 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3941 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3942 ; AVX512BW-NEXT: vzeroupper
3943 ; AVX512BW-NEXT: retq
3944 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3945 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3946 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3947 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
3948 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 0, i32 31, i32 32, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 40, i32 41, i32 0, i32 43, i32 44, i32 45, i32 46, i32 47>
3949 %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8>
3950 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3951 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3952 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3953 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3954 ret void
3955 }
3957 define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3958 ; SSE2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
3960 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3961 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
3962 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
3963 ; SSE2-NEXT: paddb (%rsi), %xmm0
3964 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
3965 ; SSE2-NEXT: pand %xmm2, %xmm1
3966 ; SSE2-NEXT: pandn %xmm0, %xmm2
3967 ; SSE2-NEXT: por %xmm1, %xmm2
3968 ; SSE2-NEXT: paddb (%rdx), %xmm2
3969 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
3970 ; SSE2-NEXT: paddb %xmm0, %xmm1
3971 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
3972 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
3973 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
3974 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
3977 ; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
3979 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3980 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
3981 ; SSE42-NEXT: paddb (%rsi), %xmm0
3982 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
3983 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
3984 ; SSE42-NEXT: paddb (%rdx), %xmm1
3985 ; SSE42-NEXT: movdqa 16(%rdx), %xmm2
3986 ; SSE42-NEXT: paddb %xmm0, %xmm2
3987 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
3988 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
3989 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
3990 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
3993 ; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
3995 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3996 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
3997 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
3998 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
3999 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4000 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
4001 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
4002 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
4003 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
4004 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
4005 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
4006 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
4007 ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
4008 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
4009 ; AVX-NEXT: vmovdqa %xmm2, (%rcx)
4012 ; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
4014 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
4015 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4016 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
4017 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4018 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
4019 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
4020 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4021 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4022 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
4023 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
4024 ; AVX2-NEXT: vzeroupper
4027 ; AVX512F-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
4029 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
4030 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4031 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
4032 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4033 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
4034 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
4035 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4036 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4037 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
4038 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
4039 ; AVX512F-NEXT: vzeroupper
4040 ; AVX512F-NEXT: retq
4042 ; AVX512DQ-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
4043 ; AVX512DQ: # %bb.0:
4044 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
4045 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4046 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
4047 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4048 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
4049 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
4050 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4051 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4052 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
4053 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
4054 ; AVX512DQ-NEXT: vzeroupper
4055 ; AVX512DQ-NEXT: retq
4057 ; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
4058 ; AVX512BW: # %bb.0:
4059 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
4060 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4061 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4062 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,16,9,10,11,12,13,14,15]
4063 ; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
4064 ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
4065 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
4066 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4067 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
4068 ; AVX512BW-NEXT: vzeroupper
4069 ; AVX512BW-NEXT: retq
4070 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4071 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4072 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4073 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
4074 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4075 %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8>
4076 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4077 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4078 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4079 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4080 ret void
4081 }
4083 define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4084 ; SSE2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4086 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4087 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
4088 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
4089 ; SSE2-NEXT: paddb (%rsi), %xmm0
4090 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
4091 ; SSE2-NEXT: pand %xmm2, %xmm1
4092 ; SSE2-NEXT: pandn %xmm0, %xmm2
4093 ; SSE2-NEXT: por %xmm1, %xmm2
4094 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4095 ; SSE2-NEXT: paddb (%rdx), %xmm2
4096 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
4097 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
4098 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
4101 ; SSE42-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4103 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4104 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
4105 ; SSE42-NEXT: paddb (%rsi), %xmm0
4106 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
4107 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4108 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4109 ; SSE42-NEXT: paddb (%rdx), %xmm1
4110 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
4111 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
4112 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
4115 ; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4117 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4118 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
4119 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4120 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4121 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4122 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4123 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
4124 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
4125 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
4126 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
4129 ; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4131 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
4132 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
4133 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4134 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4135 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4136 ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
4137 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
4138 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4139 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
4140 ; AVX2-NEXT: vzeroupper
4143 ; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4145 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
4146 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
4147 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4148 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4149 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4150 ; AVX512F-NEXT: vpbroadcastw %xmm0, %xmm0
4151 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
4152 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4153 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
4154 ; AVX512F-NEXT: vzeroupper
4155 ; AVX512F-NEXT: retq
4157 ; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4158 ; AVX512DQ: # %bb.0:
4159 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
4160 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
4161 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4162 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4163 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4164 ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %xmm0
4165 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
4166 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4167 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
4168 ; AVX512DQ-NEXT: vzeroupper
4169 ; AVX512DQ-NEXT: retq
4171 ; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
4172 ; AVX512BW: # %bb.0:
4173 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
4174 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4175 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4176 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,0,0,0,0,16,0,0,0]
4177 ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
4178 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
4179 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
4180 ; AVX512BW-NEXT: vzeroupper
4181 ; AVX512BW-NEXT: retq
4182 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4183 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4184 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4185 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
4186 %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4187 %out.bytevec = bitcast <24 x i16> %broadcast.of.aextinreg to <48 x i8>
4188 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4189 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4190 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4191 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4192 ret void
4193 }
4195 define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4196 ; SSE2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4198 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4199 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
4200 ; SSE2-NEXT: paddb (%rsi), %xmm0
4201 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
4202 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
4203 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
4204 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4205 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4206 ; SSE2-NEXT: paddb (%rdx), %xmm2
4207 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1
4208 ; SSE2-NEXT: paddb %xmm0, %xmm1
4209 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
4210 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
4211 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
4212 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
4215 ; SSE42-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4217 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4218 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
4219 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
4220 ; SSE42-NEXT: paddb (%rsi), %xmm0
4221 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4222 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
4223 ; SSE42-NEXT: paddb (%rdx), %xmm1
4224 ; SSE42-NEXT: movdqa 16(%rdx), %xmm2
4225 ; SSE42-NEXT: paddb %xmm0, %xmm2
4226 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
4227 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
4228 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
4229 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
4232 ; AVX-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4234 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4235 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
4236 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4237 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4238 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
4239 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4240 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
4241 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
4242 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
4243 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
4244 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
4245 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
4246 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
4247 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
4248 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
4249 ; AVX-NEXT: vzeroupper
4252 ; AVX2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4254 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
4255 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
4256 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4257 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm2
4258 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4259 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
4260 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7]
4261 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4262 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
4263 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
4264 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
4265 ; AVX2-NEXT: vzeroupper
4268 ; AVX512F-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4270 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
4271 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
4272 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4273 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4274 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4275 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
4276 ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0
4277 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4278 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4279 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4280 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
4281 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
4282 ; AVX512F-NEXT: vzeroupper
4283 ; AVX512F-NEXT: retq
4285 ; AVX512DQ-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4286 ; AVX512DQ: # %bb.0:
4287 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
4288 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
4289 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4290 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4291 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4292 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
4293 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
4294 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4295 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4296 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4297 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
4298 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
4299 ; AVX512DQ-NEXT: vzeroupper
4300 ; AVX512DQ-NEXT: retq
4302 ; AVX512BW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
4303 ; AVX512BW: # %bb.0:
4304 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
4305 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4306 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
4307 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
4308 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4309 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
4310 ; AVX512BW-NEXT: vzeroupper
4311 ; AVX512BW-NEXT: retq
4312 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4313 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4314 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4315 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
4316 %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <12 x i32> <i32 0, i32 13, i32 0, i32 15, i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23>
4317 %out.bytevec = bitcast <12 x i32> %broadcast.of.aextinreg to <48 x i8>
4318 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4319 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4320 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4321 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4322 ret void
4323 }
4325 define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4326 ; SSE2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4328 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4329 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
4330 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
4331 ; SSE2-NEXT: paddb (%rsi), %xmm0
4332 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
4333 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
4334 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,2]
4335 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
4336 ; SSE2-NEXT: paddb (%rdx), %xmm0
4337 ; SSE2-NEXT: paddb 16(%rdx), %xmm3
4338 ; SSE2-NEXT: paddb 32(%rdx), %xmm2
4339 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
4340 ; SSE2-NEXT: movdqa %xmm3, 16(%rcx)
4341 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
4344 ; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4346 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4347 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
4348 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
4349 ; SSE42-NEXT: paddb (%rsi), %xmm0
4350 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
4351 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7]
4352 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
4353 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4354 ; SSE42-NEXT: paddb (%rdx), %xmm2
4355 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
4356 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
4357 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
4358 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
4359 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
4362 ; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4364 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4365 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
4366 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4367 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4368 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
4369 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
4370 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4371 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4372 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
4373 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
4374 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
4375 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
4376 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
4377 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
4380 ; AVX2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4382 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
4383 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
4384 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4385 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4386 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
4387 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4388 ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0]
4389 ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
4390 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4391 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
4392 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
4393 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
4394 ; AVX2-NEXT: vzeroupper
4397 ; AVX512F-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4399 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
4400 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
4401 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4402 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4403 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4404 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
4405 ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0
4406 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4407 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4408 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4409 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
4410 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
4411 ; AVX512F-NEXT: vzeroupper
4412 ; AVX512F-NEXT: retq
4414 ; AVX512DQ-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4415 ; AVX512DQ: # %bb.0:
4416 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
4417 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
4418 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4419 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4420 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4421 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
4422 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
4423 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4424 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4425 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4426 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
4427 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
4428 ; AVX512DQ-NEXT: vzeroupper
4429 ; AVX512DQ-NEXT: retq
4431 ; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4432 ; AVX512BW-SLOW: # %bb.0:
4433 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
4434 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4435 ; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
4436 ; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
4437 ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
4438 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4439 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4440 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
4441 ; AVX512BW-SLOW-NEXT: vzeroupper
4442 ; AVX512BW-SLOW-NEXT: retq
4444 ; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
4445 ; AVX512BW-FAST: # %bb.0:
4446 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
4447 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4448 ; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
4449 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
4450 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4451 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4452 ; AVX512BW-FAST-NEXT: vzeroupper
4453 ; AVX512BW-FAST-NEXT: retq
4454 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4455 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4456 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4457 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
4458 %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <12 x i32> <i32 0, i32 13, i32 14, i32 0, i32 16, i32 17, i32 0, i32 19, i32 20, i32 0, i32 22, i32 23>
4459 %out.bytevec = bitcast <12 x i32> %broadcast.of.aextinreg to <48 x i8>
4460 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4461 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4462 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4463 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4464 ret void
4465 }
4467 define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4468 ; SSE2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4470 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4471 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
4472 ; SSE2-NEXT: paddb (%rsi), %xmm0
4473 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
4474 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
4475 ; SSE2-NEXT: paddb (%rdx), %xmm1
4476 ; SSE2-NEXT: movdqa 16(%rdx), %xmm2
4477 ; SSE2-NEXT: paddb %xmm0, %xmm2
4478 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
4479 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
4480 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
4481 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
4484 ; SSE42-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4486 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4487 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
4488 ; SSE42-NEXT: paddb (%rsi), %xmm0
4489 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
4490 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
4491 ; SSE42-NEXT: paddb (%rdx), %xmm1
4492 ; SSE42-NEXT: movdqa 16(%rdx), %xmm2
4493 ; SSE42-NEXT: paddb %xmm0, %xmm2
4494 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
4495 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
4496 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
4497 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
4500 ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4502 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4503 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
4504 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
4505 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
4506 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4507 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
4508 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
4509 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
4510 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
4511 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
4512 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
4513 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
4514 ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
4515 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
4516 ; AVX-NEXT: vmovdqa %xmm2, (%rcx)
4519 ; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4521 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
4522 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4523 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
4524 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4525 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
4526 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
4527 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4528 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4529 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
4530 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
4531 ; AVX2-NEXT: vzeroupper
4534 ; AVX512F-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4536 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
4537 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
4538 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4539 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4540 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4541 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
4542 ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0
4543 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4544 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4545 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4546 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
4547 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
4548 ; AVX512F-NEXT: vzeroupper
4549 ; AVX512F-NEXT: retq
4551 ; AVX512DQ-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4552 ; AVX512DQ: # %bb.0:
4553 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
4554 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
4555 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4556 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4557 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4558 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
4559 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
4560 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4561 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4562 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4563 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
4564 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
4565 ; AVX512DQ-NEXT: vzeroupper
4566 ; AVX512DQ-NEXT: retq
4568 ; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
4569 ; AVX512BW: # %bb.0:
4570 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
4571 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
4572 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4573 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
4574 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4575 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4576 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
4577 ; AVX512BW-NEXT: vzeroupper
4578 ; AVX512BW-NEXT: retq
4579 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4580 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4581 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4582 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
4583 %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23>
4584 %out.bytevec = bitcast <12 x i32> %broadcast.of.aextinreg to <48 x i8>
4585 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4586 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4587 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4588 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4589 ret void
4590 }
4592 define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4593 ; SSE2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4595 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4596 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
4597 ; SSE2-NEXT: paddb (%rsi), %xmm0
4598 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
4599 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
4600 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4601 ; SSE2-NEXT: paddb (%rdx), %xmm1
4602 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
4603 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
4604 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
4607 ; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4609 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4610 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
4611 ; SSE42-NEXT: paddb (%rsi), %xmm0
4612 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
4613 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
4614 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4615 ; SSE42-NEXT: paddb (%rdx), %xmm1
4616 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
4617 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
4618 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
4621 ; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4623 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4624 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
4625 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4626 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4627 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
4628 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4629 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
4630 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
4631 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
4632 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
4635 ; AVX2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4637 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
4638 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
4639 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4640 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4641 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4642 ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,7]
4643 ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
4644 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4645 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
4646 ; AVX2-NEXT: vzeroupper
4649 ; AVX512F-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4651 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
4652 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
4653 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4654 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4655 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4656 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
4657 ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0
4658 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4659 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
4660 ; AVX512F-NEXT: vzeroupper
4661 ; AVX512F-NEXT: retq
4663 ; AVX512DQ-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4664 ; AVX512DQ: # %bb.0:
4665 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
4666 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
4667 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4668 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4669 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4670 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
4671 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
4672 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4673 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
4674 ; AVX512DQ-NEXT: vzeroupper
4675 ; AVX512DQ-NEXT: retq
4677 ; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4678 ; AVX512BW-SLOW: # %bb.0:
4679 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
4680 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4681 ; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
4682 ; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
4683 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4684 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
4685 ; AVX512BW-SLOW-NEXT: vzeroupper
4686 ; AVX512BW-SLOW-NEXT: retq
4688 ; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
4689 ; AVX512BW-FAST: # %bb.0:
4690 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
4691 ; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
4692 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4693 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
4694 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
4695 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4696 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4697 ; AVX512BW-FAST-NEXT: vzeroupper
4698 ; AVX512BW-FAST-NEXT: retq
4699 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4700 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4701 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4702 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
4703 %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 16, i32 17, i32 0, i32 19, i32 20, i32 21, i32 22, i32 23>
4704 %out.bytevec = bitcast <12 x i32> %broadcast.of.aextinreg to <48 x i8>
4705 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4706 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4707 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4708 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4709 ret void
4710 }
4712 define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4713 ; SSE2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4715 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4716 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
4717 ; SSE2-NEXT: paddb (%rsi), %xmm0
4718 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
4719 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4720 ; SSE2-NEXT: paddb (%rdx), %xmm1
4721 ; SSE2-NEXT: movdqa 16(%rdx), %xmm2
4722 ; SSE2-NEXT: paddb %xmm0, %xmm2
4723 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
4724 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
4725 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
4726 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
4729 ; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4731 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4732 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
4733 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
4734 ; SSE42-NEXT: paddb (%rsi), %xmm0
4735 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
4736 ; SSE42-NEXT: paddb (%rdx), %xmm1
4737 ; SSE42-NEXT: movdqa 16(%rdx), %xmm2
4738 ; SSE42-NEXT: paddb %xmm0, %xmm2
4739 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
4740 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
4741 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
4742 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
4745 ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4747 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4748 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
4749 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
4750 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
4751 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
4752 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4753 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3
4754 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[2]
4755 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
4756 ; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
4757 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
4758 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
4759 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
4760 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
4761 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
4762 ; AVX-NEXT: vmovdqa %xmm2, (%rcx)
4763 ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
4764 ; AVX-NEXT: vzeroupper
4767 ; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4769 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
4770 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
4771 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4772 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4773 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
4774 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,0,3]
4775 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4776 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4777 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
4778 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
4779 ; AVX2-NEXT: vzeroupper
4782 ; AVX512F-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4784 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
4785 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
4786 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4787 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4788 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4789 ; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
4790 ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
4791 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4792 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4793 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4794 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
4795 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
4796 ; AVX512F-NEXT: vzeroupper
4797 ; AVX512F-NEXT: retq
4799 ; AVX512DQ-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4800 ; AVX512DQ: # %bb.0:
4801 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
4802 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
4803 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4804 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4805 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4806 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
4807 ; AVX512DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
4808 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4809 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4810 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4811 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
4812 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
4813 ; AVX512DQ-NEXT: vzeroupper
4814 ; AVX512DQ-NEXT: retq
4816 ; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4817 ; AVX512BW-SLOW: # %bb.0:
4818 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
4819 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4820 ; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4821 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
4822 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,0,3]
4823 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4824 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4825 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
4826 ; AVX512BW-SLOW-NEXT: vzeroupper
4827 ; AVX512BW-SLOW-NEXT: retq
4829 ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
4830 ; AVX512BW-FAST: # %bb.0:
4831 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
4832 ; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
4833 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4834 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm1
4835 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4836 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4837 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4838 ; AVX512BW-FAST-NEXT: vzeroupper
4839 ; AVX512BW-FAST-NEXT: retq
4840 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4841 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4842 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4843 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
4844 %broadcast.of.aextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <6 x i32> <i32 0, i32 7, i32 0, i32 9, i32 0, i32 11>
4845 %out.bytevec = bitcast <6 x i64> %broadcast.of.aextinreg to <48 x i8>
4846 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4847 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4848 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4849 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4850 ret void
4851 }
4853 define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4854 ; SSE2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4856 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4857 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
4858 ; SSE2-NEXT: paddb (%rsi), %xmm0
4859 ; SSE2-NEXT: paddb 48(%rsi), %xmm1
4860 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4861 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4862 ; SSE2-NEXT: paddb (%rdx), %xmm1
4863 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
4864 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
4865 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
4868 ; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4870 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4871 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1
4872 ; SSE42-NEXT: paddb 48(%rsi), %xmm1
4873 ; SSE42-NEXT: paddb (%rsi), %xmm0
4874 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
4875 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4876 ; SSE42-NEXT: paddb (%rdx), %xmm1
4877 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
4878 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
4879 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
4882 ; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4884 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4885 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
4886 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4887 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
4888 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
4889 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
4890 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
4891 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
4892 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
4893 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
4894 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
4895 ; AVX-NEXT: vzeroupper
4898 ; AVX2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4900 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
4901 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
4902 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4903 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4904 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
4905 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,0]
4906 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4907 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
4908 ; AVX2-NEXT: vzeroupper
4911 ; AVX512F-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4913 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
4914 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
4915 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4916 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4917 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4918 ; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
4919 ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
4920 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4921 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
4922 ; AVX512F-NEXT: vzeroupper
4923 ; AVX512F-NEXT: retq
4925 ; AVX512DQ-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4926 ; AVX512DQ: # %bb.0:
4927 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
4928 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
4929 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4930 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4931 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4932 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
4933 ; AVX512DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
4934 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4935 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
4936 ; AVX512DQ-NEXT: vzeroupper
4937 ; AVX512DQ-NEXT: retq
4939 ; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4940 ; AVX512BW-SLOW: # %bb.0:
4941 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
4942 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4943 ; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
4944 ; AVX512BW-SLOW-NEXT: vpermq %zmm0, %zmm1, %zmm0
4945 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4946 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
4947 ; AVX512BW-SLOW-NEXT: vzeroupper
4948 ; AVX512BW-SLOW-NEXT: retq
4950 ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
4951 ; AVX512BW-FAST: # %bb.0:
4952 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
4953 ; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
4954 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4955 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0
4956 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
4957 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4958 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4959 ; AVX512BW-FAST-NEXT: vzeroupper
4960 ; AVX512BW-FAST-NEXT: retq
4961 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4962 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4963 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4964 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
4965 %broadcast.of.aextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <6 x i32> <i32 0, i32 7, i32 8, i32 0, i32 10, i32 11>
4966 %out.bytevec = bitcast <6 x i64> %broadcast.of.aextinreg to <48 x i8>
4967 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4968 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4969 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4970 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4971 ret void
4972 }
4974 define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4975 ; SSE-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
4977 ; SSE-NEXT: movdqa (%rdi), %xmm0
4978 ; SSE-NEXT: paddb (%rsi), %xmm0
4979 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
4980 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4981 ; SSE-NEXT: movdqa 16(%rdx), %xmm1
4982 ; SSE-NEXT: paddb %xmm0, %xmm1
4983 ; SSE-NEXT: movdqa (%rdx), %xmm2
4984 ; SSE-NEXT: paddb %xmm0, %xmm2
4985 ; SSE-NEXT: movdqa 48(%rdx), %xmm3
4986 ; SSE-NEXT: paddb %xmm0, %xmm3
4987 ; SSE-NEXT: paddb 32(%rdx), %xmm0
4988 ; SSE-NEXT: movdqa %xmm0, 32(%rcx)
4989 ; SSE-NEXT: movdqa %xmm3, 48(%rcx)
4990 ; SSE-NEXT: movdqa %xmm2, (%rcx)
4991 ; SSE-NEXT: movdqa %xmm1, 16(%rcx)
4994 ; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
4996 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4997 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4998 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
4999 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5000 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1
5001 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
5002 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3
5003 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
5004 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
5005 ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
5006 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
5007 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
5010 ; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
5012 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
5013 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5014 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
5015 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5016 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5017 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
5018 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
5019 ; AVX2-NEXT: vzeroupper
5022 ; AVX512F-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
5024 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
5025 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5026 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
5027 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5028 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5029 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5030 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
5031 ; AVX512F-NEXT: vzeroupper
5032 ; AVX512F-NEXT: retq
5034 ; AVX512DQ-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
5035 ; AVX512DQ: # %bb.0:
5036 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
5037 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5038 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
5039 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5040 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5041 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
5042 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
5043 ; AVX512DQ-NEXT: vzeroupper
5044 ; AVX512DQ-NEXT: retq
5046 ; AVX512BW-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
5047 ; AVX512BW: # %bb.0:
5048 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
5049 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5050 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
5051 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5052 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5053 ; AVX512BW-NEXT: vzeroupper
5054 ; AVX512BW-NEXT: retq
5055 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5056 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5057 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5058 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <64 x i32> <i32 0, i32 65, i32 0, i32 67, i32 0, i32 69, i32 0, i32 71, i32 0, i32 73, i32 0, i32 75, i32 0, i32 77, i32 0, i32 79, i32 0, i32 81, i32 0, i32 83, i32 0, i32 85, i32 0, i32 87, i32 0, i32 89, i32 0, i32 91, i32 0, i32 93, i32 0, i32 95, i32 0, i32 97, i32 0, i32 99, i32 0, i32 101, i32 0, i32 103, i32 0, i32 105, i32 0, i32 107, i32 0, i32 109, i32 0, i32 111, i32 0, i32 113, i32 0, i32 115, i32 0, i32 117, i32 0, i32 119, i32 0, i32 121, i32 0, i32 123, i32 0, i32 125, i32 0, i32 127>
5059 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5060 %out.vec = add <64 x i8> %broadcast.of.aextinreg, %out.vec.bias
5061 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5062 ret void
5063 }
5065 define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5066 ; SSE-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
5068 ; SSE-NEXT: movdqa (%rdi), %xmm0
5069 ; SSE-NEXT: paddb (%rsi), %xmm0
5070 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5071 ; SSE-NEXT: movdqa 16(%rdx), %xmm1
5072 ; SSE-NEXT: paddb %xmm0, %xmm1
5073 ; SSE-NEXT: movdqa (%rdx), %xmm2
5074 ; SSE-NEXT: paddb %xmm0, %xmm2
5075 ; SSE-NEXT: movdqa 48(%rdx), %xmm3
5076 ; SSE-NEXT: paddb %xmm0, %xmm3
5077 ; SSE-NEXT: paddb 32(%rdx), %xmm0
5078 ; SSE-NEXT: movdqa %xmm0, 32(%rcx)
5079 ; SSE-NEXT: movdqa %xmm3, 48(%rcx)
5080 ; SSE-NEXT: movdqa %xmm2, (%rcx)
5081 ; SSE-NEXT: movdqa %xmm1, 16(%rcx)
5084 ; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
5086 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5087 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5088 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5089 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1
5090 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
5091 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3
5092 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
5093 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
5094 ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
5095 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
5096 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
5099 ; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
5101 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
5102 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5103 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
5104 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5105 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5106 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
5107 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
5108 ; AVX2-NEXT: vzeroupper
5111 ; AVX512F-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
5113 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
5114 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5115 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
5116 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5117 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5118 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5119 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
5120 ; AVX512F-NEXT: vzeroupper
5121 ; AVX512F-NEXT: retq
5123 ; AVX512DQ-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
5124 ; AVX512DQ: # %bb.0:
5125 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
5126 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5127 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
5128 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5129 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5130 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
5131 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
5132 ; AVX512DQ-NEXT: vzeroupper
5133 ; AVX512DQ-NEXT: retq
5135 ; AVX512BW-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
5136 ; AVX512BW: # %bb.0:
5137 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
5138 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5139 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
5140 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5141 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5142 ; AVX512BW-NEXT: vzeroupper
5143 ; AVX512BW-NEXT: retq
5144 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5145 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5146 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5147 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 0, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 0, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 0, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 0, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 0, i32 101, i32 102, i32 103, i32 0, i32 105, i32 106, i32 107, i32 0, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 0, i32 117, i32 118, i32 119, i32 0, i32 121, i32 122, i32 123, i32 0, i32 125, i32 126, i32 127>
5148 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5149 %out.vec = add <64 x i8> %broadcast.of.aextinreg, %out.vec.bias
5150 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5151 ret void
5152 }
5154 define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5155 ; SSE-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
5157 ; SSE-NEXT: movdqa (%rdi), %xmm0
5158 ; SSE-NEXT: paddb (%rsi), %xmm0
5159 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
5160 ; SSE-NEXT: movdqa 16(%rdx), %xmm1
5161 ; SSE-NEXT: paddb %xmm0, %xmm1
5162 ; SSE-NEXT: movdqa (%rdx), %xmm2
5163 ; SSE-NEXT: paddb %xmm0, %xmm2
5164 ; SSE-NEXT: movdqa 48(%rdx), %xmm3
5165 ; SSE-NEXT: paddb %xmm0, %xmm3
5166 ; SSE-NEXT: paddb 32(%rdx), %xmm0
5167 ; SSE-NEXT: movdqa %xmm0, 32(%rcx)
5168 ; SSE-NEXT: movdqa %xmm3, 48(%rcx)
5169 ; SSE-NEXT: movdqa %xmm2, (%rcx)
5170 ; SSE-NEXT: movdqa %xmm1, 16(%rcx)
5173 ; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
5175 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5176 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5177 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
5178 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1
5179 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
5180 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3
5181 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
5182 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
5183 ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
5184 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
5185 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
5188 ; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
5190 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
5191 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5192 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
5193 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5194 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5195 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
5196 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
5197 ; AVX2-NEXT: vzeroupper
5200 ; AVX512F-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
5202 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
5203 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5204 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
5205 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5206 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5207 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5208 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
5209 ; AVX512F-NEXT: vzeroupper
5210 ; AVX512F-NEXT: retq
5212 ; AVX512DQ-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
5213 ; AVX512DQ: # %bb.0:
5214 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
5215 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5216 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
5217 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5218 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5219 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
5220 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
5221 ; AVX512DQ-NEXT: vzeroupper
5222 ; AVX512DQ-NEXT: retq
5224 ; AVX512BW-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
5225 ; AVX512BW: # %bb.0:
5226 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
5227 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5228 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
5229 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5230 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5231 ; AVX512BW-NEXT: vzeroupper
5232 ; AVX512BW-NEXT: retq
5233 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5234 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5235 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5236 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 0, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 0, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 0, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 0, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
5237 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5238 %out.vec = add <64 x i8> %broadcast.of.aextinreg, %out.vec.bias
5239 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5240 ret void
5241 }
5243 define void @vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5244 ; SSE-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
5246 ; SSE-NEXT: movdqa (%rdi), %xmm0
5247 ; SSE-NEXT: paddb (%rsi), %xmm0
5248 ; SSE-NEXT: movdqa 16(%rdx), %xmm1
5249 ; SSE-NEXT: paddb %xmm0, %xmm1
5250 ; SSE-NEXT: movdqa (%rdx), %xmm2
5251 ; SSE-NEXT: paddb %xmm0, %xmm2
5252 ; SSE-NEXT: movdqa 48(%rdx), %xmm3
5253 ; SSE-NEXT: paddb %xmm0, %xmm3
5254 ; SSE-NEXT: paddb 32(%rdx), %xmm0
5255 ; SSE-NEXT: movdqa %xmm0, 32(%rcx)
5256 ; SSE-NEXT: movdqa %xmm3, 48(%rcx)
5257 ; SSE-NEXT: movdqa %xmm2, (%rcx)
5258 ; SSE-NEXT: movdqa %xmm1, 16(%rcx)
5261 ; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
5263 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5264 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5265 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1
5266 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
5267 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3
5268 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
5269 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
5270 ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
5271 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
5272 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
5275 ; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
5277 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
5278 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5279 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
5280 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5281 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5282 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
5283 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
5284 ; AVX2-NEXT: vzeroupper
5287 ; AVX512F-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
5289 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
5290 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5291 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
5292 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5293 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5294 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5295 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
5296 ; AVX512F-NEXT: vzeroupper
5297 ; AVX512F-NEXT: retq
5299 ; AVX512DQ-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
5300 ; AVX512DQ: # %bb.0:
5301 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
5302 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5303 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
5304 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5305 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5306 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
5307 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
5308 ; AVX512DQ-NEXT: vzeroupper
5309 ; AVX512DQ-NEXT: retq
5311 ; AVX512BW-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
5312 ; AVX512BW: # %bb.0:
5313 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
5314 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5315 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
5316 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5317 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5318 ; AVX512BW-NEXT: vzeroupper
5319 ; AVX512BW-NEXT: retq
5320 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5321 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5322 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5323 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 0, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 0, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
5324 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5325 %out.vec = add <64 x i8> %broadcast.of.aextinreg, %out.vec.bias
5326 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5327 ret void
5328 }
5330 define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5331 ; SSE-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
5333 ; SSE-NEXT: movdqa (%rdi), %xmm0
5334 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
5335 ; SSE-NEXT: paddb (%rsi), %xmm0
5336 ; SSE-NEXT: paddb 16(%rsi), %xmm1
5337 ; SSE-NEXT: movdqa 16(%rdx), %xmm2
5338 ; SSE-NEXT: paddb %xmm1, %xmm2
5339 ; SSE-NEXT: movdqa (%rdx), %xmm3
5340 ; SSE-NEXT: paddb %xmm0, %xmm3
5341 ; SSE-NEXT: paddb 48(%rdx), %xmm1
5342 ; SSE-NEXT: paddb 32(%rdx), %xmm0
5343 ; SSE-NEXT: movdqa %xmm0, 32(%rcx)
5344 ; SSE-NEXT: movdqa %xmm1, 48(%rcx)
5345 ; SSE-NEXT: movdqa %xmm3, (%rcx)
5346 ; SSE-NEXT: movdqa %xmm2, 16(%rcx)
5349 ; AVX-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
5351 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5352 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
5353 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5354 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
5355 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2
5356 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
5357 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
5358 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
5359 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
5360 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
5361 ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
5362 ; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
5365 ; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
5367 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
5368 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5369 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5370 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5371 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
5372 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
5373 ; AVX2-NEXT: vzeroupper
5376 ; AVX512F-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
5378 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5379 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5380 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5381 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5382 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5383 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
5384 ; AVX512F-NEXT: vzeroupper
5385 ; AVX512F-NEXT: retq
5387 ; AVX512DQ-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
5388 ; AVX512DQ: # %bb.0:
5389 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
5390 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5391 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
5392 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5393 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
5394 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
5395 ; AVX512DQ-NEXT: vzeroupper
5396 ; AVX512DQ-NEXT: retq
5398 ; AVX512BW-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
5399 ; AVX512BW: # %bb.0:
5400 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
5401 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5402 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
5403 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5404 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5405 ; AVX512BW-NEXT: vzeroupper
5406 ; AVX512BW-NEXT: retq
5407 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5408 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5409 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5410 %broadcast.of.aextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 0, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
5411 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5412 %out.vec = add <64 x i8> %broadcast.of.aextinreg, %out.vec.bias
5413 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
5414 ret void
5415 }
5417 ; FIXME: all these crash during selection:
; define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
; %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
; %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <32 x i32> <i32 0, i32 33, i32 0, i32 35, i32 0, i32 37, i32 0, i32 39, i32 0, i32 41, i32 0, i32 43, i32 0, i32 45, i32 0, i32 47, i32 0, i32 49, i32 0, i32 51, i32 0, i32 53, i32 0, i32 55, i32 0, i32 57, i32 0, i32 59, i32 0, i32 61, i32 0, i32 63>
; %out.bytevec = bitcast <32 x i16> %broadcast.of.aextinreg to <64 x i8>
; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
; ret void
; }

; define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
; %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
; %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 0, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 0, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 0, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 0, i32 61, i32 62, i32 63>
; %out.bytevec = bitcast <32 x i16> %broadcast.of.aextinreg to <64 x i8>
; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
; ret void
; }

; define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
; %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
; %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 0, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 0, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
; %out.bytevec = bitcast <32 x i16> %broadcast.of.aextinreg to <64 x i8>
; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
; ret void
; }

; define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
; %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
; %broadcast.of.aextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
; %out.bytevec = bitcast <32 x i16> %broadcast.of.aextinreg to <64 x i8>
; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
; ret void
; }

; define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
; %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
; %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <16 x i32> <i32 0, i32 17, i32 0, i32 19, i32 0, i32 21, i32 0, i32 23, i32 0, i32 25, i32 0, i32 27, i32 0, i32 29, i32 0, i32 31>
; %out.bytevec = bitcast <16 x i32> %broadcast.of.aextinreg to <64 x i8>
; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
; ret void
; }

; define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
; %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
; %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 0, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 0, i32 29, i32 30, i32 31>
; %out.bytevec = bitcast <16 x i32> %broadcast.of.aextinreg to <64 x i8>
; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
; ret void
; }

; define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
; %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
; %broadcast.of.aextinreg = shufflevector <16 x i32> %in.vec.cast, <16 x i32> poison, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; %out.bytevec = bitcast <16 x i32> %broadcast.of.aextinreg to <64 x i8>
; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
; ret void
; }

; define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
; %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
; %broadcast.of.aextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
; %out.bytevec = bitcast <8 x i64> %broadcast.of.aextinreg to <64 x i8>
; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
; ret void
; }

; define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
; %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
; %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
; %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
; %broadcast.of.aextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
; %out.bytevec = bitcast <8 x i64> %broadcast.of.aextinreg to <64 x i8>
; %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
; %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
; store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
; ret void
; }

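; Unlike the i16/i32/i64 cases above, the i128 variant below does select: its
; shuffle mask keeps only element 0 of the <4 x i128> value (the low 128 bits)
; and broadcasts it into both 256-bit halves of the result, leaving the odd
; elements poison, so only whole-lane copies are required.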
define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: paddb (%rsi), %xmm0
; SSE-NEXT: paddb 16(%rsi), %xmm1
; SSE-NEXT: movdqa 16(%rdx), %xmm2
; SSE-NEXT: paddb %xmm1, %xmm2
; SSE-NEXT: movdqa (%rdx), %xmm3
; SSE-NEXT: paddb %xmm0, %xmm3
; SSE-NEXT: paddb 48(%rdx), %xmm1
; SSE-NEXT: paddb 32(%rdx), %xmm0
; SSE-NEXT: movdqa %xmm0, 32(%rcx)
; SSE-NEXT: movdqa %xmm1, 48(%rcx)
; SSE-NEXT: movdqa %xmm3, (%rcx)
; SSE-NEXT: movdqa %xmm2, 16(%rcx)
; SSE-NEXT: retq
; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
; AVX-NEXT: retq
; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX512F-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
; AVX512DQ-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512BW-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
  %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
  %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
  %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128>
  %broadcast.of.aextinreg = shufflevector <4 x i128> %in.vec.cast, <4 x i128> poison, <4 x i32> <i32 0, i32 5, i32 0, i32 7>
  %out.bytevec = bitcast <4 x i128> %broadcast.of.aextinreg to <64 x i8>
  %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
  %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; FALLBACK10: {{.*}}
; FALLBACK11: {{.*}}
; FALLBACK12: {{.*}}
; FALLBACK13: {{.*}}