1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,FALLBACK0
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-ONLY,FALLBACK1
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
16 ; These patterns are produced by LoopVectorizer for interleaved stores.
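;
; For reference, the scalar source that the LoopVectorizer turns into the
; patterns below looks roughly like the following stride-3 store loop
; (an illustrative sketch only, not part of the test; the names n, a, b,
; c and out are made up):
;
;   for (int i = 0; i < n; ++i) {
;     out[3*i + 0] = a[i];
;     out[3*i + 1] = b[i];
;     out[3*i + 2] = c[i];
;   }
;
; Each @store_i16_stride3_vfN function below is the already-vectorized form
; for a vectorization factor of N: it loads three <N x i16> vectors,
; concatenates them, applies the interleaving shufflevector, and stores the
; resulting <3*N x i16> value.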
18 define void @store_i16_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
19 ; SSE-LABEL: store_i16_stride3_vf2:
21 ; SSE-NEXT: movdqa (%rdi), %xmm0
22 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
23 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
24 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
25 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
26 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
27 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
28 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7]
29 ; SSE-NEXT: movq %xmm0, (%rcx)
30 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
31 ; SSE-NEXT: movd %xmm0, 8(%rcx)
34 ; AVX1-LABEL: store_i16_stride3_vf2:
36 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
37 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
38 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
39 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,4,5,12,13,6,7,4,5,6,7]
40 ; AVX1-NEXT: vpextrd $2, %xmm0, 8(%rcx)
41 ; AVX1-NEXT: vmovq %xmm0, (%rcx)
44 ; AVX512F-LABEL: store_i16_stride3_vf2:
46 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
47 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
48 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
49 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,4,5,12,13,6,7,4,5,6,7]
50 ; AVX512F-NEXT: vpextrd $2, %xmm0, 8(%rcx)
51 ; AVX512F-NEXT: vmovq %xmm0, (%rcx)
54 ; AVX512BW-LABEL: store_i16_stride3_vf2:
56 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
57 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
58 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9]
59 ; AVX512BW-NEXT: vpermi2w (%rdx), %xmm0, %xmm1
60 ; AVX512BW-NEXT: vpextrd $2, %xmm1, 8(%rcx)
61 ; AVX512BW-NEXT: vmovq %xmm1, (%rcx)
63 %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64
64 %in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 64
65 %in.vec2 = load <2 x i16>, ptr %in.vecptr2, align 64
66 %1 = shufflevector <2 x i16> %in.vec0, <2 x i16> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
67 %2 = shufflevector <2 x i16> %in.vec2, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
68 %3 = shufflevector <4 x i16> %1, <4 x i16> %2, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
69 %interleaved.vec = shufflevector <6 x i16> %3, <6 x i16> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
70 store <6 x i16> %interleaved.vec, ptr %out.vec, align 64
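  ret void
}

; vf4: three <4 x i16> inputs are interleaved into a single <12 x i16>
; store. On AVX512BW this is done with one vpermw through a 16-lane index
; vector; SSE/AVX/AVX2 assemble the result from shuffle and blend/or
; sequences.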
74 define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
75 ; SSE-LABEL: store_i16_stride3_vf4:
77 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
78 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
79 ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
80 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
81 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
82 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535]
83 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,4,7]
84 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
85 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7]
86 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,6,4]
87 ; SSE-NEXT: pand %xmm3, %xmm4
88 ; SSE-NEXT: pandn %xmm1, %xmm3
89 ; SSE-NEXT: por %xmm4, %xmm3
90 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
91 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,65535,65535]
92 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
93 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
94 ; SSE-NEXT: pand %xmm1, %xmm2
95 ; SSE-NEXT: pandn %xmm0, %xmm1
96 ; SSE-NEXT: por %xmm2, %xmm1
97 ; SSE-NEXT: movq %xmm1, 16(%rcx)
98 ; SSE-NEXT: movdqa %xmm3, (%rcx)
101 ; AVX1-ONLY-LABEL: store_i16_stride3_vf4:
102 ; AVX1-ONLY: # %bb.0:
103 ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
104 ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
105 ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
106 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
107 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
108 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
109 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
110 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
111 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13]
112 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
113 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
114 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rcx)
115 ; AVX1-ONLY-NEXT: vmovq %xmm2, 16(%rcx)
116 ; AVX1-ONLY-NEXT: retq
118 ; AVX2-ONLY-LABEL: store_i16_stride3_vf4:
119 ; AVX2-ONLY: # %bb.0:
120 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
121 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
122 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
123 ; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
124 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
125 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
126 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
127 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
128 ; AVX2-ONLY-NEXT: vpor %ymm0, %ymm1, %ymm0
129 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1
130 ; AVX2-ONLY-NEXT: vmovq %xmm1, 16(%rcx)
131 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rcx)
132 ; AVX2-ONLY-NEXT: vzeroupper
133 ; AVX2-ONLY-NEXT: retq
135 ; AVX512F-LABEL: store_i16_stride3_vf4:
137 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
138 ; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
139 ; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
140 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
141 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
142 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
143 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
144 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
145 ; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0
146 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
147 ; AVX512F-NEXT: vmovq %xmm1, 16(%rcx)
148 ; AVX512F-NEXT: vmovdqa %xmm0, (%rcx)
149 ; AVX512F-NEXT: vzeroupper
152 ; AVX512BW-LABEL: store_i16_stride3_vf4:
154 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
155 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
156 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
157 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
158 ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
159 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u>
160 ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
161 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
162 ; AVX512BW-NEXT: vmovq %xmm1, 16(%rcx)
163 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rcx)
164 ; AVX512BW-NEXT: vzeroupper
165 ; AVX512BW-NEXT: retq
166 %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64
167 %in.vec1 = load <4 x i16>, ptr %in.vecptr1, align 64
168 %in.vec2 = load <4 x i16>, ptr %in.vecptr2, align 64
169 %1 = shufflevector <4 x i16> %in.vec0, <4 x i16> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
170 %2 = shufflevector <4 x i16> %in.vec2, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
171 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
172 %interleaved.vec = shufflevector <12 x i16> %3, <12 x i16> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
173 store <12 x i16> %interleaved.vec, ptr %out.vec, align 64
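  ret void
}

; vf8: three <8 x i16> inputs become one <24 x i16> store. AVX512BW handles
; the whole interleave with a single vpermw on a zmm register; AVX2 uses
; vpshufb/vpermd plus vpblendvb, and AVX512F merges with vpternlogq.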
177 define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
178 ; SSE-LABEL: store_i16_stride3_vf8:
180 ; SSE-NEXT: movdqa (%rdi), %xmm0
181 ; SSE-NEXT: movdqa (%rsi), %xmm2
182 ; SSE-NEXT: movdqa (%rdx), %xmm3
183 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0]
184 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535]
185 ; SSE-NEXT: movdqa %xmm4, %xmm5
186 ; SSE-NEXT: pandn %xmm1, %xmm5
187 ; SSE-NEXT: movdqa %xmm0, %xmm1
188 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
189 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
190 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
191 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5]
192 ; SSE-NEXT: pand %xmm4, %xmm1
193 ; SSE-NEXT: por %xmm5, %xmm1
194 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,2]
195 ; SSE-NEXT: pand %xmm4, %xmm5
196 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,3,3,3,4,5,6,7]
197 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
198 ; SSE-NEXT: pandn %xmm6, %xmm4
199 ; SSE-NEXT: por %xmm5, %xmm4
200 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,0,65535]
201 ; SSE-NEXT: pand %xmm5, %xmm4
202 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,2,2]
203 ; SSE-NEXT: pandn %xmm6, %xmm5
204 ; SSE-NEXT: por %xmm4, %xmm5
205 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
206 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0]
207 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
208 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,1,3,3]
209 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,1,0,4,5,6,7]
210 ; SSE-NEXT: pand %xmm4, %xmm0
211 ; SSE-NEXT: pandn %xmm3, %xmm4
212 ; SSE-NEXT: por %xmm0, %xmm4
213 ; SSE-NEXT: movdqa %xmm4, 32(%rcx)
214 ; SSE-NEXT: movdqa %xmm5, 16(%rcx)
215 ; SSE-NEXT: movdqa %xmm1, (%rcx)
218 ; AVX1-ONLY-LABEL: store_i16_stride3_vf8:
219 ; AVX1-ONLY: # %bb.0:
220 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
221 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm1
222 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2
223 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
224 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[3,3,3,3,4,5,6,7]
225 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
226 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
227 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2]
228 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7]
229 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
230 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
231 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3]
232 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7]
233 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
234 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,u,u,4,5,6,7,u,u,8,9,10,11]
235 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
236 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
237 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rcx)
238 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rcx)
239 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%rcx)
240 ; AVX1-ONLY-NEXT: retq
242 ; AVX2-SLOW-LABEL: store_i16_stride3_vf8:
243 ; AVX2-SLOW: # %bb.0:
244 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
245 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1
246 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm2
247 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
248 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u]
249 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
250 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
251 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
252 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
253 ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4
254 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
255 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
256 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
257 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
258 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
259 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
260 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, 32(%rcx)
261 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%rcx)
262 ; AVX2-SLOW-NEXT: vzeroupper
263 ; AVX2-SLOW-NEXT: retq
265 ; AVX2-FAST-LABEL: store_i16_stride3_vf8:
266 ; AVX2-FAST: # %bb.0:
267 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
268 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1
269 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm2
270 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
271 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
272 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm4
273 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6]
274 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3
275 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27]
276 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
277 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
278 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
279 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
280 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
281 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
282 ; AVX2-FAST-NEXT: vmovdqa %xmm0, 32(%rcx)
283 ; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rcx)
284 ; AVX2-FAST-NEXT: vzeroupper
285 ; AVX2-FAST-NEXT: retq
287 ; AVX2-FAST-PERLANE-LABEL: store_i16_stride3_vf8:
288 ; AVX2-FAST-PERLANE: # %bb.0:
289 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
290 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1
291 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2
292 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
293 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u]
294 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
295 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
296 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
297 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
298 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm4
299 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
300 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
301 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
302 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
303 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
304 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
305 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 32(%rcx)
306 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rcx)
307 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
308 ; AVX2-FAST-PERLANE-NEXT: retq
310 ; AVX512F-SLOW-LABEL: store_i16_stride3_vf8:
311 ; AVX512F-SLOW: # %bb.0:
312 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
313 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1
314 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm2
315 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
316 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u]
317 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
318 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
319 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
320 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
321 ; AVX512F-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4
322 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4
323 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
324 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
325 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
326 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
327 ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
328 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, 32(%rcx)
329 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
330 ; AVX512F-SLOW-NEXT: vzeroupper
331 ; AVX512F-SLOW-NEXT: retq
333 ; AVX512F-FAST-LABEL: store_i16_stride3_vf8:
334 ; AVX512F-FAST: # %bb.0:
335 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
336 ; AVX512F-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
337 ; AVX512F-FAST-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0
338 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
339 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
340 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
341 ; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm2
342 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
343 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
344 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,0,0,u,1,1,u,2>
345 ; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
346 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5,1,5,2,6]
347 ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0
348 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5],zero,zero,ymm0[2,3,6,7],zero,zero,ymm0[8,9,12,13],zero,zero,ymm0[18,19,22,23],zero,zero,ymm0[24,25,28,29],zero,zero,ymm0[26,27]
349 ; AVX512F-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
350 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
351 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, 32(%rcx)
352 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
353 ; AVX512F-FAST-NEXT: vzeroupper
354 ; AVX512F-FAST-NEXT: retq
356 ; AVX512BW-LABEL: store_i16_stride3_vf8:
358 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
359 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
360 ; AVX512BW-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0
361 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,u,u,u,u,u,u,u,u>
362 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
363 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rcx)
364 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rcx)
365 ; AVX512BW-NEXT: vzeroupper
366 ; AVX512BW-NEXT: retq
367 %in.vec0 = load <8 x i16>, ptr %in.vecptr0, align 64
368 %in.vec1 = load <8 x i16>, ptr %in.vecptr1, align 64
369 %in.vec2 = load <8 x i16>, ptr %in.vecptr2, align 64
370 %1 = shufflevector <8 x i16> %in.vec0, <8 x i16> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
371 %2 = shufflevector <8 x i16> %in.vec2, <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
372 %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
373 %interleaved.vec = shufflevector <24 x i16> %3, <24 x i16> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
374 store <24 x i16> %interleaved.vec, ptr %out.vec, align 64
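  ret void
}

; vf16: three <16 x i16> inputs become one <48 x i16> (96-byte) store.
; AVX512BW lowers the interleave to two vpermi2w permutes of the
; concatenated sources; the other targets build it from xmm/ymm shuffle,
; permute, and blend sequences.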
378 define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
379 ; SSE-LABEL: store_i16_stride3_vf16:
381 ; SSE-NEXT: movdqa (%rdi), %xmm1
382 ; SSE-NEXT: movdqa 16(%rdi), %xmm6
383 ; SSE-NEXT: movdqa (%rsi), %xmm2
384 ; SSE-NEXT: movdqa 16(%rsi), %xmm7
385 ; SSE-NEXT: movdqa (%rdx), %xmm4
386 ; SSE-NEXT: movdqa 16(%rdx), %xmm9
387 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2]
388 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535]
389 ; SSE-NEXT: pand %xmm0, %xmm3
390 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7]
391 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
392 ; SSE-NEXT: movdqa %xmm0, %xmm8
393 ; SSE-NEXT: pandn %xmm5, %xmm8
394 ; SSE-NEXT: por %xmm3, %xmm8
395 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,0,65535]
396 ; SSE-NEXT: pand %xmm3, %xmm8
397 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
398 ; SSE-NEXT: movdqa %xmm3, %xmm5
399 ; SSE-NEXT: pandn %xmm10, %xmm5
400 ; SSE-NEXT: por %xmm8, %xmm5
401 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,0,0]
402 ; SSE-NEXT: movdqa %xmm0, %xmm10
403 ; SSE-NEXT: pandn %xmm8, %xmm10
404 ; SSE-NEXT: movdqa %xmm6, %xmm8
405 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
406 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1]
407 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7]
408 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,4,5]
409 ; SSE-NEXT: pand %xmm0, %xmm8
410 ; SSE-NEXT: por %xmm10, %xmm8
411 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,2,2]
412 ; SSE-NEXT: pand %xmm0, %xmm10
413 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm2[3,3,3,3,4,5,6,7]
414 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4]
415 ; SSE-NEXT: movdqa %xmm0, %xmm12
416 ; SSE-NEXT: pandn %xmm11, %xmm12
417 ; SSE-NEXT: por %xmm10, %xmm12
418 ; SSE-NEXT: pand %xmm3, %xmm12
419 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,2,2]
420 ; SSE-NEXT: pandn %xmm10, %xmm3
421 ; SSE-NEXT: por %xmm12, %xmm3
422 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,0,0]
423 ; SSE-NEXT: movdqa %xmm1, %xmm11
424 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3]
425 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
426 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7]
427 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,4,5]
428 ; SSE-NEXT: pand %xmm0, %xmm11
429 ; SSE-NEXT: pandn %xmm10, %xmm0
430 ; SSE-NEXT: por %xmm11, %xmm0
431 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
432 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0]
433 ; SSE-NEXT: movdqa %xmm10, %xmm11
434 ; SSE-NEXT: pandn %xmm9, %xmm11
435 ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
436 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,1,3,3]
437 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,1,0,4,5,6,7]
438 ; SSE-NEXT: pand %xmm10, %xmm6
439 ; SSE-NEXT: por %xmm11, %xmm6
440 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
441 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
442 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
443 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7]
444 ; SSE-NEXT: pand %xmm10, %xmm1
445 ; SSE-NEXT: pandn %xmm4, %xmm10
446 ; SSE-NEXT: por %xmm1, %xmm10
447 ; SSE-NEXT: movdqa %xmm10, 32(%rcx)
448 ; SSE-NEXT: movdqa %xmm6, 80(%rcx)
449 ; SSE-NEXT: movdqa %xmm0, (%rcx)
450 ; SSE-NEXT: movdqa %xmm3, 16(%rcx)
451 ; SSE-NEXT: movdqa %xmm8, 48(%rcx)
452 ; SSE-NEXT: movdqa %xmm5, 64(%rcx)
455 ; AVX1-ONLY-LABEL: store_i16_stride3_vf16:
456 ; AVX1-ONLY: # %bb.0:
457 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
458 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
459 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,2]
460 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3
461 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm4
462 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7]
463 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
464 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7]
465 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5
466 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6
467 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
468 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1,2],xmm7[3],xmm2[4,5],xmm7[6],xmm2[7]
469 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,2]
470 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7]
471 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4]
472 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
473 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[1,1,2,2]
474 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1,2],xmm8[3],xmm7[4,5],xmm8[6],xmm7[7]
475 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
476 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
477 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm8
478 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[2,2,3,3]
479 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6],xmm10[7]
480 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
481 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
482 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm10
483 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,0,0,0]
484 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7]
485 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
486 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0
487 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,2,3,3]
488 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7]
489 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
490 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1
491 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,0,0,0]
492 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7]
493 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 48(%rcx)
494 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rcx)
495 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, (%rcx)
496 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, 80(%rcx)
497 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, 16(%rcx)
498 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 64(%rcx)
499 ; AVX1-ONLY-NEXT: retq
501 ; AVX2-SLOW-LABEL: store_i16_stride3_vf16:
502 ; AVX2-SLOW: # %bb.0:
503 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm1
504 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm0
505 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2
506 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3
507 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,2,2]
508 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm5
509 ; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm6
510 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[3,3,3,3,4,5,6,7]
511 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
512 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
513 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
514 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
515 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
516 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <5,5,u,6,6,u,7,7>
517 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm4, %ymm4
518 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
519 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3
520 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2]
521 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[3,3,3,3,4,5,6,7]
522 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
523 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7]
524 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
525 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
526 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
527 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
528 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm4, %ymm4
529 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
530 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
531 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,3,3,u,4,4,u,5>
532 ; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm4, %ymm4
533 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
534 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
535 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1
536 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u>
537 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm4, %ymm0
538 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
539 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
540 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
541 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rcx)
542 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%rcx)
543 ; AVX2-SLOW-NEXT: vzeroupper
544 ; AVX2-SLOW-NEXT: retq
546 ; AVX2-FAST-LABEL: store_i16_stride3_vf16:
547 ; AVX2-FAST: # %bb.0:
548 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1
549 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0
550 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2
551 ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm3
552 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
553 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm5
554 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6
555 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm7
556 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2]
557 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
558 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
559 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
560 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
561 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,0,0,u,1,1,u,2>
562 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm5
563 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
564 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2
565 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4
566 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,2]
567 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
568 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
569 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
570 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
571 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <5,5,u,6,6,u,7,7>
572 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm4
573 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
574 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
575 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,3,3,u,4,4,u,5>
576 ; AVX2-FAST-NEXT: vpermd (%rdi), %ymm4, %ymm4
577 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
578 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
579 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1
580 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u>
581 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
582 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
583 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
584 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
585 ; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%rcx)
586 ; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rcx)
587 ; AVX2-FAST-NEXT: vzeroupper
588 ; AVX2-FAST-NEXT: retq
590 ; AVX2-FAST-PERLANE-LABEL: store_i16_stride3_vf16:
591 ; AVX2-FAST-PERLANE: # %bb.0:
592 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1
593 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0
594 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm2
595 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm3
596 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
597 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm5
598 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6
599 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7
600 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2]
601 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
602 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
603 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
604 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
605 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <u,0,0,u,1,1,u,2>
606 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm5, %ymm5
607 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
608 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2
609 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm4
610 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,2]
611 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
612 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
613 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
614 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
615 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <5,5,u,6,6,u,7,7>
616 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm4
617 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
618 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
619 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <u,3,3,u,4,4,u,5>
620 ; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm4, %ymm4
621 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
622 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
623 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1
624 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u>
625 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm0
626 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
627 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
628 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
629 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%rcx)
630 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rcx)
631 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
632 ; AVX2-FAST-PERLANE-NEXT: retq
634 ; AVX512F-LABEL: store_i16_stride3_vf16:
636 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
637 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm2
638 ; AVX512F-NEXT: vmovdqa (%rdx), %ymm0
639 ; AVX512F-NEXT: vmovdqa (%rsi), %xmm3
640 ; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm4
641 ; AVX512F-NEXT: vprold $16, %xmm3, %xmm5
642 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm6
643 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm7
644 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2]
645 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
646 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
647 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,u,u,4,5,6,7,u,u,8,9,10,11]
648 ; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3
649 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[u,u,12,13],zero,zero,ymm1[u,u,14,15],zero,zero,ymm1[u,u,16,17],zero,zero,ymm1[u,u,18,19],zero,zero,ymm1[u,u,20,21],zero,zero
650 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[10,11,u,u],zero,zero,ymm2[12,13,u,u],zero,zero,ymm2[14,15,u,u],zero,zero,ymm2[16,17,u,u],zero,zero,ymm2[18,19,u,u],zero,zero,ymm2[20,21]
651 ; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
652 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
653 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <u,0,0,u,1,1,u,2>
654 ; AVX512F-NEXT: vpermd %ymm0, %ymm2, %ymm2
655 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
656 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
657 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm0[10,11],zero,zero,zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18,19],zero,zero,zero,zero
658 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
659 ; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2
660 ; AVX512F-NEXT: vprold $16, %xmm4, %xmm1
661 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2]
662 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
663 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
664 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
665 ; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
666 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <5,5,u,6,6,u,7,7>
667 ; AVX512F-NEXT: vpermd %ymm0, %ymm3, %ymm0
668 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
669 ; AVX512F-NEXT: vmovdqa %ymm0, 64(%rcx)
670 ; AVX512F-NEXT: vmovdqa64 %zmm2, (%rcx)
671 ; AVX512F-NEXT: vzeroupper
674 ; AVX512BW-LABEL: store_i16_stride3_vf16:
676 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
677 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1
678 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
679 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47]
680 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
681 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26]
682 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
683 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rcx)
684 ; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rcx)
685 ; AVX512BW-NEXT: vzeroupper
686 ; AVX512BW-NEXT: retq
687 %in.vec0 = load <16 x i16>, ptr %in.vecptr0, align 64
688 %in.vec1 = load <16 x i16>, ptr %in.vecptr1, align 64
689 %in.vec2 = load <16 x i16>, ptr %in.vecptr2, align 64
690 %1 = shufflevector <16 x i16> %in.vec0, <16 x i16> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
691 %2 = shufflevector <16 x i16> %in.vec2, <16 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
692 %3 = shufflevector <32 x i16> %1, <32 x i16> %2, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
693 %interleaved.vec = shufflevector <48 x i16> %3, <48 x i16> poison, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
694 store <48 x i16> %interleaved.vec, ptr %out.vec, align 64
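  ret void
}

; vf32: three <32 x i16> inputs become one <96 x i16> (192-byte) store,
; split across the same kinds of shuffle/blend sequences as above.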
698 define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
699 ; SSE-LABEL: store_i16_stride3_vf32:
701 ; SSE-NEXT: movdqa 16(%rdi), %xmm6
702 ; SSE-NEXT: movdqa 32(%rdi), %xmm4
703 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
704 ; SSE-NEXT: movdqa 16(%rsi), %xmm7
705 ; SSE-NEXT: movdqa 32(%rsi), %xmm8
706 ; SSE-NEXT: movdqa 48(%rsi), %xmm11
707 ; SSE-NEXT: movdqa 32(%rdx), %xmm10
708 ; SSE-NEXT: movdqa 48(%rdx), %xmm12
709 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
710 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,2]
711 ; SSE-NEXT: movdqa %xmm0, %xmm9
712 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
713 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
714 ; SSE-NEXT: pand %xmm5, %xmm1
715 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,3,3,3,4,5,6,7]
716 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
717 ; SSE-NEXT: movdqa %xmm5, %xmm3
718 ; SSE-NEXT: pandn %xmm2, %xmm3
719 ; SSE-NEXT: por %xmm1, %xmm3
720 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535]
721 ; SSE-NEXT: pand %xmm2, %xmm3
722 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,2,2]
723 ; SSE-NEXT: movdqa %xmm2, %xmm0
724 ; SSE-NEXT: pandn %xmm1, %xmm0
725 ; SSE-NEXT: por %xmm3, %xmm0
726 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
727 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0]
728 ; SSE-NEXT: movdqa %xmm5, %xmm3
729 ; SSE-NEXT: pandn %xmm1, %xmm3
730 ; SSE-NEXT: movdqa %xmm9, %xmm1
731 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
732 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
733 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
734 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,7,5,4,5]
735 ; SSE-NEXT: pand %xmm5, %xmm9
736 ; SSE-NEXT: por %xmm3, %xmm9
737 ; SSE-NEXT: movdqa %xmm4, %xmm0
738 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
739 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2]
740 ; SSE-NEXT: pand %xmm5, %xmm1
741 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7]
742 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
743 ; SSE-NEXT: movdqa %xmm5, %xmm4
744 ; SSE-NEXT: pandn %xmm3, %xmm4
745 ; SSE-NEXT: por %xmm1, %xmm4
746 ; SSE-NEXT: pand %xmm2, %xmm4
747 ; SSE-NEXT: movdqa %xmm10, %xmm3
748 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
749 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,2]
750 ; SSE-NEXT: movdqa %xmm2, %xmm10
751 ; SSE-NEXT: pandn %xmm1, %xmm10
752 ; SSE-NEXT: por %xmm4, %xmm10
753 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0]
754 ; SSE-NEXT: movdqa %xmm5, %xmm3
755 ; SSE-NEXT: pandn %xmm1, %xmm3
756 ; SSE-NEXT: movdqa %xmm0, %xmm1
757 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
758 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
759 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
760 ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,7,5,4,5]
761 ; SSE-NEXT: pand %xmm5, %xmm13
762 ; SSE-NEXT: por %xmm3, %xmm13
763 ; SSE-NEXT: movdqa %xmm7, %xmm0
764 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
765 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7]
766 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
767 ; SSE-NEXT: movdqa %xmm5, %xmm3
768 ; SSE-NEXT: pandn %xmm1, %xmm3
769 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
770 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2]
771 ; SSE-NEXT: pand %xmm5, %xmm1
772 ; SSE-NEXT: por %xmm1, %xmm3
773 ; SSE-NEXT: movdqa 16(%rdx), %xmm7
774 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2]
775 ; SSE-NEXT: movdqa %xmm2, %xmm14
776 ; SSE-NEXT: pandn %xmm1, %xmm14
777 ; SSE-NEXT: pand %xmm2, %xmm3
778 ; SSE-NEXT: por %xmm3, %xmm14
779 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0]
780 ; SSE-NEXT: movdqa %xmm5, %xmm3
781 ; SSE-NEXT: pandn %xmm1, %xmm3
782 ; SSE-NEXT: movdqa %xmm6, %xmm1
783 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
784 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
785 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
786 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,7,5,4,5]
787 ; SSE-NEXT: pand %xmm5, %xmm15
788 ; SSE-NEXT: por %xmm3, %xmm15
789 ; SSE-NEXT: movdqa (%rsi), %xmm4
790 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7]
791 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
792 ; SSE-NEXT: movdqa %xmm5, %xmm0
793 ; SSE-NEXT: pandn %xmm3, %xmm0
794 ; SSE-NEXT: movdqa (%rdi), %xmm6
795 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2]
796 ; SSE-NEXT: pand %xmm5, %xmm3
797 ; SSE-NEXT: por %xmm3, %xmm0
798 ; SSE-NEXT: pand %xmm2, %xmm0
799 ; SSE-NEXT: movdqa (%rdx), %xmm3
800 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,2,2]
801 ; SSE-NEXT: pandn %xmm12, %xmm2
802 ; SSE-NEXT: por %xmm0, %xmm2
803 ; SSE-NEXT: movdqa %xmm6, %xmm0
804 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
805 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
806 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
807 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,5]
808 ; SSE-NEXT: pand %xmm5, %xmm0
809 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,0,0]
810 ; SSE-NEXT: pandn %xmm12, %xmm5
811 ; SSE-NEXT: por %xmm0, %xmm5
812 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
813 ; SSE-NEXT: # xmm1 = mem[2,2,3,3]
814 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0]
815 ; SSE-NEXT: movdqa %xmm12, %xmm0
816 ; SSE-NEXT: pandn %xmm1, %xmm0
817 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
818 ; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7]
819 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,1,3,3]
820 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[2,1,1,0,4,5,6,7]
821 ; SSE-NEXT: pand %xmm12, %xmm11
822 ; SSE-NEXT: por %xmm0, %xmm11
823 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
824 ; SSE-NEXT: # xmm0 = mem[2,2,3,3]
825 ; SSE-NEXT: movdqa %xmm12, %xmm1
826 ; SSE-NEXT: pandn %xmm0, %xmm1
827 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
828 ; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
829 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,3,3]
830 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,1,0,4,5,6,7]
831 ; SSE-NEXT: pand %xmm12, %xmm0
832 ; SSE-NEXT: por %xmm1, %xmm0
833 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3]
834 ; SSE-NEXT: movdqa %xmm12, %xmm7
835 ; SSE-NEXT: pandn %xmm1, %xmm7
836 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
837 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
838 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
839 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
840 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7]
841 ; SSE-NEXT: pand %xmm12, %xmm1
842 ; SSE-NEXT: por %xmm7, %xmm1
843 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
844 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
845 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3]
846 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,1,0,4,5,6,7]
847 ; SSE-NEXT: pand %xmm12, %xmm4
848 ; SSE-NEXT: pandn %xmm3, %xmm12
849 ; SSE-NEXT: por %xmm4, %xmm12
850 ; SSE-NEXT: movdqa %xmm12, 32(%rcx)
851 ; SSE-NEXT: movdqa %xmm1, 80(%rcx)
852 ; SSE-NEXT: movdqa %xmm0, 128(%rcx)
853 ; SSE-NEXT: movdqa %xmm11, 176(%rcx)
854 ; SSE-NEXT: movdqa %xmm5, (%rcx)
855 ; SSE-NEXT: movdqa %xmm2, 16(%rcx)
856 ; SSE-NEXT: movdqa %xmm15, 48(%rcx)
857 ; SSE-NEXT: movdqa %xmm14, 64(%rcx)
858 ; SSE-NEXT: movdqa %xmm13, 96(%rcx)
859 ; SSE-NEXT: movdqa %xmm10, 112(%rcx)
860 ; SSE-NEXT: movdqa %xmm9, 144(%rcx)
861 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
862 ; SSE-NEXT: movaps %xmm0, 160(%rcx)
865 ; AVX1-ONLY-LABEL: store_i16_stride3_vf32:
866 ; AVX1-ONLY: # %bb.0:
867 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14
868 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
869 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
870 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8
871 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm9
872 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,2,2]
873 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3
874 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm5
875 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10
876 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm12
877 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7]
878 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
879 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7]
880 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6
881 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm13
882 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm0
883 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,2]
884 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm4[1,2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7]
885 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
886 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,1,2,2]
887 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,3,3,3,4,5,6,7]
888 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
889 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
890 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[1,1,2,2]
891 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm4[1,2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7]
892 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
893 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,2]
894 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[3,3,3,3,4,5,6,7]
895 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4]
896 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm11[2],xmm7[3,4],xmm11[5],xmm7[6,7]
897 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[1,1,2,2]
898 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm7[1,2],xmm11[3],xmm7[4,5],xmm11[6],xmm7[7]
899 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
900 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[1,1,2,2]
901 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[3,3,3,3,4,5,6,7]
902 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4]
903 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3,4],xmm14[5],xmm11[6,7]
904 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm15
905 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[1,1,2,2]
906 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm11[1,2],xmm14[3],xmm11[4,5],xmm14[6],xmm11[7]
907 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
908 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3]
909 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
910 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14
911 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,0,0,0]
912 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm4[2],xmm14[3,4],xmm4[5],xmm14[6,7]
913 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
914 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
915 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
916 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,2,3,3]
917 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2,3],xmm11[4],xmm4[5,6],xmm11[7]
918 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
919 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm9
920 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
921 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7]
922 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
923 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm8
924 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,0,0,0]
925 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
926 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
927 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm9
928 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,0,0,0]
929 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7]
930 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
931 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
932 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm10
933 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[2,2,3,3]
934 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6],xmm11[7]
935 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
936 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1
937 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3]
938 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
939 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3]
940 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2
941 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,0,0,0]
942 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7]
943 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rcx)
944 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 80(%rcx)
945 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, 32(%rcx)
946 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, 48(%rcx)
947 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, 96(%rcx)
948 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 176(%rcx)
949 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 128(%rcx)
950 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, 144(%rcx)
951 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
952 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rcx)
953 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
954 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rcx)
955 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
956 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rcx)
957 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
958 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rcx)
959 ; AVX1-ONLY-NEXT: retq
961 ; AVX2-SLOW-LABEL: store_i16_stride3_vf32:
962 ; AVX2-SLOW: # %bb.0:
963 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm0
964 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm4
965 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2
966 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm1
967 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3
968 ; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm5
969 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
970 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
971 ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6
972 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2]
973 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7]
974 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
975 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7]
976 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
977 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <5,5,u,6,6,u,7,7>
978 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm5, %ymm6
979 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
980 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm3
981 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6
982 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm9
983 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10
984 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11
985 ; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm12
986 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13
987 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
988 ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm7
989 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
990 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7]
991 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4]
992 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3,4],xmm12[5],xmm9[6,7]
993 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7
994 ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm5, %ymm5
995 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5
996 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
997 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[3,3,3,3,4,5,6,7]
998 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4]
999 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
1000 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3]
1001 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
1002 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm6
1003 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
1004 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,0,0,u,1,1,u,2>
1005 ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm7, %ymm9
1006 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
1007 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6
1008 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,2]
1009 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[3,3,3,3,4,5,6,7]
1010 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4]
1011 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3,4],xmm12[5],xmm9[6,7]
1012 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3]
1013 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm8
1014 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm8
1015 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm7, %ymm7
1016 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm7, %ymm7
1017 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
1018 ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4
1019 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,3,3,u,4,4,u,5>
1020 ; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm9, %ymm10
1021 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
1022 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4
1023 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,3,3,u,4,4,u>
1024 ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm10, %ymm2
1025 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
1026 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2
1027 ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0
1028 ; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm9, %ymm4
1029 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0
1030 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm10, %ymm1
1031 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0
1032 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rcx)
1033 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rcx)
1034 ; AVX2-SLOW-NEXT: vmovdqa %ymm7, 96(%rcx)
1035 ; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%rcx)
1036 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, 64(%rcx)
1037 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rcx)
1038 ; AVX2-SLOW-NEXT: vzeroupper
1039 ; AVX2-SLOW-NEXT: retq
1041 ; AVX2-FAST-LABEL: store_i16_stride3_vf32:
1042 ; AVX2-FAST: # %bb.0:
1043 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0
1044 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1
1045 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4
1046 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
1047 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm3
1048 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5
1049 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6
1050 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
1051 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7]
1052 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
1053 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
1054 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1
1055 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
1056 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,0,0,u,1,1,u,2>
1057 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm3
1058 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
1059 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1
1060 ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3
1061 ; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm9
1062 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
1063 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
1064 ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10
1065 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm9
1066 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2]
1067 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm9[2],xmm3[3,4],xmm9[5],xmm3[6,7]
1068 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm3, %ymm9
1069 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3
1070 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2]
1071 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm3[2],xmm10[3,4],xmm3[5],xmm10[6,7]
1072 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3
1073 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
1074 ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm6
1075 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4
1076 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm12
1077 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4
1078 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7>
1079 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm5
1080 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4
1081 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm5
1082 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
1083 ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm9, %ymm5, %ymm5
1084 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
1085 ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8
1086 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2
1087 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6
1088 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[1,1,2,2]
1089 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm2[2],xmm9[3,4],xmm2[5],xmm9[6,7]
1090 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2
1091 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm8
1092 ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm2
1093 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
1094 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6
1095 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,3,3,u,4,4,u,5>
1096 ; AVX2-FAST-NEXT: vpermd (%rdi), %ymm8, %ymm9
1097 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
1098 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6
1099 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm9
1100 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm7
1101 ; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm8, %ymm8
1102 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7
1103 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,3,3,u,4,4,u>
1104 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0
1105 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
1106 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0
1107 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3
1108 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3
1109 ; AVX2-FAST-NEXT: vmovdqa %ymm3, 128(%rcx)
1110 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
1111 ; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%rcx)
1112 ; AVX2-FAST-NEXT: vmovdqa %ymm4, 96(%rcx)
1113 ; AVX2-FAST-NEXT: vmovdqa %ymm5, 160(%rcx)
1114 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
1115 ; AVX2-FAST-NEXT: vzeroupper
1116 ; AVX2-FAST-NEXT: retq
1118 ; AVX2-FAST-PERLANE-LABEL: store_i16_stride3_vf32:
1119 ; AVX2-FAST-PERLANE: # %bb.0:
1120 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0
1121 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1
1122 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4
1123 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
1124 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm3
1125 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5
1126 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6
1127 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
1128 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7]
1129 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
1130 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
1131 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1
1132 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
1133 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <u,0,0,u,1,1,u,2>
1134 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm7, %ymm3
1135 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
1136 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1
1137 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3
1138 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm9
1139 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
1140 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
1141 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10
1142 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm9
1143 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2]
1144 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm9[2],xmm3[3,4],xmm9[5],xmm3[6,7]
1145 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm3, %ymm9
1146 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm3
1147 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2]
1148 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm3[2],xmm10[3,4],xmm3[5],xmm10[6,7]
1149 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3
1150 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
1151 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm6
1152 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4
1153 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm12
1154 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4
1155 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7>
1156 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm7, %ymm5
1157 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4
1158 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm10, %ymm5
1159 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
1160 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm9, %ymm5, %ymm5
1161 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
1162 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm8
1163 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2
1164 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6
1165 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[1,1,2,2]
1166 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm2[2],xmm9[3,4],xmm2[5],xmm9[6,7]
1167 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2
1168 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm8
1169 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm2
1170 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
1171 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm6
1172 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <u,3,3,u,4,4,u,5>
1173 ; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm8, %ymm9
1174 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
1175 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6
1176 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9
1177 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm9, %ymm7
1178 ; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm8, %ymm8
1179 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7
1180 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,3,3,u,4,4,u>
1181 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm0
1182 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
1183 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0
1184 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm8, %ymm3
1185 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3
1186 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 128(%rcx)
1187 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
1188 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rcx)
1189 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 96(%rcx)
1190 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 160(%rcx)
1191 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
1192 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
1193 ; AVX2-FAST-PERLANE-NEXT: retq
1195 ; AVX512F-LABEL: store_i16_stride3_vf32:
1196 ; AVX512F: # %bb.0:
1197 ; AVX512F-NEXT: vmovdqa (%rsi), %xmm0
1198 ; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm3
1199 ; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm2
1200 ; AVX512F-NEXT: vprold $16, %xmm2, %xmm4
1201 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
1202 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm5
1203 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm6
1204 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
1205 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
1206 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
1207 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
1208 ; AVX512F-NEXT: vpshufb %xmm2, %xmm6, %xmm6
1209 ; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4
1210 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
1211 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
1212 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
1213 ; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6
1214 ; AVX512F-NEXT: vprold $16, %xmm3, %xmm3
1215 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
1216 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
1217 ; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
1218 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
1219 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10>
1220 ; AVX512F-NEXT: vpermd (%rdx), %zmm4, %zmm4
1221 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
1222 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
1223 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
1224 ; AVX512F-NEXT: vpshufb %ymm5, %ymm3, %ymm3
1225 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm6
1226 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
1227 ; AVX512F-NEXT: vpshufb %ymm8, %ymm6, %ymm6
1228 ; AVX512F-NEXT: vpor %ymm3, %ymm6, %ymm3
1229 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm6
1230 ; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm9
1231 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
1232 ; AVX512F-NEXT: vpshufb %xmm7, %xmm10, %xmm7
1233 ; AVX512F-NEXT: vprold $16, %xmm9, %xmm9
1234 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
1235 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2],xmm6[3,4],xmm9[5],xmm6[6,7]
1236 ; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
1237 ; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6
1238 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm6[4,5,6,7]
1239 ; AVX512F-NEXT: vmovdqa (%rdx), %ymm6
1240 ; AVX512F-NEXT: vmovdqa 32(%rdx), %ymm7
1241 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <5,5,u,6,6,u,7,7>
1242 ; AVX512F-NEXT: vpermd %ymm7, %ymm9, %ymm9
1243 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
1244 ; AVX512F-NEXT: vpandn %ymm9, %ymm10, %ymm9
1245 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
1246 ; AVX512F-NEXT: vpshufb %ymm10, %ymm7, %ymm7
1247 ; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7
1248 ; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
1249 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm3
1250 ; AVX512F-NEXT: vpshufb %ymm5, %ymm3, %ymm3
1251 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm5
1252 ; AVX512F-NEXT: vpshufb %ymm8, %ymm5, %ymm5
1253 ; AVX512F-NEXT: vpor %ymm3, %ymm5, %ymm3
1254 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
1255 ; AVX512F-NEXT: vprold $16, %xmm0, %xmm5
1256 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2]
1257 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
1258 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1259 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1260 ; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
1261 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7]
1262 ; AVX512F-NEXT: vpshufb %ymm10, %ymm6, %ymm1
1263 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <u,0,0,u,1,1,u,2>
1264 ; AVX512F-NEXT: vpermd %ymm6, %ymm2, %ymm2
1265 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
1266 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
1267 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
1268 ; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
1269 ; AVX512F-NEXT: vmovdqa64 %zmm1, (%rcx)
1270 ; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rcx)
1271 ; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rcx)
1272 ; AVX512F-NEXT: vzeroupper
1273 ; AVX512F-NEXT: retq
1275 ; AVX512BW-LABEL: store_i16_stride3_vf32:
1276 ; AVX512BW: # %bb.0:
1277 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1278 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1
1279 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2
1280 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42>
1281 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1282 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31]
1283 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4
1284 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21>
1285 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1286 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31]
1287 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5
1288 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u>
1289 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3
1290 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63]
1291 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0
1292 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rcx)
1293 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rcx)
1294 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rcx)
1295 ; AVX512BW-NEXT: vzeroupper
1296 ; AVX512BW-NEXT: retq
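; The IR below is the reference pattern checked by every prefix above: the three <32 x i16> inputs are concatenated and reshuffled so that elements 0, 32, 64, then 1, 33, 65, and so on become adjacent, i.e. a single stride-3 interleaved <96 x i16> store.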
1297 %in.vec0 = load <32 x i16>, ptr %in.vecptr0, align 64
1298 %in.vec1 = load <32 x i16>, ptr %in.vecptr1, align 64
1299 %in.vec2 = load <32 x i16>, ptr %in.vecptr2, align 64
1300 %1 = shufflevector <32 x i16> %in.vec0, <32 x i16> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1301 %2 = shufflevector <32 x i16> %in.vec2, <32 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1302 %3 = shufflevector <64 x i16> %1, <64 x i16> %2, <96 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
1303 %interleaved.vec = shufflevector <96 x i16> %3, <96 x i16> poison, <96 x i32> <i32 0, i32 32, i32 64, i32 1, i32 33, i32 65, i32 2, i32 34, i32 66, i32 3, i32 35, i32 67, i32 4, i32 36, i32 68, i32 5, i32 37, i32 69, i32 6, i32 38, i32 70, i32 7, i32 39, i32 71, i32 8, i32 40, i32 72, i32 9, i32 41, i32 73, i32 10, i32 42, i32 74, i32 11, i32 43, i32 75, i32 12, i32 44, i32 76, i32 13, i32 45, i32 77, i32 14, i32 46, i32 78, i32 15, i32 47, i32 79, i32 16, i32 48, i32 80, i32 17, i32 49, i32 81, i32 18, i32 50, i32 82, i32 19, i32 51, i32 83, i32 20, i32 52, i32 84, i32 21, i32 53, i32 85, i32 22, i32 54, i32 86, i32 23, i32 55, i32 87, i32 24, i32 56, i32 88, i32 25, i32 57, i32 89, i32 26, i32 58, i32 90, i32 27, i32 59, i32 91, i32 28, i32 60, i32 92, i32 29, i32 61, i32 93, i32 30, i32 62, i32 94, i32 31, i32 63, i32 95>
1304 store <96 x i16> %interleaved.vec, ptr %out.vec, align 64
1305 ret void
1306 }
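; The vf64 variant below applies the same stride-3 interleaving to 64-element inputs, so each prefix writes 384 bytes of interleaved output starting at (%rcx).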
1308 define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind {
1309 ; SSE-LABEL: store_i16_stride3_vf64:
1310 ; SSE: # %bb.0:
1311 ; SSE-NEXT: subq $328, %rsp # imm = 0x148
1312 ; SSE-NEXT: movdqa (%rdi), %xmm3
1313 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1314 ; SSE-NEXT: movdqa 16(%rdi), %xmm8
1315 ; SSE-NEXT: movdqa 32(%rdi), %xmm6
1316 ; SSE-NEXT: movdqa (%rsi), %xmm4
1317 ; SSE-NEXT: movdqa 16(%rsi), %xmm5
1318 ; SSE-NEXT: movdqa (%rdx), %xmm0
1319 ; SSE-NEXT: movdqa 16(%rdx), %xmm9
1320 ; SSE-NEXT: movdqa 32(%rdx), %xmm7
1321 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1322 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
1323 ; SSE-NEXT: movdqa %xmm0, %xmm10
1324 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1325 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535]
1326 ; SSE-NEXT: movdqa %xmm0, %xmm2
1327 ; SSE-NEXT: pandn %xmm1, %xmm2
1328 ; SSE-NEXT: movdqa %xmm3, %xmm1
1329 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1330 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1331 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
1332 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5]
1333 ; SSE-NEXT: pand %xmm0, %xmm1
1334 ; SSE-NEXT: por %xmm2, %xmm1
1335 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1336 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,2]
1337 ; SSE-NEXT: pand %xmm0, %xmm1
1338 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[3,3,3,3,4,5,6,7]
1339 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
1340 ; SSE-NEXT: movdqa %xmm0, %xmm3
1341 ; SSE-NEXT: pandn %xmm2, %xmm3
1342 ; SSE-NEXT: por %xmm1, %xmm3
1343 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,0,65535,65535,0,65535]
1344 ; SSE-NEXT: pand %xmm11, %xmm3
1345 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,2]
1346 ; SSE-NEXT: movdqa %xmm11, %xmm2
1347 ; SSE-NEXT: pandn %xmm1, %xmm2
1348 ; SSE-NEXT: por %xmm3, %xmm2
1349 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1350 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0]
1351 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1352 ; SSE-NEXT: movdqa %xmm0, %xmm2
1353 ; SSE-NEXT: pandn %xmm1, %xmm2
1354 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1355 ; SSE-NEXT: movdqa %xmm8, %xmm1
1356 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
1357 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1358 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
1359 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5]
1360 ; SSE-NEXT: pand %xmm0, %xmm1
1361 ; SSE-NEXT: por %xmm2, %xmm1
1362 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1363 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,2,2]
1364 ; SSE-NEXT: pand %xmm0, %xmm1
1365 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,3,3,3,4,5,6,7]
1366 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
1367 ; SSE-NEXT: movdqa %xmm0, %xmm3
1368 ; SSE-NEXT: pandn %xmm2, %xmm3
1369 ; SSE-NEXT: por %xmm1, %xmm3
1370 ; SSE-NEXT: pand %xmm11, %xmm3
1371 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,2,2]
1372 ; SSE-NEXT: movdqa %xmm11, %xmm2
1373 ; SSE-NEXT: pandn %xmm1, %xmm2
1374 ; SSE-NEXT: por %xmm3, %xmm2
1375 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1376 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0]
1377 ; SSE-NEXT: movdqa %xmm0, %xmm2
1378 ; SSE-NEXT: pandn %xmm1, %xmm2
1379 ; SSE-NEXT: movdqa 32(%rsi), %xmm8
1380 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1381 ; SSE-NEXT: movdqa %xmm6, %xmm1
1382 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
1383 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1384 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1385 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
1386 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5]
1387 ; SSE-NEXT: pand %xmm0, %xmm1
1388 ; SSE-NEXT: por %xmm2, %xmm1
1389 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1390 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7]
1391 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1392 ; SSE-NEXT: movdqa %xmm0, %xmm2
1393 ; SSE-NEXT: pandn %xmm1, %xmm2
1394 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2]
1395 ; SSE-NEXT: pand %xmm0, %xmm1
1396 ; SSE-NEXT: por %xmm1, %xmm2
1397 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2]
1398 ; SSE-NEXT: movdqa %xmm11, %xmm6
1399 ; SSE-NEXT: pandn %xmm1, %xmm6
1400 ; SSE-NEXT: pand %xmm11, %xmm2
1401 ; SSE-NEXT: por %xmm2, %xmm6
1402 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1403 ; SSE-NEXT: movdqa 48(%rdx), %xmm2
1404 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
1405 ; SSE-NEXT: movdqa %xmm2, %xmm7
1406 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
1407 ; SSE-NEXT: movdqa %xmm0, %xmm2
1408 ; SSE-NEXT: pandn %xmm1, %xmm2
1409 ; SSE-NEXT: movdqa 48(%rdi), %xmm6
1410 ; SSE-NEXT: movdqa 48(%rsi), %xmm3
1411 ; SSE-NEXT: movdqa %xmm6, %xmm1
1412 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1413 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
1414 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1415 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
1416 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5]
1417 ; SSE-NEXT: pand %xmm0, %xmm1
1418 ; SSE-NEXT: por %xmm2, %xmm1
1419 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1420 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7]
1421 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1422 ; SSE-NEXT: movdqa %xmm0, %xmm2
1423 ; SSE-NEXT: pandn %xmm1, %xmm2
1424 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2]
1425 ; SSE-NEXT: pand %xmm0, %xmm1
1426 ; SSE-NEXT: por %xmm1, %xmm2
1427 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2]
1428 ; SSE-NEXT: movdqa %xmm11, %xmm6
1429 ; SSE-NEXT: pandn %xmm1, %xmm6
1430 ; SSE-NEXT: pand %xmm11, %xmm2
1431 ; SSE-NEXT: por %xmm2, %xmm6
1432 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1433 ; SSE-NEXT: movdqa 64(%rdx), %xmm2
1434 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
1435 ; SSE-NEXT: movdqa %xmm2, %xmm7
1436 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1437 ; SSE-NEXT: movdqa %xmm0, %xmm2
1438 ; SSE-NEXT: pandn %xmm1, %xmm2
1439 ; SSE-NEXT: movdqa 64(%rdi), %xmm6
1440 ; SSE-NEXT: movdqa 64(%rsi), %xmm14
1441 ; SSE-NEXT: movdqa %xmm6, %xmm1
1442 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1443 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
1444 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1445 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
1446 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5]
1447 ; SSE-NEXT: pand %xmm0, %xmm1
1448 ; SSE-NEXT: por %xmm2, %xmm1
1449 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1450 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[3,3,3,3,4,5,6,7]
1451 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1452 ; SSE-NEXT: movdqa %xmm0, %xmm2
1453 ; SSE-NEXT: pandn %xmm1, %xmm2
1454 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2]
1455 ; SSE-NEXT: pand %xmm0, %xmm1
1456 ; SSE-NEXT: por %xmm1, %xmm2
1457 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2]
1458 ; SSE-NEXT: movdqa %xmm11, %xmm6
1459 ; SSE-NEXT: pandn %xmm1, %xmm6
1460 ; SSE-NEXT: pand %xmm11, %xmm2
1461 ; SSE-NEXT: por %xmm2, %xmm6
1462 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1463 ; SSE-NEXT: movdqa 80(%rdx), %xmm2
1464 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
1465 ; SSE-NEXT: movdqa %xmm2, %xmm7
1466 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1467 ; SSE-NEXT: movdqa %xmm0, %xmm2
1468 ; SSE-NEXT: pandn %xmm1, %xmm2
1469 ; SSE-NEXT: movdqa 80(%rdi), %xmm6
1470 ; SSE-NEXT: movdqa 80(%rsi), %xmm12
1471 ; SSE-NEXT: movdqa %xmm6, %xmm1
1472 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1473 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
1474 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1475 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
1476 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5]
1477 ; SSE-NEXT: pand %xmm0, %xmm1
1478 ; SSE-NEXT: por %xmm2, %xmm1
1479 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1480 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7]
1481 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1482 ; SSE-NEXT: movdqa %xmm0, %xmm2
1483 ; SSE-NEXT: pandn %xmm1, %xmm2
1484 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2]
1485 ; SSE-NEXT: pand %xmm0, %xmm1
1486 ; SSE-NEXT: por %xmm1, %xmm2
1487 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2]
1488 ; SSE-NEXT: movdqa %xmm11, %xmm6
1489 ; SSE-NEXT: pandn %xmm1, %xmm6
1490 ; SSE-NEXT: pand %xmm11, %xmm2
1491 ; SSE-NEXT: por %xmm2, %xmm6
1492 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1493 ; SSE-NEXT: movdqa 96(%rdx), %xmm2
1494 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
1495 ; SSE-NEXT: movdqa %xmm2, %xmm7
1496 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1497 ; SSE-NEXT: movdqa %xmm0, %xmm2
1498 ; SSE-NEXT: pandn %xmm1, %xmm2
1499 ; SSE-NEXT: movdqa 96(%rdi), %xmm6
1500 ; SSE-NEXT: movdqa 96(%rsi), %xmm9
1501 ; SSE-NEXT: movdqa %xmm6, %xmm1
1502 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1503 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
1504 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1505 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
1506 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,7,5,4,5]
1507 ; SSE-NEXT: pand %xmm0, %xmm15
1508 ; SSE-NEXT: por %xmm2, %xmm15
1509 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7]
1510 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1511 ; SSE-NEXT: movdqa %xmm0, %xmm2
1512 ; SSE-NEXT: pandn %xmm1, %xmm2
1513 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2]
1514 ; SSE-NEXT: pand %xmm0, %xmm1
1515 ; SSE-NEXT: por %xmm1, %xmm2
1516 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2]
1517 ; SSE-NEXT: movdqa %xmm11, %xmm13
1518 ; SSE-NEXT: pandn %xmm1, %xmm13
1519 ; SSE-NEXT: pand %xmm11, %xmm2
1520 ; SSE-NEXT: por %xmm2, %xmm13
1521 ; SSE-NEXT: movdqa 112(%rdx), %xmm2
1522 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
1523 ; SSE-NEXT: movdqa %xmm2, %xmm6
1524 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1525 ; SSE-NEXT: movdqa %xmm0, %xmm2
1526 ; SSE-NEXT: pandn %xmm1, %xmm2
1527 ; SSE-NEXT: movdqa 112(%rdi), %xmm8
1528 ; SSE-NEXT: movdqa 112(%rsi), %xmm7
1529 ; SSE-NEXT: movdqa %xmm8, %xmm1
1530 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
1531 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1532 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
1533 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,4,5]
1534 ; SSE-NEXT: pand %xmm0, %xmm10
1535 ; SSE-NEXT: por %xmm2, %xmm10
1536 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,2,2]
1537 ; SSE-NEXT: pand %xmm0, %xmm1
1538 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,3,3,3,4,5,6,7]
1539 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
1540 ; SSE-NEXT: pandn %xmm2, %xmm0
1541 ; SSE-NEXT: por %xmm1, %xmm0
1542 ; SSE-NEXT: pand %xmm11, %xmm0
1543 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2]
1544 ; SSE-NEXT: pandn %xmm1, %xmm11
1545 ; SSE-NEXT: por %xmm0, %xmm11
1546 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1547 ; SSE-NEXT: # xmm1 = mem[2,2,3,3]
1548 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0]
1549 ; SSE-NEXT: movdqa %xmm0, %xmm2
1550 ; SSE-NEXT: pandn %xmm1, %xmm2
1551 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
1552 ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
1553 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,1,3,3]
1554 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,1,0,4,5,6,7]
1555 ; SSE-NEXT: pand %xmm0, %xmm6
1556 ; SSE-NEXT: por %xmm2, %xmm6
1557 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1558 ; SSE-NEXT: # xmm1 = mem[2,2,3,3]
1559 ; SSE-NEXT: movdqa %xmm0, %xmm2
1560 ; SSE-NEXT: pandn %xmm1, %xmm2
1561 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
1562 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
1563 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,1,3,3]
1564 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[2,1,1,0,4,5,6,7]
1565 ; SSE-NEXT: pand %xmm0, %xmm5
1566 ; SSE-NEXT: por %xmm2, %xmm5
1567 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1568 ; SSE-NEXT: # xmm2 = mem[2,2,3,3]
1569 ; SSE-NEXT: movdqa %xmm0, %xmm1
1570 ; SSE-NEXT: pandn %xmm2, %xmm1
1571 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1572 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1573 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
1574 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
1575 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,1,1,0,4,5,6,7]
1576 ; SSE-NEXT: pand %xmm0, %xmm4
1577 ; SSE-NEXT: por %xmm1, %xmm4
1578 ; SSE-NEXT: pshufd $250, (%rsp), %xmm2 # 16-byte Folded Reload
1579 ; SSE-NEXT: # xmm2 = mem[2,2,3,3]
1580 ; SSE-NEXT: movdqa %xmm0, %xmm1
1581 ; SSE-NEXT: pandn %xmm2, %xmm1
1582 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
1583 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
1584 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,3,3]
1585 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,1,0,4,5,6,7]
1586 ; SSE-NEXT: pand %xmm0, %xmm2
1587 ; SSE-NEXT: por %xmm1, %xmm2
1588 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
1589 ; SSE-NEXT: # xmm3 = mem[2,2,3,3]
1590 ; SSE-NEXT: movdqa %xmm0, %xmm1
1591 ; SSE-NEXT: pandn %xmm3, %xmm1
1592 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
1593 ; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
1594 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,1,3,3]
1595 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm3[2,1,1,0,4,5,6,7]
1596 ; SSE-NEXT: pand %xmm0, %xmm14
1597 ; SSE-NEXT: por %xmm1, %xmm14
1598 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1599 ; SSE-NEXT: # xmm1 = mem[2,2,3,3]
1600 ; SSE-NEXT: movdqa %xmm0, %xmm3
1601 ; SSE-NEXT: pandn %xmm1, %xmm3
1602 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
1603 ; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7]
1604 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,3,3]
1605 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7]
1606 ; SSE-NEXT: pand %xmm0, %xmm1
1607 ; SSE-NEXT: por %xmm3, %xmm1
1608 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
1609 ; SSE-NEXT: # xmm3 = mem[2,2,3,3]
1610 ; SSE-NEXT: movdqa %xmm0, %xmm12
1611 ; SSE-NEXT: pandn %xmm3, %xmm12
1612 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
1613 ; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7]
1614 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,3,3]
1615 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,1,0,4,5,6,7]
1616 ; SSE-NEXT: pand %xmm0, %xmm3
1617 ; SSE-NEXT: por %xmm12, %xmm3
1618 ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
1619 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
1620 ; SSE-NEXT: # xmm9 = mem[2,2,3,3]
1621 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3]
1622 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,1,0,4,5,6,7]
1623 ; SSE-NEXT: pand %xmm0, %xmm7
1624 ; SSE-NEXT: pandn %xmm9, %xmm0
1625 ; SSE-NEXT: por %xmm7, %xmm0
1626 ; SSE-NEXT: movdqa %xmm0, 368(%rcx)
1627 ; SSE-NEXT: movdqa %xmm3, 320(%rcx)
1628 ; SSE-NEXT: movdqa %xmm1, 272(%rcx)
1629 ; SSE-NEXT: movdqa %xmm14, 224(%rcx)
1630 ; SSE-NEXT: movdqa %xmm2, 176(%rcx)
1631 ; SSE-NEXT: movdqa %xmm4, 128(%rcx)
1632 ; SSE-NEXT: movdqa %xmm5, 80(%rcx)
1633 ; SSE-NEXT: movdqa %xmm6, 32(%rcx)
1634 ; SSE-NEXT: movdqa %xmm11, 352(%rcx)
1635 ; SSE-NEXT: movdqa %xmm10, 336(%rcx)
1636 ; SSE-NEXT: movdqa %xmm13, 304(%rcx)
1637 ; SSE-NEXT: movdqa %xmm15, 288(%rcx)
1638 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1639 ; SSE-NEXT: movaps %xmm0, 256(%rcx)
1640 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1641 ; SSE-NEXT: movaps %xmm0, 240(%rcx)
1642 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1643 ; SSE-NEXT: movaps %xmm0, 208(%rcx)
1644 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1645 ; SSE-NEXT: movaps %xmm0, 192(%rcx)
1646 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1647 ; SSE-NEXT: movaps %xmm0, 160(%rcx)
1648 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1649 ; SSE-NEXT: movaps %xmm0, 144(%rcx)
1650 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1651 ; SSE-NEXT: movaps %xmm0, 112(%rcx)
1652 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1653 ; SSE-NEXT: movaps %xmm0, 96(%rcx)
1654 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1655 ; SSE-NEXT: movaps %xmm0, 64(%rcx)
1656 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1657 ; SSE-NEXT: movaps %xmm0, 48(%rcx)
1658 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1659 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
1660 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1661 ; SSE-NEXT: movaps %xmm0, (%rcx)
1662 ; SSE-NEXT: addq $328, %rsp # imm = 0x148
1663 ; SSE-NEXT: retq
1665 ; AVX1-ONLY-LABEL: store_i16_stride3_vf64:
1666 ; AVX1-ONLY: # %bb.0:
1667 ; AVX1-ONLY-NEXT: subq $280, %rsp # imm = 0x118
1668 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm6
1669 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,2,2]
1670 ; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm12
1671 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7]
1672 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1673 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
1674 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm15
1675 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[1,1,2,2]
1676 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
1677 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1678 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm7
1679 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2]
1680 ; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm1
1681 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1682 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7]
1683 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1684 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
1685 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm14
1686 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,2,2]
1687 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
1688 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1689 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2
1690 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1691 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0
1692 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1693 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
1694 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm3
1695 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1696 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm1
1697 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1698 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7]
1699 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1700 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
1701 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1
1702 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1703 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
1704 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
1705 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1706 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,2,2]
1707 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7]
1708 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1709 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
1710 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm10
1711 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,1,2,2]
1712 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
1713 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1714 ; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm0
1715 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1716 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
1717 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1718 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm13
1719 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[1,1,2,2]
1720 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
1721 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm11
1722 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,2]
1723 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
1724 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1725 ; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm9
1726 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7]
1727 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1728 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5
1729 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2]
1730 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
1731 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm8
1732 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[1,1,2,2]
1733 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
1734 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1735 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0
1736 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1737 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
1738 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1739 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
1740 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
1741 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
1742 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
1743 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1
1744 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1745 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
1746 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
1747 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1748 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0
1749 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1750 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
1751 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1752 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
1753 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1754 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
1755 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
1756 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1
1757 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1758 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
1759 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
1760 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1761 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
1762 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
1763 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm0
1764 ; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm2
1765 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3]
1766 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3],xmm15[4],xmm0[5,6],xmm15[7]
1767 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1768 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm3
1769 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1770 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
1771 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
1772 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm15
1773 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm0
1774 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
1775 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7]
1776 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1777 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
1778 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14
1779 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,0,0,0]
1780 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7]
1781 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1782 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14
1783 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3]
1784 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7]
1785 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1786 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1787 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1788 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1789 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14
1790 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1791 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,0,0]
1792 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7]
1793 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1794 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1795 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1796 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1797 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14
1798 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[2,2,3,3]
1799 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7]
1800 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1801 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1802 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14
1803 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,2,3,3]
1804 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7]
1805 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1806 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1807 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14
1808 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,0,0]
1809 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm10[2],xmm14[3,4],xmm10[5],xmm14[6,7]
1810 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1811 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1812 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
1813 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm10, %xmm10
1814 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,0,0,0]
1815 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7]
1816 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
1817 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm15
1818 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,2,3,3]
1819 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7]
1820 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
1821 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm1
1822 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm13
1823 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3]
1824 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6],xmm11[7]
1825 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3]
1826 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5
1827 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,0,0,0]
1828 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
1829 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1830 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload
1831 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
1832 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5
1833 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm15
1834 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1835 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,3,3]
1836 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3],xmm9[4],xmm5[5,6],xmm9[7]
1837 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1838 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1839 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1840 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm9
1841 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1842 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,0,0,0]
1843 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm13[2],xmm9[3,4],xmm13[5],xmm9[6,7]
1844 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
1845 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm0
1846 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,0,0,0]
1847 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4],xmm7[5],xmm0[6,7]
1848 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
1849 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm1
1850 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3]
1851 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
1852 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rcx)
1853 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rcx)
1854 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%rcx)
1855 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 80(%rcx)
1856 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 288(%rcx)
1857 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, 368(%rcx)
1858 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, 320(%rcx)
1859 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, 336(%rcx)
1860 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1861 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rcx)
1862 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1863 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rcx)
1864 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1865 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rcx)
1866 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1867 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rcx)
1868 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1869 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rcx)
1870 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, 240(%rcx)
1871 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1872 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rcx)
1873 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1874 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%rcx)
1875 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1876 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rcx)
1877 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1878 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rcx)
1879 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1880 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rcx)
1881 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1882 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rcx)
1883 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1884 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rcx)
1885 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1886 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rcx)
1887 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1888 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rcx)
1889 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1890 ; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rcx)
1891 ; AVX1-ONLY-NEXT: addq $280, %rsp # imm = 0x118
1892 ; AVX1-ONLY-NEXT: retq
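; The AVX1-ONLY lowering above stays in 128-bit registers: each 16-byte chunk
; of the 384-byte interleaved result is assembled with punpck/pshufb and
; pshufd+pblendw steps, using a 280-byte stack frame of spills to hold chunks
; until the final stores cover (%rcx) through 368(%rcx).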
1894 ; AVX2-SLOW-LABEL: store_i16_stride3_vf64:
1895 ; AVX2-SLOW: # %bb.0:
1896 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4
1897 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0
1898 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm3
1899 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm1
1900 ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2
1901 ; AVX2-SLOW-NEXT: vmovdqa 80(%rsi), %xmm5
1902 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
1903 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
1904 ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6
1905 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2]
1906 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7]
1907 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
1908 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7]
1909 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
1910 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <5,5,u,6,6,u,7,7>
1911 ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm8, %ymm5
1912 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
1913 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm5, %ymm2
1914 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1915 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5
1916 ; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm6
1917 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
1918 ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm10, %xmm10
1919 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
1920 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,3,3,3,4,5,6,7]
1921 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
1922 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
1923 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5
1924 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm8, %ymm6
1925 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm6, %ymm2
1926 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1927 ; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6
1928 ; AVX2-SLOW-NEXT: vmovdqa 112(%rsi), %xmm10
1929 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
1930 ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm11
1931 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
1932 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,3,3,3,4,5,6,7]
1933 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4]
1934 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm10[2],xmm6[3,4],xmm10[5],xmm6[6,7]
1935 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm6
1936 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm8, %ymm10
1937 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm2
1938 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1939 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10
1940 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm11
1941 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12
1942 ; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm13
1943 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
1944 ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm7
1945 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2]
1946 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,3,3,3,4,5,6,7]
1947 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4]
1948 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
1949 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm11, %ymm7
1950 ; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm8, %ymm8
1951 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
1952 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,2,2]
1953 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[3,3,3,3,4,5,6,7]
1954 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4]
1955 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
1956 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
1957 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
1958 ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9
1959 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
1960 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <u,0,0,u,1,1,u,2>
1961 ; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm12, %ymm9
1962 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
1963 ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8
1964 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9
1965 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
1966 ; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14
1967 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[3,3,3,3,4,5,6,7]
1968 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4]
1969 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7]
1970 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm15
1971 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
1972 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm14
1973 ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9
1974 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9
1975 ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm10
1976 ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm10, %ymm9
1977 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[3,3,3,3,4,5,6,7]
1978 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4]
1979 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2]
1980 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2],xmm2[3,4],xmm10[5],xmm2[6,7]
1981 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
1982 ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10
1983 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2
1984 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm12, %ymm10
1985 ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm10, %ymm10
1986 ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2
1987 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[3,3,3,3,4,5,6,7]
1988 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4]
1989 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm15
1990 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[1,1,2,2]
1991 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm14[2],xmm5[3,4],xmm14[5],xmm5[6,7]
1992 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3]
1993 ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm2
1994 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm14
1995 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
1996 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm12, %ymm5
1997 ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm5, %ymm11
1998 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
1999 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm5
2000 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <u,3,3,u,4,4,u,5>
2001 ; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm12, %ymm13
2002 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
2003 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm13, %ymm5
2004 ; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm13
2005 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm13, %ymm13
2006 ; AVX2-SLOW-NEXT: vpermd 64(%rdi), %ymm12, %ymm15
2007 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm13
2008 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm15
2009 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm15
2010 ; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm12, %ymm6
2011 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm15, %ymm6, %ymm6
2012 ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm15
2013 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm2
2014 ; AVX2-SLOW-NEXT: vpermd 96(%rdi), %ymm12, %ymm12
2015 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm12, %ymm2
2016 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,3,3,u,4,4,u>
2017 ; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm12, %ymm4
2018 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
2019 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm4
2020 ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm3
2021 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm3, %ymm3
2022 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm12, %ymm0
2023 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0
2024 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm12, %ymm1
2025 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm1
2026 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 320(%rcx)
2027 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rcx)
2028 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 224(%rcx)
2029 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%rcx)
2030 ; AVX2-SLOW-NEXT: vmovdqa %ymm11, 288(%rcx)
2031 ; AVX2-SLOW-NEXT: vmovdqa %ymm10, 96(%rcx)
2032 ; AVX2-SLOW-NEXT: vmovdqa %ymm9, 192(%rcx)
2033 ; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx)
2034 ; AVX2-SLOW-NEXT: vmovdqa %ymm7, 64(%rcx)
2035 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2036 ; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rcx)
2037 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2038 ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rcx)
2039 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2040 ; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rcx)
2041 ; AVX2-SLOW-NEXT: vzeroupper
2042 ; AVX2-SLOW-NEXT: retq
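; AVX2 builds each 32-byte output row by shuffling an rdi/rsi pair in xmm/ymm
; registers and blending in a vpermd-replicated copy of the rdx dwords with
; vpblendvb byte masks. The -SLOW variant above avoids variable shuffles where
; it can, using fixed pshuflw/pshufhw/pshufd steps; the -FAST variant below
; folds the same selection into a single vpshufb with a broadcast control.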
2044 ; AVX2-FAST-LABEL: store_i16_stride3_vf64:
2045 ; AVX2-FAST: # %bb.0:
2046 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2
2047 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm0
2048 ; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm1
2049 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm3
2050 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
2051 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm4
2052 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5
2053 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
2054 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
2055 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
2056 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
2057 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3
2058 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
2059 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,0,0,u,1,1,u,2>
2060 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm4
2061 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
2062 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3
2063 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2064 ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm4
2065 ; AVX2-FAST-NEXT: vmovdqa 80(%rsi), %xmm5
2066 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
2067 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
2068 ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7
2069 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5
2070 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
2071 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
2072 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4
2073 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7>
2074 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm5
2075 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
2076 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3
2077 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2078 ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm5
2079 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm7
2080 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm8
2081 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[1,1,2,2]
2082 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2],xmm15[3,4],xmm7[5],xmm15[6,7]
2083 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
2084 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5
2085 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
2086 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm7
2087 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm3
2088 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2089 ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7
2090 ; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm8
2091 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
2092 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm8
2093 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
2094 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
2095 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm8
2096 ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm15
2097 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7
2098 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5
2099 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm15
2100 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7
2101 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm15
2102 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm0
2103 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2]
2104 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7]
2105 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3]
2106 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3
2107 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
2108 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm13, %ymm3
2109 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm8
2110 ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm0
2111 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm3
2112 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm15
2113 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2]
2114 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
2115 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2116 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0
2117 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2118 ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm15
2119 ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm13, %ymm3
2120 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm12
2121 ; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm0
2122 ; AVX2-FAST-NEXT: vmovdqa 112(%rsi), %xmm3
2123 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2124 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3
2125 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
2126 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6,7]
2127 ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm3
2128 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2129 ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm10, %ymm3
2130 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm13
2131 ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm0
2132 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3
2133 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2134 ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4
2135 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0
2136 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2]
2137 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7]
2138 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm3
2139 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
2140 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm4
2141 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6
2142 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
2143 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3
2144 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,3,3,u,4,4,u,5>
2145 ; AVX2-FAST-NEXT: vpermd (%rdi), %ymm4, %ymm9
2146 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
2147 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3
2148 ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm9
2149 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm9
2150 ; AVX2-FAST-NEXT: vpermd 64(%rdi), %ymm4, %ymm11
2151 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9
2152 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm11
2153 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11
2154 ; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm4, %ymm14
2155 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm11, %ymm14, %ymm11
2156 ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm14
2157 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm0
2158 ; AVX2-FAST-NEXT: vpermd 96(%rdi), %ymm4, %ymm4
2159 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0
2160 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u>
2161 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
2162 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
2163 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2
2164 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
2165 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm1
2166 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm3
2167 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm11, %ymm3, %ymm3
2168 ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm4
2169 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0
2170 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 320(%rcx)
2171 ; AVX2-FAST-NEXT: vmovdqa %ymm3, 128(%rcx)
2172 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 224(%rcx)
2173 ; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rcx)
2174 ; AVX2-FAST-NEXT: vmovdqa %ymm6, 64(%rcx)
2175 ; AVX2-FAST-NEXT: vmovdqa %ymm12, 288(%rcx)
2176 ; AVX2-FAST-NEXT: vmovdqa %ymm13, 352(%rcx)
2177 ; AVX2-FAST-NEXT: vmovdqa %ymm8, 96(%rcx)
2178 ; AVX2-FAST-NEXT: vmovdqa %ymm7, 160(%rcx)
2179 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2180 ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rcx)
2181 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2182 ; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rcx)
2183 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2184 ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx)
2185 ; AVX2-FAST-NEXT: vzeroupper
2186 ; AVX2-FAST-NEXT: retq
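; For this function the AVX2-FAST-PERLANE sequence below is
; instruction-for-instruction identical to the AVX2-FAST sequence above; the
; lowering does not change for this pattern.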
2188 ; AVX2-FAST-PERLANE-LABEL: store_i16_stride3_vf64:
2189 ; AVX2-FAST-PERLANE: # %bb.0:
2190 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2
2191 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm0
2192 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm1
2193 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm3
2194 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
2195 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm4
2196 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5
2197 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
2198 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
2199 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
2200 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
2201 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm3
2202 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
2203 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <u,0,0,u,1,1,u,2>
2204 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm13, %ymm4
2205 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
2206 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3
2207 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2208 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm4
2209 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rsi), %xmm5
2210 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
2211 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
2212 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm7
2213 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5
2214 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
2215 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
2216 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4
2217 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7>
2218 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm10, %ymm5
2219 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
2220 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3
2221 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2222 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm5
2223 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm7
2224 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm8
2225 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[1,1,2,2]
2226 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2],xmm15[3,4],xmm7[5],xmm15[6,7]
2227 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
2228 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm5
2229 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
2230 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm13, %ymm7
2231 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm3
2232 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2233 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7
2234 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm8
2235 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
2236 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm8
2237 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
2238 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
2239 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8
2240 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm15
2241 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7
2242 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm5
2243 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm15
2244 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7
2245 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm15
2246 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm0
2247 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2]
2248 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7]
2249 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3]
2250 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm3
2251 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
2252 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm13, %ymm3
2253 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm8
2254 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm0
2255 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm3
2256 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm15
2257 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2]
2258 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
2259 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2260 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0
2261 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2262 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm15
2263 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm13, %ymm3
2264 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm12
2265 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0
2266 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rsi), %xmm3
2267 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2268 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3
2269 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
2270 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6,7]
2271 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm3
2272 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2273 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm10, %ymm3
2274 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm13
2275 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm0
2276 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3
2277 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2278 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm4
2279 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0
2280 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2]
2281 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7]
2282 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3
2283 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
2284 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm10, %ymm4
2285 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6
2286 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
2287 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm3
2288 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <u,3,3,u,4,4,u,5>
2289 ; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm4, %ymm9
2290 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
2291 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3
2292 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm9
2293 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm9, %ymm9
2294 ; AVX2-FAST-PERLANE-NEXT: vpermd 64(%rdi), %ymm4, %ymm11
2295 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9
2296 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm11
2297 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm11
2298 ; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm4, %ymm14
2299 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm11, %ymm14, %ymm11
2300 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm14
2301 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm14, %ymm0
2302 ; AVX2-FAST-PERLANE-NEXT: vpermd 96(%rdi), %ymm4, %ymm4
2303 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0
2304 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u>
2305 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm2
2306 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
2307 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2
2308 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm4, %ymm1
2309 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm1
2310 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm4, %ymm3
2311 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm11, %ymm3, %ymm3
2312 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm4, %ymm4
2313 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0
2314 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 320(%rcx)
2315 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 128(%rcx)
2316 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%rcx)
2317 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rcx)
2318 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rcx)
2319 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 288(%rcx)
2320 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 352(%rcx)
2321 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 96(%rcx)
2322 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 160(%rcx)
2323 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2324 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rcx)
2325 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2326 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rcx)
2327 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2328 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx)
2329 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
2330 ; AVX2-FAST-PERLANE-NEXT: retq
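; With AVX512F the result is assembled in 64-byte pieces: rdi/rsi words are
; merged with vpshufb/vpor plus vinserti128/vinserti64x4/vshufi64x2, and the
; rdx contribution is brought in via vpermd with patterns such as
; <u,0,0,u,1,1,u,2> and <5,5,u,6,6,u,7,7>, then combined through
; vpandn/vpternlogq bit selects rather than vpblendvb.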
2332 ; AVX512F-LABEL: store_i16_stride3_vf64:
2333 ; AVX512F: # %bb.0:
2334 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
2335 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
2336 ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0
2337 ; AVX512F-NEXT: vmovdqa %ymm1, %ymm7
2338 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
2339 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
2340 ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2341 ; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0
2342 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3
2343 ; AVX512F-NEXT: vmovdqa (%rsi), %xmm5
2344 ; AVX512F-NEXT: vmovdqa64 16(%rsi), %xmm24
2345 ; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm6
2346 ; AVX512F-NEXT: vprold $16, %xmm5, %xmm8
2347 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm9
2348 ; AVX512F-NEXT: vmovdqa64 16(%rdi), %xmm25
2349 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm4
2350 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
2351 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
2352 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
2353 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
2354 ; AVX512F-NEXT: vpshufb %xmm0, %xmm9, %xmm9
2355 ; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
2356 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7]
2357 ; AVX512F-NEXT: vmovdqa (%rdx), %ymm3
2358 ; AVX512F-NEXT: vmovdqa 32(%rdx), %ymm8
2359 ; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm14
2360 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
2361 ; AVX512F-NEXT: vpshufb %ymm9, %ymm3, %ymm11
2362 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = <u,0,0,u,1,1,u,2>
2363 ; AVX512F-NEXT: vpermd %ymm3, %ymm19, %ymm3
2364 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
2365 ; AVX512F-NEXT: vpandnq %ymm3, %ymm16, %ymm3
2366 ; AVX512F-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3
2367 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
2368 ; AVX512F-NEXT: vpternlogq $248, %zmm17, %zmm10, %zmm3
2369 ; AVX512F-NEXT: vmovdqa 96(%rsi), %xmm10
2370 ; AVX512F-NEXT: vprold $16, %xmm10, %xmm11
2371 ; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm12
2372 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2]
2373 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
2374 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
2375 ; AVX512F-NEXT: vpshufb %xmm0, %xmm10, %xmm10
2376 ; AVX512F-NEXT: vmovdqa64 %xmm0, %xmm26
2377 ; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10
2378 ; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10
2379 ; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm12
2380 ; AVX512F-NEXT: vmovdqa 80(%rsi), %xmm13
2381 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
2382 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
2383 ; AVX512F-NEXT: vpshufb %xmm11, %xmm15, %xmm15
2384 ; AVX512F-NEXT: vprold $16, %xmm13, %xmm13
2385 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2]
2386 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7]
2387 ; AVX512F-NEXT: vinserti128 $1, %xmm15, %ymm12, %ymm12
2388 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm12[0,1,2,3],zmm10[4,5,6,7]
2389 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10>
2390 ; AVX512F-NEXT: vpermd 64(%rdx), %zmm20, %zmm10
2391 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
2392 ; AVX512F-NEXT: vpternlogq $184, %zmm15, %zmm21, %zmm10
2393 ; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm15
2394 ; AVX512F-NEXT: vmovdqa %ymm7, %ymm1
2395 ; AVX512F-NEXT: vpshufb %ymm7, %ymm15, %ymm15
2396 ; AVX512F-NEXT: vmovdqa 96(%rsi), %ymm12
2397 ; AVX512F-NEXT: vpshufb %ymm2, %ymm12, %ymm12
2398 ; AVX512F-NEXT: vpor %ymm15, %ymm12, %ymm12
2399 ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm15
2400 ; AVX512F-NEXT: vmovdqa 112(%rsi), %xmm13
2401 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
2402 ; AVX512F-NEXT: vpshufb %xmm11, %xmm5, %xmm5
2403 ; AVX512F-NEXT: vprold $16, %xmm13, %xmm13
2404 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2]
2405 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm13[2],xmm15[3,4],xmm13[5],xmm15[6,7]
2406 ; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm13, %ymm5
2407 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
2408 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm12[0,1,2,3],zmm5[4,5,6,7]
2409 ; AVX512F-NEXT: vmovdqa 96(%rdx), %ymm12
2410 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm23 = <5,5,u,6,6,u,7,7>
2411 ; AVX512F-NEXT: vpermd %ymm12, %ymm23, %ymm15
2412 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
2413 ; AVX512F-NEXT: vpandnq %ymm15, %ymm22, %ymm15
2414 ; AVX512F-NEXT: vpshufb %ymm9, %ymm12, %ymm12
2415 ; AVX512F-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm18
2416 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
2417 ; AVX512F-NEXT: vpternlogq $248, %zmm12, %zmm5, %zmm18
2418 ; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm5
2419 ; AVX512F-NEXT: vpshufb %ymm7, %ymm5, %ymm5
2420 ; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm15
2421 ; AVX512F-NEXT: vpshufb %ymm2, %ymm15, %ymm15
2422 ; AVX512F-NEXT: vpor %ymm5, %ymm15, %ymm5
2423 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
2424 ; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm15
2425 ; AVX512F-NEXT: vprold $16, %xmm15, %xmm0
2426 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm13
2427 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[1,1,2,2]
2428 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
2429 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
2430 ; AVX512F-NEXT: vmovdqa64 %xmm26, %xmm15
2431 ; AVX512F-NEXT: vpshufb %xmm15, %xmm7, %xmm7
2432 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0
2433 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7]
2434 ; AVX512F-NEXT: vpshufb %ymm9, %ymm14, %ymm5
2435 ; AVX512F-NEXT: vpermd %ymm14, %ymm19, %ymm7
2436 ; AVX512F-NEXT: vpandnq %ymm7, %ymm16, %ymm7
2437 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
2438 ; AVX512F-NEXT: vpternlogq $248, %zmm17, %zmm0, %zmm5
2439 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
2440 ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0
2441 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm7
2442 ; AVX512F-NEXT: vpshufb %ymm2, %ymm7, %ymm7
2443 ; AVX512F-NEXT: vpor %ymm0, %ymm7, %ymm0
2444 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm7
2445 ; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm13
2446 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
2447 ; AVX512F-NEXT: vpshufb %xmm11, %xmm14, %xmm14
2448 ; AVX512F-NEXT: vprold $16, %xmm13, %xmm13
2449 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
2450 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm13[2],xmm7[3,4],xmm13[5],xmm7[6,7]
2451 ; AVX512F-NEXT: vinserti128 $1, %xmm14, %ymm7, %ymm7
2452 ; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
2453 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm7[4,5,6,7]
2454 ; AVX512F-NEXT: vpermd %ymm8, %ymm23, %ymm7
2455 ; AVX512F-NEXT: vpandnq %ymm7, %ymm22, %ymm7
2456 ; AVX512F-NEXT: vpshufb %ymm9, %ymm8, %ymm8
2457 ; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
2458 ; AVX512F-NEXT: vpternlogq $248, %zmm12, %zmm0, %zmm7
2459 ; AVX512F-NEXT: vprold $16, %xmm6, %xmm0
2460 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[1,1,2,2]
2461 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm0[2],xmm8[3,4],xmm0[5],xmm8[6,7]
2462 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
2463 ; AVX512F-NEXT: vpshufb %xmm15, %xmm2, %xmm2
2464 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
2465 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
2466 ; AVX512F-NEXT: vmovdqa64 %xmm24, %xmm1
2467 ; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm6
2468 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
2469 ; AVX512F-NEXT: vpshufb %xmm11, %xmm2, %xmm2
2470 ; AVX512F-NEXT: vprold $16, %xmm24, %xmm4
2471 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm25[1,1,2,2]
2472 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
2473 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2474 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
2475 ; AVX512F-NEXT: vpermd (%rdx), %zmm20, %zmm1
2476 ; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm21, %zmm1
2477 ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rcx)
2478 ; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rcx)
2479 ; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rcx)
2480 ; AVX512F-NEXT: vmovdqa64 %zmm18, 320(%rcx)
2481 ; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%rcx)
2482 ; AVX512F-NEXT: vmovdqa64 %zmm3, (%rcx)
2483 ; AVX512F-NEXT: vzeroupper
2484 ; AVX512F-NEXT: retq
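; AVX512BW expresses the whole interleave as word-granularity cross-lane
; permutes: for each 64-byte output row, one vpermt2w/vpermi2w with an index
; vector such as <0,32,u,1,33,u,...> interleaves rdi and rsi words, and a
; second vpermt2w such as [0,1,32,3,4,33,...] fills the remaining slots from
; rdx, with no byte blends needed.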
2486 ; AVX512BW-LABEL: store_i16_stride3_vf64:
2487 ; AVX512BW: # %bb.0:
2488 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2489 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
2490 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2
2491 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3
2492 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4
2493 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5
2494 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42>
2495 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7
2496 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm7
2497 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31]
2498 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm7
2499 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u>
2500 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
2501 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm9, %zmm10
2502 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63]
2503 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm10
2504 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21>
2505 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13
2506 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm12, %zmm13
2507 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31]
2508 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm13
2509 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1
2510 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm1
2511 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm9
2512 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm11, %zmm9
2513 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm12, %zmm0
2514 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm0
2515 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rcx)
2516 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rcx)
2517 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rcx)
2518 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rcx)
2519 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%rcx)
2520 ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx)
2521 ; AVX512BW-NEXT: vzeroupper
2522 ; AVX512BW-NEXT: retq
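; The generic IR below concatenates the three <64 x i16> inputs with
; shufflevectors and then applies a final shufflevector whose mask is
; <0, 64, 128, 1, 65, 129, ...>, producing the stride-3 interleaved
; <192 x i16> value stored to %out.vec.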
2523 %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64
2524 %in.vec1 = load <64 x i16>, ptr %in.vecptr1, align 64
2525 %in.vec2 = load <64 x i16>, ptr %in.vecptr2, align 64
2526 %1 = shufflevector <64 x i16> %in.vec0, <64 x i16> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
2527 %2 = shufflevector <64 x i16> %in.vec2, <64 x i16> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2528 %3 = shufflevector <128 x i16> %1, <128 x i16> %2, <192 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191>
2529 %interleaved.vec = shufflevector <192 x i16> %3, <192 x i16> poison, <192 x i32> <i32 0, i32 64, i32 128, i32 1, i32 65, i32 129, i32 2, i32 66, i32 130, i32 3, i32 67, i32 131, i32 4, i32 68, i32 132, i32 5, i32 69, i32 133, i32 6, i32 70, i32 134, i32 7, i32 71, i32 135, i32 8, i32 72, i32 136, i32 9, i32 73, i32 137, i32 10, i32 74, i32 138, i32 11, i32 75, i32 139, i32 12, i32 76, i32 140, i32 13, i32 77, i32 141, i32 14, i32 78, i32 142, i32 15, i32 79, i32 143, i32 16, i32 80, i32 144, i32 17, i32 81, i32 145, i32 18, i32 82, i32 146, i32 19, i32 83, i32 147, i32 20, i32 84, i32 148, i32 21, i32 85, i32 149, i32 22, i32 86, i32 150, i32 23, i32 87, i32 151, i32 24, i32 88, i32 152, i32 25, i32 89, i32 153, i32 26, i32 90, i32 154, i32 27, i32 91, i32 155, i32 28, i32 92, i32 156, i32 29, i32 93, i32 157, i32 30, i32 94, i32 158, i32 31, i32 95, i32 159, i32 32, i32 96, i32 160, i32 33, i32 97, i32 161, i32 34, i32 98, i32 162, i32 35, i32 99, i32 163, i32 36, i32 100, i32 164, i32 37, i32 101, i32 165, i32 38, i32 102, i32 166, i32 39, i32 103, i32 167, i32 40, i32 104, i32 168, i32 41, i32 105, i32 169, i32 42, i32 106, i32 170, i32 43, i32 107, i32 171, i32 44, i32 108, i32 172, i32 45, i32 109, i32 173, i32 46, i32 110, i32 174, i32 47, i32 111, i32 175, i32 48, i32 112, i32 176, i32 49, i32 113, i32 177, i32 50, i32 114, i32 178, i32 51, i32 115, i32 179, i32 52, i32 116, i32 180, i32 53, i32 117, i32 181, i32 54, i32 118, i32 182, i32 55, i32 119, i32 183, i32 56, i32 120, i32 184, i32 57, i32 121, i32 185, i32 58, i32 122, i32 186, i32 59, i32 123, i32 187, i32 60, i32 124, i32 188, i32 61, i32 125, i32 189, i32 62, i32 126, i32 190, i32 63, i32 127, i32 191>
2530 store <192 x i16> %interleaved.vec, ptr %out.vec, align 64
2531 ret void
2532 }
2533 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
2537 ; AVX512-FAST: {{.*}}
2538 ; AVX512-SLOW: {{.*}}
2539 ; AVX512BW-FAST: {{.*}}
2540 ; AVX512BW-ONLY-FAST: {{.*}}
2541 ; AVX512BW-ONLY-SLOW: {{.*}}
2542 ; AVX512BW-SLOW: {{.*}}
2543 ; AVX512DQ-FAST: {{.*}}
2544 ; AVX512DQ-SLOW: {{.*}}
2545 ; AVX512DQBW-FAST: {{.*}}
2546 ; AVX512DQBW-SLOW: {{.*}}
2547 ; AVX512F-ONLY-FAST: {{.*}}
2548 ; AVX512F-ONLY-SLOW: {{.*}}
2549 ; FALLBACK0: {{.*}}
2550 ; FALLBACK1: {{.*}}
2551 ; FALLBACK10: {{.*}}
2552 ; FALLBACK11: {{.*}}
2553 ; FALLBACK12: {{.*}}