1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,FALLBACK0
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-ONLY,FALLBACK1
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
16 ; These patterns are produced by LoopVectorizer for interleaved stores.
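; As a point of reference, the interleaving below is what the LoopVectorizer
; emits for a scalar loop that writes four input streams into one output
; array with stride 4. A minimal C-style sketch of such a loop (illustrative
; only; the function and parameter names are hypothetical and not part of
; this test):
;
;   void store_i32_stride4(const int *in0, const int *in1, const int *in2,
;                          const int *in3, int *out, int n) {
;     for (int i = 0; i < n; ++i) {
;       out[4 * i + 0] = in0[i];
;       out[4 * i + 1] = in1[i];
;       out[4 * i + 2] = in2[i];
;       out[4 * i + 3] = in3[i];
;     }
;   }
;
; After vectorization with factor vf, the four <vf x i32> loads are
; concatenated and reordered by a single shufflevector whose mask picks
; elements 0, vf, 2*vf, 3*vf, 1, vf+1, 2*vf+1, 3*vf+1, ... before one wide
; store; that is the %interleaved.vec pattern each function below exercises.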
18 define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
19 ; SSE-LABEL: store_i32_stride4_vf2:
20 ; SSE: # %bb.0:
21 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
22 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
23 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
24 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
25 ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
26 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
27 ; SSE-NEXT: movaps %xmm0, %xmm2
28 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
29 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
30 ; SSE-NEXT: movaps %xmm1, 16(%r8)
31 ; SSE-NEXT: movaps %xmm2, (%r8)
32 ; SSE-NEXT: retq
33 ;
34 ; AVX1-ONLY-LABEL: store_i32_stride4_vf2:
35 ; AVX1-ONLY: # %bb.0:
36 ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
37 ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
38 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
39 ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
40 ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
41 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
42 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
43 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
44 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3,0,2,5,7,4,6]
45 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,1,3,4,6,5,7]
46 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
47 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8)
48 ; AVX1-ONLY-NEXT: vzeroupper
49 ; AVX1-ONLY-NEXT: retq
51 ; AVX2-SLOW-LABEL: store_i32_stride4_vf2:
52 ; AVX2-SLOW: # %bb.0:
53 ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
54 ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
55 ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
56 ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
57 ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
58 ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
59 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
60 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
61 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
62 ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r8)
63 ; AVX2-SLOW-NEXT: vzeroupper
64 ; AVX2-SLOW-NEXT: retq
66 ; AVX2-FAST-LABEL: store_i32_stride4_vf2:
67 ; AVX2-FAST: # %bb.0:
68 ; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
69 ; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
70 ; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
71 ; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
72 ; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
73 ; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
74 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
75 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
76 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
77 ; AVX2-FAST-NEXT: vmovaps %ymm0, (%r8)
78 ; AVX2-FAST-NEXT: vzeroupper
79 ; AVX2-FAST-NEXT: retq
81 ; AVX2-FAST-PERLANE-LABEL: store_i32_stride4_vf2:
82 ; AVX2-FAST-PERLANE: # %bb.0:
83 ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
84 ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
85 ; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
86 ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
87 ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
88 ; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
89 ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
90 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
91 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
92 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r8)
93 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
94 ; AVX2-FAST-PERLANE-NEXT: retq
96 ; AVX512-SLOW-LABEL: store_i32_stride4_vf2:
97 ; AVX512-SLOW: # %bb.0:
98 ; AVX512-SLOW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
99 ; AVX512-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
100 ; AVX512-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
101 ; AVX512-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
102 ; AVX512-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
103 ; AVX512-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
104 ; AVX512-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
105 ; AVX512-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
106 ; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
107 ; AVX512-SLOW-NEXT: vmovaps %ymm0, (%r8)
108 ; AVX512-SLOW-NEXT: vzeroupper
109 ; AVX512-SLOW-NEXT: retq
111 ; AVX512-FAST-LABEL: store_i32_stride4_vf2:
112 ; AVX512-FAST: # %bb.0:
113 ; AVX512-FAST-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
114 ; AVX512-FAST-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
115 ; AVX512-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
116 ; AVX512-FAST-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
117 ; AVX512-FAST-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
118 ; AVX512-FAST-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
119 ; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
120 ; AVX512-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
121 ; AVX512-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
122 ; AVX512-FAST-NEXT: vmovaps %ymm0, (%r8)
123 ; AVX512-FAST-NEXT: vzeroupper
124 ; AVX512-FAST-NEXT: retq
125 %in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64
126 %in.vec1 = load <2 x i32>, ptr %in.vecptr1, align 64
127 %in.vec2 = load <2 x i32>, ptr %in.vecptr2, align 64
128 %in.vec3 = load <2 x i32>, ptr %in.vecptr3, align 64
129 %1 = shufflevector <2 x i32> %in.vec0, <2 x i32> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
130 %2 = shufflevector <2 x i32> %in.vec2, <2 x i32> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
131 %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
132 %interleaved.vec = shufflevector <8 x i32> %3, <8 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
133 store <8 x i32> %interleaved.vec, ptr %out.vec, align 64
134 ret void
135 }
137 define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
138 ; SSE-LABEL: store_i32_stride4_vf4:
139 ; SSE: # %bb.0:
140 ; SSE-NEXT: movaps (%rdi), %xmm0
141 ; SSE-NEXT: movaps (%rsi), %xmm1
142 ; SSE-NEXT: movaps (%rdx), %xmm2
143 ; SSE-NEXT: movaps (%rcx), %xmm3
144 ; SSE-NEXT: movaps %xmm2, %xmm4
145 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
146 ; SSE-NEXT: movaps %xmm0, %xmm5
147 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
148 ; SSE-NEXT: movaps %xmm5, %xmm6
149 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0]
150 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1]
151 ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
152 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
153 ; SSE-NEXT: movaps %xmm0, %xmm1
154 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
155 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
156 ; SSE-NEXT: movaps %xmm0, 32(%r8)
157 ; SSE-NEXT: movaps %xmm1, 48(%r8)
158 ; SSE-NEXT: movaps %xmm5, 16(%r8)
159 ; SSE-NEXT: movaps %xmm6, (%r8)
160 ; SSE-NEXT: retq
161 ;
162 ; AVX1-ONLY-LABEL: store_i32_stride4_vf4:
163 ; AVX1-ONLY: # %bb.0:
164 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0
165 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1
166 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2
167 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm3
168 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4
169 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm5
170 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
171 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,1,1,0,4,5,5,4]
172 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} ymm6 = ymm5[0,0,2,2]
173 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7]
174 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
175 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0,2,3,5,4,6,7]
176 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7]
177 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
178 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,2,4,5,7,6]
179 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7]
180 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,2,3,7,6,6,7]
181 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[1,0,3,2]
182 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7]
183 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
184 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8)
185 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8)
186 ; AVX1-ONLY-NEXT: vzeroupper
187 ; AVX1-ONLY-NEXT: retq
189 ; AVX2-ONLY-LABEL: store_i32_stride4_vf4:
190 ; AVX2-ONLY: # %bb.0:
191 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0
192 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm1
193 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
194 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
195 ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,0,4,u,u,1,5>
196 ; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm2, %ymm2
197 ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,u,1,5,u,u>
198 ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm3
199 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
200 ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,2,6,u,u,3,7>
201 ; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm1
202 ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = <2,6,u,u,3,7,u,u>
203 ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm0
204 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
205 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8)
206 ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8)
207 ; AVX2-ONLY-NEXT: vzeroupper
208 ; AVX2-ONLY-NEXT: retq
210 ; AVX512-LABEL: store_i32_stride4_vf4:
211 ; AVX512: # %bb.0:
212 ; AVX512-NEXT: vmovaps (%rdi), %xmm0
213 ; AVX512-NEXT: vmovaps (%rdx), %xmm1
214 ; AVX512-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1
215 ; AVX512-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
216 ; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
217 ; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
218 ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0
219 ; AVX512-NEXT: vmovaps %zmm0, (%r8)
220 ; AVX512-NEXT: vzeroupper
221 ; AVX512-NEXT: retq
222 %in.vec0 = load <4 x i32>, ptr %in.vecptr0, align 64
223 %in.vec1 = load <4 x i32>, ptr %in.vecptr1, align 64
224 %in.vec2 = load <4 x i32>, ptr %in.vecptr2, align 64
225 %in.vec3 = load <4 x i32>, ptr %in.vecptr3, align 64
226 %1 = shufflevector <4 x i32> %in.vec0, <4 x i32> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
227 %2 = shufflevector <4 x i32> %in.vec2, <4 x i32> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
228 %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
229 %interleaved.vec = shufflevector <16 x i32> %3, <16 x i32> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
230 store <16 x i32> %interleaved.vec, ptr %out.vec, align 64
231 ret void
232 }
234 define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
235 ; SSE-LABEL: store_i32_stride4_vf8:
236 ; SSE: # %bb.0:
237 ; SSE-NEXT: movaps (%rdi), %xmm0
238 ; SSE-NEXT: movaps 16(%rdi), %xmm1
239 ; SSE-NEXT: movaps (%rsi), %xmm5
240 ; SSE-NEXT: movaps 16(%rsi), %xmm6
241 ; SSE-NEXT: movaps (%rdx), %xmm7
242 ; SSE-NEXT: movaps 16(%rdx), %xmm4
243 ; SSE-NEXT: movaps (%rcx), %xmm8
244 ; SSE-NEXT: movaps 16(%rcx), %xmm9
245 ; SSE-NEXT: movaps %xmm7, %xmm10
246 ; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
247 ; SSE-NEXT: movaps %xmm0, %xmm2
248 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
249 ; SSE-NEXT: movaps %xmm2, %xmm3
250 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1]
251 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0]
252 ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
253 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
254 ; SSE-NEXT: movaps %xmm0, %xmm5
255 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1]
256 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0]
257 ; SSE-NEXT: movaps %xmm4, %xmm7
258 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
259 ; SSE-NEXT: movaps %xmm1, %xmm8
260 ; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
261 ; SSE-NEXT: movaps %xmm8, %xmm10
262 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1]
263 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0]
264 ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3]
265 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
266 ; SSE-NEXT: movaps %xmm1, %xmm6
267 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1]
268 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
269 ; SSE-NEXT: movaps %xmm1, 96(%r8)
270 ; SSE-NEXT: movaps %xmm6, 112(%r8)
271 ; SSE-NEXT: movaps %xmm8, 64(%r8)
272 ; SSE-NEXT: movaps %xmm10, 80(%r8)
273 ; SSE-NEXT: movaps %xmm0, 32(%r8)
274 ; SSE-NEXT: movaps %xmm5, 48(%r8)
275 ; SSE-NEXT: movaps %xmm2, (%r8)
276 ; SSE-NEXT: movaps %xmm3, 16(%r8)
277 ; SSE-NEXT: retq
278 ;
279 ; AVX1-ONLY-LABEL: store_i32_stride4_vf8:
280 ; AVX1-ONLY: # %bb.0:
281 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2
282 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0
283 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4
284 ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm1
285 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm2[1],xmm4[1],zero,zero
286 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
287 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
288 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm5
289 ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm6
290 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm7
291 ; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm8
292 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
293 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm5[0],xmm7[0]
294 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,0]
295 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
296 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7]
297 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm0[1],xmm1[1],zero,zero
298 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
299 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
300 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
301 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm6[0],xmm8[0]
302 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,0]
303 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10
304 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7]
305 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
306 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm7[2],xmm5[2]
307 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5
308 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
309 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,0],xmm2[3,0]
310 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
311 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2
312 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7]
313 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm6[2],xmm8[3],xmm6[3]
314 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm8[2],xmm6[2]
315 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
316 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
317 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0]
318 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
319 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
320 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
321 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8)
322 ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8)
323 ; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%r8)
324 ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8)
325 ; AVX1-ONLY-NEXT: vzeroupper
326 ; AVX1-ONLY-NEXT: retq
328 ; AVX2-ONLY-LABEL: store_i32_stride4_vf8:
329 ; AVX2-ONLY: # %bb.0:
330 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0
331 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1
332 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm2
333 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm3
334 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm4
335 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm5
336 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
337 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
338 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm7
339 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8
340 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
341 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3]
342 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
343 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
344 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
345 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
346 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
347 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
348 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
349 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,2,3]
350 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
351 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3]
352 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
353 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
354 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
355 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
356 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
357 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
358 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8)
359 ; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r8)
360 ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8)
361 ; AVX2-ONLY-NEXT: vmovaps %ymm6, (%r8)
362 ; AVX2-ONLY-NEXT: vzeroupper
363 ; AVX2-ONLY-NEXT: retq
365 ; AVX512-LABEL: store_i32_stride4_vf8:
366 ; AVX512: # %bb.0:
367 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
368 ; AVX512-NEXT: vmovdqa (%rdx), %ymm1
369 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
370 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
371 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27]
372 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
373 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31]
374 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
375 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%r8)
376 ; AVX512-NEXT: vmovdqa64 %zmm2, (%r8)
377 ; AVX512-NEXT: vzeroupper
378 ; AVX512-NEXT: retq
379 %in.vec0 = load <8 x i32>, ptr %in.vecptr0, align 64
380 %in.vec1 = load <8 x i32>, ptr %in.vecptr1, align 64
381 %in.vec2 = load <8 x i32>, ptr %in.vecptr2, align 64
382 %in.vec3 = load <8 x i32>, ptr %in.vecptr3, align 64
383 %1 = shufflevector <8 x i32> %in.vec0, <8 x i32> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
384 %2 = shufflevector <8 x i32> %in.vec2, <8 x i32> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
385 %3 = shufflevector <16 x i32> %1, <16 x i32> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
386 %interleaved.vec = shufflevector <32 x i32> %3, <32 x i32> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
387 store <32 x i32> %interleaved.vec, ptr %out.vec, align 64
388 ret void
389 }
391 define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
392 ; SSE-LABEL: store_i32_stride4_vf16:
393 ; SSE: # %bb.0:
394 ; SSE-NEXT: movaps (%rdi), %xmm5
395 ; SSE-NEXT: movaps 16(%rdi), %xmm11
396 ; SSE-NEXT: movaps 32(%rdi), %xmm4
397 ; SSE-NEXT: movaps 48(%rdi), %xmm2
398 ; SSE-NEXT: movaps (%rsi), %xmm0
399 ; SSE-NEXT: movaps 16(%rsi), %xmm3
400 ; SSE-NEXT: movaps 32(%rsi), %xmm9
401 ; SSE-NEXT: movaps (%rdx), %xmm7
402 ; SSE-NEXT: movaps 16(%rdx), %xmm13
403 ; SSE-NEXT: movaps 32(%rdx), %xmm10
404 ; SSE-NEXT: movaps (%rcx), %xmm8
405 ; SSE-NEXT: movaps 16(%rcx), %xmm14
406 ; SSE-NEXT: movaps 32(%rcx), %xmm12
407 ; SSE-NEXT: movaps %xmm7, %xmm15
408 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1]
409 ; SSE-NEXT: movaps %xmm5, %xmm6
410 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
411 ; SSE-NEXT: movaps %xmm6, %xmm1
412 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1]
413 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
414 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0]
415 ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
416 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
417 ; SSE-NEXT: movaps %xmm5, %xmm0
418 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1]
419 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
420 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0]
421 ; SSE-NEXT: movaps %xmm13, %xmm15
422 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
423 ; SSE-NEXT: movaps %xmm11, %xmm7
424 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
425 ; SSE-NEXT: movaps %xmm7, %xmm0
426 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1]
427 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
428 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm15[0]
429 ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3]
430 ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm3[2],xmm11[3],xmm3[3]
431 ; SSE-NEXT: movaps %xmm11, %xmm8
432 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1]
433 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0]
434 ; SSE-NEXT: movaps %xmm10, %xmm15
435 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1]
436 ; SSE-NEXT: movaps %xmm4, %xmm13
437 ; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
438 ; SSE-NEXT: movaps %xmm13, %xmm14
439 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1]
440 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0]
441 ; SSE-NEXT: movaps 48(%rdx), %xmm15
442 ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm12[2],xmm10[3],xmm12[3]
443 ; SSE-NEXT: movaps 48(%rcx), %xmm12
444 ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3]
445 ; SSE-NEXT: movaps %xmm4, %xmm9
446 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1]
447 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm10[0]
448 ; SSE-NEXT: movaps %xmm15, %xmm10
449 ; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1]
450 ; SSE-NEXT: movaps 48(%rsi), %xmm1
451 ; SSE-NEXT: movaps %xmm2, %xmm3
452 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
453 ; SSE-NEXT: movaps %xmm3, %xmm0
454 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1]
455 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0]
456 ; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm12[2],xmm15[3],xmm12[3]
457 ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
458 ; SSE-NEXT: movaps %xmm2, %xmm1
459 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1]
460 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm15[0]
461 ; SSE-NEXT: movaps %xmm2, 224(%r8)
462 ; SSE-NEXT: movaps %xmm1, 240(%r8)
463 ; SSE-NEXT: movaps %xmm3, 192(%r8)
464 ; SSE-NEXT: movaps %xmm0, 208(%r8)
465 ; SSE-NEXT: movaps %xmm4, 160(%r8)
466 ; SSE-NEXT: movaps %xmm9, 176(%r8)
467 ; SSE-NEXT: movaps %xmm13, 128(%r8)
468 ; SSE-NEXT: movaps %xmm14, 144(%r8)
469 ; SSE-NEXT: movaps %xmm11, 96(%r8)
470 ; SSE-NEXT: movaps %xmm8, 112(%r8)
471 ; SSE-NEXT: movaps %xmm7, 64(%r8)
472 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
473 ; SSE-NEXT: movaps %xmm0, 80(%r8)
474 ; SSE-NEXT: movaps %xmm5, 32(%r8)
475 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
476 ; SSE-NEXT: movaps %xmm0, 48(%r8)
477 ; SSE-NEXT: movaps %xmm6, (%r8)
478 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
479 ; SSE-NEXT: movaps %xmm0, 16(%r8)
480 ; SSE-NEXT: retq
481 ;
482 ; AVX1-ONLY-LABEL: store_i32_stride4_vf16:
483 ; AVX1-ONLY: # %bb.0:
484 ; AVX1-ONLY-NEXT: subq $24, %rsp
485 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7
486 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3
487 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm10
488 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9
489 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4
490 ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm5
491 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[1],xmm9[1],zero,zero
492 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
493 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
494 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm14
495 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6
496 ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm11
497 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2
498 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8
499 ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm12
500 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm14[0],xmm2[1],xmm14[1]
501 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm14[0],xmm2[0]
502 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0]
503 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1
504 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
505 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
506 ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
507 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm10[1],xmm5[1],zero,zero
508 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
509 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
510 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
511 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm12[0]
512 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0]
513 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1
514 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
515 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
516 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
517 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
518 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[1],xmm4[1],zero,zero
519 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
520 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
521 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm8[0]
522 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
523 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
524 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1
525 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
526 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
527 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13
528 ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm15
529 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm13[1],xmm15[1],zero,zero
530 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
531 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3
532 ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1
533 ; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm0
534 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm1[0],xmm0[0]
535 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,0]
536 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
537 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm10, %ymm4
538 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
539 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm14[2],xmm2[3],xmm14[3]
540 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm2[2],xmm14[2]
541 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
542 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
543 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[3,0],xmm7[3,0]
544 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3]
545 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
546 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
547 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm11[2],xmm12[3],xmm11[3]
548 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm12[2],xmm11[2]
549 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
550 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
551 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
552 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm7[3,0]
553 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
554 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
555 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
556 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm6[2],xmm8[3],xmm6[3]
557 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm8[2],xmm6[2]
558 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
559 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
560 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
561 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
562 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,0],xmm7[3,0]
563 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3]
564 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
565 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
566 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
567 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[2],xmm1[2]
568 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
569 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm15[2],xmm13[3],xmm15[3]
570 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm15[3,0],xmm13[3,0]
571 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
572 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
573 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
574 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8)
575 ; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%r8)
576 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%r8)
577 ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8)
578 ; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%r8)
579 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
580 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8)
581 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
582 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8)
583 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
584 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8)
585 ; AVX1-ONLY-NEXT: addq $24, %rsp
586 ; AVX1-ONLY-NEXT: vzeroupper
587 ; AVX1-ONLY-NEXT: retq
589 ; AVX2-ONLY-LABEL: store_i32_stride4_vf16:
590 ; AVX2-ONLY: # %bb.0:
591 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0
592 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2
593 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1
594 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm4
595 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm5
596 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm7
597 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm6
598 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm8
599 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
600 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
601 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9
602 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm10
603 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
604 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3]
605 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7]
606 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm11
607 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
608 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12
609 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1]
610 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
611 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3]
612 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
613 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
614 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1]
615 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
616 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3]
617 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
618 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm9
619 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
620 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm8
621 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1]
622 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm11[2],xmm12[3],xmm11[3]
623 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3]
624 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7]
625 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
626 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3]
627 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5]
628 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3]
629 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7]
630 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm11
631 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7]
632 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm9
633 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,2,3]
634 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7]
635 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
636 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7]
637 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[4],ymm9[4],ymm11[5],ymm9[5]
638 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,2,3]
639 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
640 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3]
641 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7]
642 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[6],ymm9[6],ymm11[7],ymm9[7]
643 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
644 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,2,2,3]
645 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
646 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
647 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8)
648 ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r8)
649 ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8)
650 ; AVX2-ONLY-NEXT: vmovaps %ymm10, 192(%r8)
651 ; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%r8)
652 ; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8)
653 ; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r8)
654 ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8)
655 ; AVX2-ONLY-NEXT: vzeroupper
656 ; AVX2-ONLY-NEXT: retq
658 ; AVX512F-LABEL: store_i32_stride4_vf16:
659 ; AVX512F: # %bb.0:
660 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
661 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1
662 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm2
663 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm3
664 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19>
665 ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
666 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u>
667 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
668 ; AVX512F-NEXT: movb $-86, %al
669 ; AVX512F-NEXT: kmovw %eax, %k1
670 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
671 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23>
672 ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
673 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u>
674 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
675 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
676 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27>
677 ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
678 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u>
679 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
680 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
681 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31>
682 ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
683 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u>
684 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
685 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1}
686 ; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%r8)
687 ; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%r8)
688 ; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%r8)
689 ; AVX512F-NEXT: vmovdqa64 %zmm5, (%r8)
690 ; AVX512F-NEXT: vzeroupper
691 ; AVX512F-NEXT: retq
692 ;
693 ; AVX512BW-LABEL: store_i32_stride4_vf16:
694 ; AVX512BW: # %bb.0:
695 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
696 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1
697 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2
698 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3
699 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19>
700 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
701 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u>
702 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
703 ; AVX512BW-NEXT: movb $-86, %al
704 ; AVX512BW-NEXT: kmovd %eax, %k1
705 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
706 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23>
707 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
708 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u>
709 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
710 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
711 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27>
712 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
713 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u>
714 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
715 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
716 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31>
717 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
718 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u>
719 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
720 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1}
721 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r8)
722 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%r8)
723 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%r8)
724 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r8)
725 ; AVX512BW-NEXT: vzeroupper
726 ; AVX512BW-NEXT: retq
727 %in.vec0 = load <16 x i32>, ptr %in.vecptr0, align 64
728 %in.vec1 = load <16 x i32>, ptr %in.vecptr1, align 64
729 %in.vec2 = load <16 x i32>, ptr %in.vecptr2, align 64
730 %in.vec3 = load <16 x i32>, ptr %in.vecptr3, align 64
731 %1 = shufflevector <16 x i32> %in.vec0, <16 x i32> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
732 %2 = shufflevector <16 x i32> %in.vec2, <16 x i32> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
733 %3 = shufflevector <32 x i32> %1, <32 x i32> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
734 %interleaved.vec = shufflevector <64 x i32> %3, <64 x i32> poison, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
735 store <64 x i32> %interleaved.vec, ptr %out.vec, align 64
736 ret void
737 }
739 define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
740 ; SSE-LABEL: store_i32_stride4_vf32:
741 ; SSE: # %bb.0:
742 ; SSE-NEXT: subq $184, %rsp
743 ; SSE-NEXT: movaps (%rdi), %xmm10
744 ; SSE-NEXT: movaps 16(%rdi), %xmm11
745 ; SSE-NEXT: movaps 32(%rdi), %xmm12
746 ; SSE-NEXT: movaps 48(%rdi), %xmm13
747 ; SSE-NEXT: movaps (%rsi), %xmm5
748 ; SSE-NEXT: movaps 16(%rsi), %xmm2
749 ; SSE-NEXT: movaps 32(%rsi), %xmm0
750 ; SSE-NEXT: movaps (%rdx), %xmm6
751 ; SSE-NEXT: movaps 16(%rdx), %xmm4
752 ; SSE-NEXT: movaps 32(%rdx), %xmm1
753 ; SSE-NEXT: movaps (%rcx), %xmm7
754 ; SSE-NEXT: movaps 16(%rcx), %xmm8
755 ; SSE-NEXT: movaps 32(%rcx), %xmm3
756 ; SSE-NEXT: movaps %xmm6, %xmm9
757 ; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
758 ; SSE-NEXT: movaps %xmm10, %xmm14
759 ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1]
760 ; SSE-NEXT: movaps %xmm14, %xmm15
761 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm9[0]
762 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
763 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1]
764 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
765 ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
766 ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3]
767 ; SSE-NEXT: movaps %xmm10, %xmm5
768 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0]
769 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
770 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1]
771 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
772 ; SSE-NEXT: movaps %xmm4, %xmm5
773 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
774 ; SSE-NEXT: movaps %xmm11, %xmm6
775 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
776 ; SSE-NEXT: movaps %xmm6, %xmm7
777 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0]
778 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
779 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1]
780 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
781 ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3]
782 ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3]
783 ; SSE-NEXT: movaps %xmm11, %xmm2
784 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
785 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
786 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1]
787 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
788 ; SSE-NEXT: movaps %xmm1, %xmm2
789 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
790 ; SSE-NEXT: movaps %xmm12, %xmm4
791 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
792 ; SSE-NEXT: movaps %xmm4, %xmm5
793 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0]
794 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
795 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
796 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
797 ; SSE-NEXT: movaps 48(%rdx), %xmm2
798 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
799 ; SSE-NEXT: movaps 48(%rcx), %xmm3
800 ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3]
801 ; SSE-NEXT: movaps %xmm12, %xmm0
802 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
803 ; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
804 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1]
805 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
806 ; SSE-NEXT: movaps %xmm2, %xmm0
807 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
808 ; SSE-NEXT: movaps 48(%rsi), %xmm1
809 ; SSE-NEXT: movaps %xmm13, %xmm4
810 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
811 ; SSE-NEXT: movaps %xmm4, %xmm5
812 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
813 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
814 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
815 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
816 ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
817 ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3]
818 ; SSE-NEXT: movaps %xmm13, %xmm0
819 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
820 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
821 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1]
822 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
823 ; SSE-NEXT: movaps 64(%rdx), %xmm0
824 ; SSE-NEXT: movaps 64(%rcx), %xmm1
825 ; SSE-NEXT: movaps %xmm0, %xmm2
826 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
827 ; SSE-NEXT: movaps 64(%rdi), %xmm13
828 ; SSE-NEXT: movaps 64(%rsi), %xmm3
829 ; SSE-NEXT: movaps %xmm13, %xmm14
830 ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
831 ; SSE-NEXT: movaps %xmm14, %xmm4
832 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
833 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
834 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1]
835 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
836 ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3]
837 ; SSE-NEXT: movaps %xmm13, %xmm1
838 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
839 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
840 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
841 ; SSE-NEXT: movaps 80(%rdx), %xmm0
842 ; SSE-NEXT: movaps 80(%rcx), %xmm1
843 ; SSE-NEXT: movaps %xmm0, %xmm2
844 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
845 ; SSE-NEXT: movaps 80(%rdi), %xmm11
846 ; SSE-NEXT: movaps 80(%rsi), %xmm7
847 ; SSE-NEXT: movaps %xmm11, %xmm8
848 ; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
849 ; SSE-NEXT: movaps %xmm8, %xmm3
850 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
851 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
852 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1]
853 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
854 ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
855 ; SSE-NEXT: movaps %xmm11, %xmm15
856 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0]
857 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
858 ; SSE-NEXT: movaps 96(%rdx), %xmm1
859 ; SSE-NEXT: movaps 96(%rcx), %xmm6
860 ; SSE-NEXT: movaps %xmm1, %xmm0
861 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
862 ; SSE-NEXT: movaps 96(%rdi), %xmm5
863 ; SSE-NEXT: movaps 96(%rsi), %xmm4
864 ; SSE-NEXT: movaps %xmm5, %xmm9
865 ; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
866 ; SSE-NEXT: movaps %xmm9, %xmm12
867 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0]
868 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1]
869 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
870 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
871 ; SSE-NEXT: movaps %xmm5, %xmm10
872 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0]
873 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
874 ; SSE-NEXT: movaps 112(%rdx), %xmm2
875 ; SSE-NEXT: movaps 112(%rcx), %xmm7
876 ; SSE-NEXT: movaps %xmm2, %xmm6
877 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
878 ; SSE-NEXT: movaps 112(%rdi), %xmm0
879 ; SSE-NEXT: movaps 112(%rsi), %xmm4
880 ; SSE-NEXT: movaps %xmm0, %xmm1
881 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
882 ; SSE-NEXT: movaps %xmm1, %xmm3
883 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0]
884 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
885 ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
886 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
887 ; SSE-NEXT: movaps %xmm0, %xmm4
888 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
889 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
890 ; SSE-NEXT: movaps %xmm0, 496(%r8)
891 ; SSE-NEXT: movaps %xmm4, 480(%r8)
892 ; SSE-NEXT: movaps %xmm1, 464(%r8)
893 ; SSE-NEXT: movaps %xmm3, 448(%r8)
894 ; SSE-NEXT: movaps %xmm5, 432(%r8)
895 ; SSE-NEXT: movaps %xmm10, 416(%r8)
896 ; SSE-NEXT: movaps %xmm9, 400(%r8)
897 ; SSE-NEXT: movaps %xmm12, 384(%r8)
898 ; SSE-NEXT: movaps %xmm11, 368(%r8)
899 ; SSE-NEXT: movaps %xmm15, 352(%r8)
900 ; SSE-NEXT: movaps %xmm8, 336(%r8)
901 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
902 ; SSE-NEXT: movaps %xmm0, 320(%r8)
903 ; SSE-NEXT: movaps %xmm13, 304(%r8)
904 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
905 ; SSE-NEXT: movaps %xmm0, 288(%r8)
906 ; SSE-NEXT: movaps %xmm14, 272(%r8)
907 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
908 ; SSE-NEXT: movaps %xmm0, 256(%r8)
909 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
910 ; SSE-NEXT: movaps %xmm0, 240(%r8)
911 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
912 ; SSE-NEXT: movaps %xmm0, 224(%r8)
913 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
914 ; SSE-NEXT: movaps %xmm0, 208(%r8)
915 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
916 ; SSE-NEXT: movaps %xmm0, 192(%r8)
917 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
918 ; SSE-NEXT: movaps %xmm0, 176(%r8)
919 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
920 ; SSE-NEXT: movaps %xmm0, 160(%r8)
921 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
922 ; SSE-NEXT: movaps %xmm0, 144(%r8)
923 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
924 ; SSE-NEXT: movaps %xmm0, 128(%r8)
925 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
926 ; SSE-NEXT: movaps %xmm0, 112(%r8)
927 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
928 ; SSE-NEXT: movaps %xmm0, 96(%r8)
929 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
930 ; SSE-NEXT: movaps %xmm0, 80(%r8)
931 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
932 ; SSE-NEXT: movaps %xmm0, 64(%r8)
933 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
934 ; SSE-NEXT: movaps %xmm0, 48(%r8)
935 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
936 ; SSE-NEXT: movaps %xmm0, 32(%r8)
937 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
938 ; SSE-NEXT: movaps %xmm0, 16(%r8)
939 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
940 ; SSE-NEXT: movaps %xmm0, (%r8)
941 ; SSE-NEXT: addq $184, %rsp
942 ; SSE-NEXT: retq
943 ;
944 ; AVX1-ONLY-LABEL: store_i32_stride4_vf32:
945 ; AVX1-ONLY: # %bb.0:
946 ; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8
947 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2
948 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
949 ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm1
950 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
951 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
952 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
953 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
954 ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm3
955 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
956 ; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm2
957 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
958 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
959 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
960 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0]
961 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
962 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
963 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
964 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2
965 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
966 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1
967 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
968 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
969 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
970 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
971 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3
972 ; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill
973 ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2
974 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
975 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
976 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
977 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0]
978 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
979 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
980 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
981 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2
982 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
983 ; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm1
984 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
985 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
986 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
987 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
988 ; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm3
989 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
990 ; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2
991 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
992 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
993 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
994 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
995 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
996 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
997 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
998 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2
999 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1000 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1
1001 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1002 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
1003 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1004 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1005 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2
1006 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1007 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm13
1008 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm13[0]
1009 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
1010 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1]
1011 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1012 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1013 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1014 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2
1015 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1016 ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm1
1017 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1018 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
1019 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1020 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1021 ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm10
1022 ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm9
1023 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm9[0]
1024 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
1025 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
1026 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1027 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1028 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1029 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2
1030 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1031 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1
1032 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1033 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
1034 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1035 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1036 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm7
1037 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm5
1038 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm5[0]
1039 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
1040 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
1041 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1042 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1043 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1044 ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1
1045 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1046 ; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm12
1047 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[1],xmm12[1],zero,zero
1048 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
1049 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1050 ; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm4
1051 ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm3
1052 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0]
1053 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
1054 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1055 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
1056 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1057 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1058 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11
1059 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8
1060 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm11[1],xmm8[1],zero,zero
1061 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
1062 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1063 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6
1064 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2
1065 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm6[0],xmm2[0]
1066 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,0]
1067 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
1068 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
1069 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5],ymm14[6,7]
1070 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1071 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1072 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1073 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
1074 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm14[2]
1075 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm1
1076 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1077 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1078 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
1079 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0]
1080 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
1081 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
1082 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7]
1083 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1084 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1085 ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload
1086 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
1087 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm14[2]
1088 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0
1089 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1090 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1091 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm1[2],xmm15[3],xmm1[3]
1092 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0]
1093 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
1094 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
1095 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7]
1096 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1097 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1098 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1099 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
1100 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm14[2]
1101 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0
1102 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1103 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1104 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm1[2],xmm15[3],xmm1[3]
1105 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0]
1106 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
1107 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
1108 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7]
1109 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1110 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm13[2],xmm1[2],xmm13[3],xmm1[3]
1111 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm13[2],xmm1[2]
1112 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0
1113 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1114 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1115 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm1[2],xmm14[3],xmm1[3]
1116 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[3,0],xmm14[3,0]
1117 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0,2,3]
1118 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13
1119 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7]
1120 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm9[2],xmm10[2],xmm9[3],xmm10[3]
1121 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm9[2],xmm10[2]
1122 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9
1123 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1124 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1125 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm13[2],xmm1[2],xmm13[3],xmm1[3]
1126 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[3,0],xmm13[3,0]
1127 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0,2,3]
1128 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10
1129 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
1130 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm5[2],xmm7[2],xmm5[3],xmm7[3]
1131 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm5[2],xmm7[2]
1132 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5
1133 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1134 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1135 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
1136 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[3,0],xmm10[3,0]
1137 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0,2,3]
1138 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7
1139 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
1140 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1141 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm4[2]
1142 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3
1143 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1144 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm1[2],xmm12[2],xmm1[3],xmm12[3]
1145 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm12[3,0],xmm1[3,0]
1146 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,0,2,3]
1147 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4
1148 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
1149 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
1150 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm6[2]
1151 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
1152 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm8[2],xmm11[3],xmm8[3]
1153 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[3,0],xmm11[3,0]
1154 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3]
1155 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
1156 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
1157 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8)
1158 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 480(%r8)
1159 ; AVX1-ONLY-NEXT: vmovaps %ymm5, 416(%r8)
1160 ; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%r8)
1161 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8)
1162 ; AVX1-ONLY-NEXT: vmovaps %ymm15, 352(%r8)
1163 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1164 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r8)
1165 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1166 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8)
1167 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1168 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8)
1169 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1170 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r8)
1171 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1172 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r8)
1173 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1174 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8)
1175 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1176 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8)
1177 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1178 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%r8)
1179 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1180 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%r8)
1181 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1182 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8)
1183 ; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8
1184 ; AVX1-ONLY-NEXT: vzeroupper
1185 ; AVX1-ONLY-NEXT: retq
1187 ; AVX2-ONLY-LABEL: store_i32_stride4_vf32:
1188 ; AVX2-ONLY: # %bb.0:
1189 ; AVX2-ONLY-NEXT: pushq %rax
1190 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2
1191 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6
1192 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm4
1193 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7
1194 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm11
1195 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm5
1196 ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm1
1197 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm12
1198 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm8
1199 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm3
1200 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1201 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
1202 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm9
1203 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm10
1204 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm13
1205 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm14
1206 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm10[0],xmm14[1],xmm10[1]
1207 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3]
1208 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
1209 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1210 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
1211 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
1212 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm10[2],xmm14[3],xmm10[3]
1213 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
1214 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7]
1215 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1216 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
1217 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1]
1218 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
1219 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3]
1220 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7]
1221 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1222 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
1223 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1]
1224 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm13[2],xmm9[2],xmm13[3],xmm9[3]
1225 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3]
1226 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7]
1227 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1228 ; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm9
1229 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm10
1230 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
1231 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1]
1232 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm13
1233 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm14
1234 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
1235 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3]
1236 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7]
1237 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm15
1238 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
1239 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0
1240 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1]
1241 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
1242 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3]
1243 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
1244 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
1245 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1]
1246 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
1247 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3]
1248 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7]
1249 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm13
1250 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3]
1251 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm14
1252 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,0,2,1]
1253 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
1254 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
1255 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3],ymm0[4,5],ymm11[6,7]
1256 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7]
1257 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
1258 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7]
1259 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3]
1260 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7]
1261 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm0
1262 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5]
1263 ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm14
1264 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3]
1265 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5]
1266 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3]
1267 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5],ymm13[6,7]
1268 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5]
1269 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,2,3]
1270 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5]
1271 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3]
1272 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7]
1273 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm13
1274 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7]
1275 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm14
1276 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7]
1277 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm4
1278 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
1279 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
1280 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
1281 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm14[0],ymm4[0],ymm14[1],ymm4[1],ymm14[4],ymm4[4],ymm14[5],ymm4[5]
1282 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
1283 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm15
1284 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[4],ymm15[4],ymm13[5],ymm15[5]
1285 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
1286 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
1287 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[6],ymm4[6],ymm14[7],ymm4[7]
1288 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7]
1289 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13
1290 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
1291 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3]
1292 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
1293 ; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm4
1294 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5]
1295 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,2,3]
1296 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15
1297 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm0
1298 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5]
1299 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3]
1300 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7]
1301 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7]
1302 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[6],ymm0[6],ymm15[7],ymm0[7]
1303 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,2,3]
1304 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
1305 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7]
1306 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8)
1307 ; AVX2-ONLY-NEXT: vmovaps %ymm5, 448(%r8)
1308 ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8)
1309 ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8)
1310 ; AVX2-ONLY-NEXT: vmovaps %ymm3, 352(%r8)
1311 ; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%r8)
1312 ; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8)
1313 ; AVX2-ONLY-NEXT: vmovaps %ymm12, 96(%r8)
1314 ; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r8)
1315 ; AVX2-ONLY-NEXT: vmovaps %ymm10, (%r8)
1316 ; AVX2-ONLY-NEXT: vmovaps %ymm9, 416(%r8)
1317 ; AVX2-ONLY-NEXT: vmovaps %ymm8, 384(%r8)
1318 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1319 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8)
1320 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1321 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8)
1322 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1323 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r8)
1324 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1325 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r8)
1326 ; AVX2-ONLY-NEXT: popq %rax
1327 ; AVX2-ONLY-NEXT: vzeroupper
1328 ; AVX2-ONLY-NEXT: retq
1330 ; AVX512F-LABEL: store_i32_stride4_vf32:
1331 ; AVX512F: # %bb.0:
1332 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
1333 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1
1334 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm2
1335 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm3
1336 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm4
1337 ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5
1338 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm6
1339 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm7
1340 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23>
1341 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9
1342 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
1343 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u>
1344 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11
1345 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm10, %zmm11
1346 ; AVX512F-NEXT: movb $-86, %al
1347 ; AVX512F-NEXT: kmovw %eax, %k1
1348 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1}
1349 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19>
1350 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12
1351 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm9, %zmm12
1352 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u>
1353 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14
1354 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm13, %zmm14
1355 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
1356 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31>
1357 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm15
1358 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm12, %zmm15
1359 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u>
1360 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17
1361 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm17
1362 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1}
1363 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27>
1364 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm15, %zmm4
1365 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u>
1366 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm6, %zmm0
1367 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
1368 ; AVX512F-NEXT: vpermi2d %zmm7, %zmm5, %zmm8
1369 ; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm10
1370 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
1371 ; AVX512F-NEXT: vpermi2d %zmm7, %zmm5, %zmm9
1372 ; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm13
1373 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1}
1374 ; AVX512F-NEXT: vpermi2d %zmm7, %zmm5, %zmm12
1375 ; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm16
1376 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
1377 ; AVX512F-NEXT: vpermt2d %zmm7, %zmm15, %zmm5
1378 ; AVX512F-NEXT: vpermt2d %zmm3, %zmm6, %zmm1
1379 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1}
1380 ; AVX512F-NEXT: vmovdqa64 %zmm1, 384(%r8)
1381 ; AVX512F-NEXT: vmovdqa64 %zmm16, 448(%r8)
1382 ; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%r8)
1383 ; AVX512F-NEXT: vmovdqa64 %zmm10, 320(%r8)
1384 ; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r8)
1385 ; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%r8)
1386 ; AVX512F-NEXT: vmovdqa64 %zmm14, (%r8)
1387 ; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%r8)
1388 ; AVX512F-NEXT: vzeroupper
1389 ; AVX512F-NEXT: retq
1391 ; AVX512BW-LABEL: store_i32_stride4_vf32:
1392 ; AVX512BW: # %bb.0:
1393 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1394 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1395 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2
1396 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3
1397 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4
1398 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5
1399 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6
1400 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7
1401 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23>
1402 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9
1403 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9
1404 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u>
1405 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11
1406 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm11
1407 ; AVX512BW-NEXT: movb $-86, %al
1408 ; AVX512BW-NEXT: kmovd %eax, %k1
1409 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1}
1410 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19>
1411 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12
1412 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm12
1413 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u>
1414 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14
1415 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm14
1416 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
1417 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31>
1418 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15
1419 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm15
1420 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u>
1421 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17
1422 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm17
1423 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1}
1424 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27>
1425 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm4
1426 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u>
1427 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm0
1428 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
1429 ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm8
1430 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm10
1431 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
1432 ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm9
1433 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm13
1434 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1}
1435 ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm12
1436 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm16
1437 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
1438 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm5
1439 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1
1440 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1}
1441 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8)
1442 ; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%r8)
1443 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%r8)
1444 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%r8)
1445 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8)
1446 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%r8)
1447 ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r8)
1448 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%r8)
1449 ; AVX512BW-NEXT: vzeroupper
1450 ; AVX512BW-NEXT: retq
1451 %in.vec0 = load <32 x i32>, ptr %in.vecptr0, align 64
1452 %in.vec1 = load <32 x i32>, ptr %in.vecptr1, align 64
1453 %in.vec2 = load <32 x i32>, ptr %in.vecptr2, align 64
1454 %in.vec3 = load <32 x i32>, ptr %in.vecptr3, align 64
1455 %1 = shufflevector <32 x i32> %in.vec0, <32 x i32> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1456 %2 = shufflevector <32 x i32> %in.vec2, <32 x i32> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1457 %3 = shufflevector <64 x i32> %1, <64 x i32> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1458 %interleaved.vec = shufflevector <128 x i32> %3, <128 x i32> poison, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>
1459   store <128 x i32> %interleaved.vec, ptr %out.vec, align 64
1460   ret void
1461 }
1463 define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind {
1464 ; SSE-LABEL: store_i32_stride4_vf64:
1465 ; SSE: # %bb.0:
1466 ; SSE-NEXT: subq $696, %rsp # imm = 0x2B8
1467 ; SSE-NEXT: movaps (%rdi), %xmm10
1468 ; SSE-NEXT: movaps 16(%rdi), %xmm11
1469 ; SSE-NEXT: movaps 32(%rdi), %xmm12
1470 ; SSE-NEXT: movaps 48(%rdi), %xmm13
1471 ; SSE-NEXT: movaps (%rsi), %xmm4
1472 ; SSE-NEXT: movaps 16(%rsi), %xmm2
1473 ; SSE-NEXT: movaps 32(%rsi), %xmm0
1474 ; SSE-NEXT: movaps (%rdx), %xmm6
1475 ; SSE-NEXT: movaps 16(%rdx), %xmm3
1476 ; SSE-NEXT: movaps 32(%rdx), %xmm1
1477 ; SSE-NEXT: movaps (%rcx), %xmm7
1478 ; SSE-NEXT: movaps 16(%rcx), %xmm8
1479 ; SSE-NEXT: movaps 32(%rcx), %xmm5
1480 ; SSE-NEXT: movaps %xmm6, %xmm9
1481 ; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
1482 ; SSE-NEXT: movaps %xmm10, %xmm14
1483 ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
1484 ; SSE-NEXT: movaps %xmm14, %xmm15
1485 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm9[0]
1486 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1487 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1]
1488 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1489 ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
1490 ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm4[2],xmm10[3],xmm4[3]
1491 ; SSE-NEXT: movaps %xmm10, %xmm4
1492 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0]
1493 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1494 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1]
1495 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1496 ; SSE-NEXT: movaps %xmm3, %xmm4
1497 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
1498 ; SSE-NEXT: movaps %xmm11, %xmm6
1499 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
1500 ; SSE-NEXT: movaps %xmm6, %xmm7
1501 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0]
1502 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1503 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1]
1504 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1505 ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
1506 ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3]
1507 ; SSE-NEXT: movaps %xmm11, %xmm2
1508 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1509 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1510 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1]
1511 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1512 ; SSE-NEXT: movaps %xmm1, %xmm2
1513 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1514 ; SSE-NEXT: movaps %xmm12, %xmm3
1515 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
1516 ; SSE-NEXT: movaps %xmm3, %xmm4
1517 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
1518 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1519 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1520 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1521 ; SSE-NEXT: movaps 48(%rdx), %xmm2
1522 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
1523 ; SSE-NEXT: movaps 48(%rcx), %xmm3
1524 ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3]
1525 ; SSE-NEXT: movaps %xmm12, %xmm0
1526 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1527 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1528 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1]
1529 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1530 ; SSE-NEXT: movaps %xmm2, %xmm0
1531 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
1532 ; SSE-NEXT: movaps 48(%rsi), %xmm1
1533 ; SSE-NEXT: movaps %xmm13, %xmm4
1534 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1535 ; SSE-NEXT: movaps %xmm4, %xmm5
1536 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
1537 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1538 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
1539 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1540 ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1541 ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3]
1542 ; SSE-NEXT: movaps %xmm13, %xmm0
1543 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1544 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1545 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1]
1546 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1547 ; SSE-NEXT: movaps 64(%rdx), %xmm0
1548 ; SSE-NEXT: movaps 64(%rcx), %xmm1
1549 ; SSE-NEXT: movaps %xmm0, %xmm2
1550 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1551 ; SSE-NEXT: movaps 64(%rdi), %xmm5
1552 ; SSE-NEXT: movaps 64(%rsi), %xmm3
1553 ; SSE-NEXT: movaps %xmm5, %xmm4
1554 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
1555 ; SSE-NEXT: movaps %xmm4, %xmm6
1556 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0]
1557 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1558 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
1559 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1560 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1561 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
1562 ; SSE-NEXT: movaps %xmm5, %xmm1
1563 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1564 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1565 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
1566 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1567 ; SSE-NEXT: movaps 80(%rdx), %xmm0
1568 ; SSE-NEXT: movaps 80(%rcx), %xmm1
1569 ; SSE-NEXT: movaps %xmm0, %xmm2
1570 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1571 ; SSE-NEXT: movaps 80(%rdi), %xmm5
1572 ; SSE-NEXT: movaps 80(%rsi), %xmm3
1573 ; SSE-NEXT: movaps %xmm5, %xmm4
1574 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
1575 ; SSE-NEXT: movaps %xmm4, %xmm6
1576 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0]
1577 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1578 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
1579 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1580 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1581 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
1582 ; SSE-NEXT: movaps %xmm5, %xmm1
1583 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1584 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1585 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
1586 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1587 ; SSE-NEXT: movaps 96(%rdx), %xmm0
1588 ; SSE-NEXT: movaps 96(%rcx), %xmm1
1589 ; SSE-NEXT: movaps %xmm0, %xmm2
1590 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1591 ; SSE-NEXT: movaps 96(%rdi), %xmm5
1592 ; SSE-NEXT: movaps 96(%rsi), %xmm3
1593 ; SSE-NEXT: movaps %xmm5, %xmm4
1594 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
1595 ; SSE-NEXT: movaps %xmm4, %xmm6
1596 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0]
1597 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1598 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
1599 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1600 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1601 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
1602 ; SSE-NEXT: movaps %xmm5, %xmm1
1603 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1604 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1605 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
1606 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1607 ; SSE-NEXT: movaps 112(%rdx), %xmm0
1608 ; SSE-NEXT: movaps 112(%rcx), %xmm1
1609 ; SSE-NEXT: movaps %xmm0, %xmm2
1610 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1611 ; SSE-NEXT: movaps 112(%rdi), %xmm5
1612 ; SSE-NEXT: movaps 112(%rsi), %xmm3
1613 ; SSE-NEXT: movaps %xmm5, %xmm4
1614 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
1615 ; SSE-NEXT: movaps %xmm4, %xmm6
1616 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0]
1617 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1618 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
1619 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1620 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1621 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
1622 ; SSE-NEXT: movaps %xmm5, %xmm1
1623 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1624 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1625 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
1626 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1627 ; SSE-NEXT: movaps 128(%rdx), %xmm0
1628 ; SSE-NEXT: movaps 128(%rcx), %xmm1
1629 ; SSE-NEXT: movaps %xmm0, %xmm2
1630 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1631 ; SSE-NEXT: movaps 128(%rdi), %xmm5
1632 ; SSE-NEXT: movaps 128(%rsi), %xmm3
1633 ; SSE-NEXT: movaps %xmm5, %xmm4
1634 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
1635 ; SSE-NEXT: movaps %xmm4, %xmm6
1636 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0]
1637 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1638 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
1639 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1640 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1641 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
1642 ; SSE-NEXT: movaps %xmm5, %xmm1
1643 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1644 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1645 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
1646 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1647 ; SSE-NEXT: movaps 144(%rdx), %xmm0
1648 ; SSE-NEXT: movaps 144(%rcx), %xmm1
1649 ; SSE-NEXT: movaps %xmm0, %xmm2
1650 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1651 ; SSE-NEXT: movaps 144(%rdi), %xmm5
1652 ; SSE-NEXT: movaps 144(%rsi), %xmm3
1653 ; SSE-NEXT: movaps %xmm5, %xmm4
1654 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
1655 ; SSE-NEXT: movaps %xmm4, %xmm6
1656 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0]
1657 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1658 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
1659 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1660 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1661 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
1662 ; SSE-NEXT: movaps %xmm5, %xmm1
1663 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1664 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1665 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
1666 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1667 ; SSE-NEXT: movaps 160(%rdx), %xmm0
1668 ; SSE-NEXT: movaps 160(%rcx), %xmm1
1669 ; SSE-NEXT: movaps %xmm0, %xmm2
1670 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1671 ; SSE-NEXT: movaps 160(%rdi), %xmm5
1672 ; SSE-NEXT: movaps 160(%rsi), %xmm3
1673 ; SSE-NEXT: movaps %xmm5, %xmm4
1674 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
1675 ; SSE-NEXT: movaps %xmm4, %xmm6
1676 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0]
1677 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1678 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
1679 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1680 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1681 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
1682 ; SSE-NEXT: movaps %xmm5, %xmm1
1683 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1684 ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
1685 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
1686 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1687 ; SSE-NEXT: movaps 176(%rdx), %xmm0
1688 ; SSE-NEXT: movaps 176(%rcx), %xmm1
1689 ; SSE-NEXT: movaps %xmm0, %xmm2
1690 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1691 ; SSE-NEXT: movaps 176(%rdi), %xmm15
1692 ; SSE-NEXT: movaps 176(%rsi), %xmm3
1693 ; SSE-NEXT: movaps %xmm15, %xmm4
1694 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
1695 ; SSE-NEXT: movaps %xmm4, %xmm5
1696 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0]
1697 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1698 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
1699 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1700 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1701 ; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm3[2],xmm15[3],xmm3[3]
1702 ; SSE-NEXT: movaps %xmm15, %xmm1
1703 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1704 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1705 ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
1706 ; SSE-NEXT: movaps 192(%rdx), %xmm0
1707 ; SSE-NEXT: movaps 192(%rcx), %xmm1
1708 ; SSE-NEXT: movaps %xmm0, %xmm2
1709 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1710 ; SSE-NEXT: movaps 192(%rdi), %xmm12
1711 ; SSE-NEXT: movaps 192(%rsi), %xmm3
1712 ; SSE-NEXT: movaps %xmm12, %xmm14
1713 ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
1714 ; SSE-NEXT: movaps %xmm14, %xmm4
1715 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
1716 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1717 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1]
1718 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1719 ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3]
1720 ; SSE-NEXT: movaps %xmm12, %xmm1
1721 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1722 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1723 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1]
1724 ; SSE-NEXT: movaps 208(%rdx), %xmm0
1725 ; SSE-NEXT: movaps 208(%rcx), %xmm1
1726 ; SSE-NEXT: movaps %xmm0, %xmm2
1727 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1728 ; SSE-NEXT: movaps 208(%rdi), %xmm13
1729 ; SSE-NEXT: movaps 208(%rsi), %xmm7
1730 ; SSE-NEXT: movaps %xmm13, %xmm8
1731 ; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
1732 ; SSE-NEXT: movaps %xmm8, %xmm3
1733 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
1734 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1735 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1]
1736 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1737 ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm7[2],xmm13[3],xmm7[3]
1738 ; SSE-NEXT: movaps %xmm13, %xmm1
1739 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1740 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1741 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
1742 ; SSE-NEXT: movaps 224(%rdx), %xmm1
1743 ; SSE-NEXT: movaps 224(%rcx), %xmm6
1744 ; SSE-NEXT: movaps %xmm1, %xmm0
1745 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
1746 ; SSE-NEXT: movaps 224(%rdi), %xmm5
1747 ; SSE-NEXT: movaps 224(%rsi), %xmm4
1748 ; SSE-NEXT: movaps %xmm5, %xmm9
1749 ; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
1750 ; SSE-NEXT: movaps %xmm9, %xmm11
1751 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0]
1752 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1]
1753 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
1754 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1755 ; SSE-NEXT: movaps %xmm5, %xmm10
1756 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0]
1757 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
1758 ; SSE-NEXT: movaps 240(%rdx), %xmm2
1759 ; SSE-NEXT: movaps 240(%rcx), %xmm7
1760 ; SSE-NEXT: movaps %xmm2, %xmm6
1761 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
1762 ; SSE-NEXT: movaps 240(%rdi), %xmm0
1763 ; SSE-NEXT: movaps 240(%rsi), %xmm4
1764 ; SSE-NEXT: movaps %xmm0, %xmm1
1765 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
1766 ; SSE-NEXT: movaps %xmm1, %xmm3
1767 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0]
1768 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
1769 ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
1770 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1771 ; SSE-NEXT: movaps %xmm0, %xmm4
1772 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
1773 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
1774 ; SSE-NEXT: movaps %xmm0, 1008(%r8)
1775 ; SSE-NEXT: movaps %xmm4, 992(%r8)
1776 ; SSE-NEXT: movaps %xmm1, 976(%r8)
1777 ; SSE-NEXT: movaps %xmm3, 960(%r8)
1778 ; SSE-NEXT: movaps %xmm5, 944(%r8)
1779 ; SSE-NEXT: movaps %xmm10, 928(%r8)
1780 ; SSE-NEXT: movaps %xmm9, 912(%r8)
1781 ; SSE-NEXT: movaps %xmm11, 896(%r8)
1782 ; SSE-NEXT: movaps %xmm13, 880(%r8)
1783 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1784 ; SSE-NEXT: movaps %xmm0, 864(%r8)
1785 ; SSE-NEXT: movaps %xmm8, 848(%r8)
1786 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1787 ; SSE-NEXT: movaps %xmm0, 832(%r8)
1788 ; SSE-NEXT: movaps %xmm12, 816(%r8)
1789 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1790 ; SSE-NEXT: movaps %xmm0, 800(%r8)
1791 ; SSE-NEXT: movaps %xmm14, 784(%r8)
1792 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1793 ; SSE-NEXT: movaps %xmm0, 768(%r8)
1794 ; SSE-NEXT: movaps %xmm15, 752(%r8)
1795 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1796 ; SSE-NEXT: movaps %xmm0, 736(%r8)
1797 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1798 ; SSE-NEXT: movaps %xmm0, 720(%r8)
1799 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1800 ; SSE-NEXT: movaps %xmm0, 704(%r8)
1801 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1802 ; SSE-NEXT: movaps %xmm0, 688(%r8)
1803 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
1804 ; SSE-NEXT: movaps %xmm0, 672(%r8)
1805 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1806 ; SSE-NEXT: movaps %xmm0, 656(%r8)
1807 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1808 ; SSE-NEXT: movaps %xmm0, 640(%r8)
1809 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1810 ; SSE-NEXT: movaps %xmm0, 624(%r8)
1811 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1812 ; SSE-NEXT: movaps %xmm0, 608(%r8)
1813 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1814 ; SSE-NEXT: movaps %xmm0, 592(%r8)
1815 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1816 ; SSE-NEXT: movaps %xmm0, 576(%r8)
1817 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1818 ; SSE-NEXT: movaps %xmm0, 560(%r8)
1819 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1820 ; SSE-NEXT: movaps %xmm0, 544(%r8)
1821 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1822 ; SSE-NEXT: movaps %xmm0, 528(%r8)
1823 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1824 ; SSE-NEXT: movaps %xmm0, 512(%r8)
1825 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1826 ; SSE-NEXT: movaps %xmm0, 496(%r8)
1827 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1828 ; SSE-NEXT: movaps %xmm0, 480(%r8)
1829 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1830 ; SSE-NEXT: movaps %xmm0, 464(%r8)
1831 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1832 ; SSE-NEXT: movaps %xmm0, 448(%r8)
1833 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1834 ; SSE-NEXT: movaps %xmm0, 432(%r8)
1835 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1836 ; SSE-NEXT: movaps %xmm0, 416(%r8)
1837 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1838 ; SSE-NEXT: movaps %xmm0, 400(%r8)
1839 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1840 ; SSE-NEXT: movaps %xmm0, 384(%r8)
1841 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1842 ; SSE-NEXT: movaps %xmm0, 368(%r8)
1843 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1844 ; SSE-NEXT: movaps %xmm0, 352(%r8)
1845 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1846 ; SSE-NEXT: movaps %xmm0, 336(%r8)
1847 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1848 ; SSE-NEXT: movaps %xmm0, 320(%r8)
1849 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1850 ; SSE-NEXT: movaps %xmm0, 304(%r8)
1851 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1852 ; SSE-NEXT: movaps %xmm0, 288(%r8)
1853 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1854 ; SSE-NEXT: movaps %xmm0, 272(%r8)
1855 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1856 ; SSE-NEXT: movaps %xmm0, 256(%r8)
1857 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1858 ; SSE-NEXT: movaps %xmm0, 240(%r8)
1859 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1860 ; SSE-NEXT: movaps %xmm0, 224(%r8)
1861 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1862 ; SSE-NEXT: movaps %xmm0, 208(%r8)
1863 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1864 ; SSE-NEXT: movaps %xmm0, 192(%r8)
1865 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1866 ; SSE-NEXT: movaps %xmm0, 176(%r8)
1867 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1868 ; SSE-NEXT: movaps %xmm0, 160(%r8)
1869 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1870 ; SSE-NEXT: movaps %xmm0, 144(%r8)
1871 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1872 ; SSE-NEXT: movaps %xmm0, 128(%r8)
1873 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1874 ; SSE-NEXT: movaps %xmm0, 112(%r8)
1875 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1876 ; SSE-NEXT: movaps %xmm0, 96(%r8)
1877 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1878 ; SSE-NEXT: movaps %xmm0, 80(%r8)
1879 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1880 ; SSE-NEXT: movaps %xmm0, 64(%r8)
1881 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1882 ; SSE-NEXT: movaps %xmm0, 48(%r8)
1883 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1884 ; SSE-NEXT: movaps %xmm0, 32(%r8)
1885 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1886 ; SSE-NEXT: movaps %xmm0, 16(%r8)
1887 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1888 ; SSE-NEXT: movaps %xmm0, (%r8)
1889 ; SSE-NEXT: addq $696, %rsp # imm = 0x2B8
1890 ; SSE-NEXT: retq
1892 ; AVX1-ONLY-LABEL: store_i32_stride4_vf64:
1893 ; AVX1-ONLY: # %bb.0:
1894 ; AVX1-ONLY-NEXT: subq $1384, %rsp # imm = 0x568
1895 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2
1896 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1897 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5
1898 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3
1899 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1
1900 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1901 ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm6
1902 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4
1903 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
1904 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1905 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1906 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11
1907 ; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1908 ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm9
1909 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm7
1910 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2
1911 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1912 ; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm10
1913 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8
1914 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
1915 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm11[0],xmm2[0]
1916 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0]
1917 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1918 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1919 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1920 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1921 ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1922 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[1],xmm6[1],zero,zero
1923 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
1924 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1925 ; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1926 ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1927 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
1928 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm10[0]
1929 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0]
1930 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1931 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1932 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1933 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1934 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1935 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[1],xmm4[1],zero,zero
1936 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1937 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1938 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1939 ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1940 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0]
1941 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
1942 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
1943 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1944 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1945 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1946 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2
1947 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1948 ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm1
1949 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1950 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
1951 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1952 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1953 ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm3
1954 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1955 ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm2
1956 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1957 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
1958 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
1959 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1960 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1961 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1962 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1963 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2
1964 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1965 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1
1966 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1967 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
1968 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1969 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1970 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3
1971 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1972 ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2
1973 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1974 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
1975 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
1976 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1977 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1978 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1979 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1980 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2
1981 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1982 ; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm1
1983 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1984 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
1985 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1986 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1987 ; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm3
1988 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1989 ; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2
1990 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1991 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
1992 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
1993 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1994 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1995 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1996 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1997 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2
1998 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1999 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1
2000 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2001 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
2002 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2003 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2004 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm3
2005 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2006 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2
2007 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2008 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
2009 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
2010 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2011 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2012 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2013 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2014 ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2
2015 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2016 ; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm1
2017 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2018 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
2019 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2020 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2021 ; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm3
2022 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2023 ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm2
2024 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2025 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
2026 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
2027 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2028 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2029 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2030 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2031 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2
2032 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2033 ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm1
2034 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2035 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
2036 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2037 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2038 ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm3
2039 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2040 ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm2
2041 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2042 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
2043 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
2044 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2045 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2046 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2047 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2048 ; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm2
2049 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2050 ; AVX1-ONLY-NEXT: vmovaps 144(%rsi), %xmm1
2051 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2052 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
2053 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2054 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2055 ; AVX1-ONLY-NEXT: vmovaps 144(%rcx), %xmm3
2056 ; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill
2057 ; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2
2058 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2059 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
2060 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
2061 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2062 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2063 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2064 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2065 ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2
2066 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2067 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm1
2068 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2069 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
2070 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2071 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2072 ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm3
2073 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2074 ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm2
2075 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2076 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
2077 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
2078 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2079 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2080 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2081 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2082 ; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm2
2083 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2084 ; AVX1-ONLY-NEXT: vmovaps 176(%rsi), %xmm1
2085 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2086 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
2087 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2088 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2089 ; AVX1-ONLY-NEXT: vmovaps 176(%rcx), %xmm2
2090 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2091 ; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm13
2092 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm13[0]
2093 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
2094 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1]
2095 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2096 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2097 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2098 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2
2099 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2100 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm1
2101 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2102 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
2103 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2104 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2105 ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm10
2106 ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm8
2107 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm8[0]
2108 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
2109 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
2110 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2111 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2112 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2113 ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm2
2114 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2115 ; AVX1-ONLY-NEXT: vmovaps 208(%rsi), %xmm1
2116 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2117 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero
2118 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2119 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2120 ; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm6
2121 ; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm4
2122 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm4[0]
2123 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
2124 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
2125 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2126 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2127 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2128 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1
2129 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2130 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm11
2131 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[1],xmm11[1],zero,zero
2132 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
2133 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2134 ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm3
2135 ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm2
2136 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
2137 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
2138 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2139 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
2140 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2141 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2142 ; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm9
2143 ; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm7
2144 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[1],xmm7[1],zero,zero
2145 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
2146 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm12
2147 ; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm5
2148 ; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1
2149 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm5[0],xmm1[0]
2150 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,0]
2151 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
2152 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
2153 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7]
2154 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2155 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2156 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2157 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
2158 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2]
2159 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12
2160 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2161 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2162 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
2163 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0]
2164 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
2165 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
2166 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7]
2167 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2168 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2169 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2170 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
2171 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2]
2172 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12
2173 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2174 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2175 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2176 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
2177 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
2178 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
2179 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7]
2180 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2181 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2182 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2183 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
2184 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2]
2185 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12
2186 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2187 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2188 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2189 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
2190 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
2191 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
2192 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7]
2193 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2194 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2195 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2196 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
2197 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2]
2198 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12
2199 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2200 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2201 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2202 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
2203 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
2204 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
2205 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7]
2206 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2207 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2208 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2209 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
2210 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2]
2211 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12
2212 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2213 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2214 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2215 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
2216 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
2217 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
2218 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7]
2219 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2220 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2221 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2222 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
2223 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2]
2224 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12
2225 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2226 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2227 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2228 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
2229 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
2230 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
2231 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7]
2232 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2233 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2234 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2235 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
2236 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2]
2237 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12
2238 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2239 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2240 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2241 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
2242 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
2243 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
2244 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7]
2245 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2246 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2247 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2248 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
2249 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2]
2250 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12
2251 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2252 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2253 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2254 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
2255 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
2256 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
2257 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7]
2258 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2259 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2260 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2261 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
2262 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2]
2263 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12
2264 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2265 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2266 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2267 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
2268 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
2269 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
2270 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7]
2271 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2272 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2273 ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload
2274 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
2275 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2]
2276 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12
2277 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2278 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2279 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2280 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
2281 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
2282 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
2283 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7]
2284 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2285 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2286 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2287 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
2288 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2]
2289 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12
2290 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2291 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2292 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2293 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
2294 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
2295 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
2296 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7]
2297 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2298 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm0[2],xmm13[3],xmm0[3]
2299 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm13[2],xmm0[2]
2300 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13
2301 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2302 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2303 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
2304 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0]
2305 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3]
2306 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
2307 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
2308 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm10[2],xmm8[3],xmm10[3]
2309 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = zero,zero,xmm8[2],xmm10[2]
2310 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8
2311 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2312 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2313 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm14[2],xmm0[2],xmm14[3],xmm0[3]
2314 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[3,0],xmm14[3,0]
2315 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0,2,3]
2316 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm10
2317 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7]
2318 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
2319 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm4[2],xmm6[2]
2320 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4
2321 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2322 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2323 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm0[2],xmm10[3],xmm0[3]
2324 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm0[3,0],xmm10[3,0]
2325 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0,2,3]
2326 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6
2327 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
2328 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
2329 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm2[2],xmm3[2]
2330 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
2331 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2332 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
2333 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm11[3,0],xmm0[3,0]
2334 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3]
2335 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
2336 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2337 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
2338 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm1[2],xmm5[2]
2339 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
2340 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm7[2],xmm9[3],xmm7[3]
2341 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,0],xmm9[3,0]
2342 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0,2,3]
2343 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
2344 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2345 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r8)
2346 ; AVX1-ONLY-NEXT: vmovaps %ymm2, 928(%r8)
2347 ; AVX1-ONLY-NEXT: vmovaps %ymm4, 864(%r8)
2348 ; AVX1-ONLY-NEXT: vmovaps %ymm8, 800(%r8)
2349 ; AVX1-ONLY-NEXT: vmovaps %ymm13, 736(%r8)
2350 ; AVX1-ONLY-NEXT: vmovaps %ymm12, 672(%r8)
2351 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2352 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%r8)
2353 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2354 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%r8)
2355 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2356 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%r8)
2357 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2358 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%r8)
2359 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2360 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%r8)
2361 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2362 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r8)
2363 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2364 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r8)
2365 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2366 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8)
2367 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2368 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8)
2369 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2370 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8)
2371 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2372 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 960(%r8)
2373 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2374 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 896(%r8)
2375 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2376 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%r8)
2377 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2378 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%r8)
2379 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2380 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%r8)
2381 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2382 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 640(%r8)
2383 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2384 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%r8)
2385 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2386 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r8)
2387 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2388 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r8)
2389 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2390 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r8)
2391 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2392 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%r8)
2393 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2394 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%r8)
2395 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2396 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8)
2397 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2398 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8)
2399 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2400 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8)
2401 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2402 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8)
2403 ; AVX1-ONLY-NEXT: addq $1384, %rsp # imm = 0x568
2404 ; AVX1-ONLY-NEXT: vzeroupper
2405 ; AVX1-ONLY-NEXT: retq
2407 ; AVX2-ONLY-LABEL: store_i32_stride4_vf64:
2408 ; AVX2-ONLY: # %bb.0:
2409 ; AVX2-ONLY-NEXT: subq $520, %rsp # imm = 0x208
2410 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm4
2411 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm5
2412 ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm0
2413 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm6
2414 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm7
2415 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm1
2416 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
2417 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[0,0,2,1]
2418 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9
2419 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm10
2420 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm2
2421 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm11
2422 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12
2423 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm3
2424 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
2425 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3]
2426 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2,3],ymm13[4,5],ymm8[6,7]
2427 ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2428 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
2429 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
2430 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
2431 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3]
2432 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
2433 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2434 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
2435 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
2436 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
2437 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3]
2438 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
2439 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2440 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
2441 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
2442 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
2443 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
2444 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
2445 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2446 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2447 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1]
2448 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2449 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
2450 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
2451 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2452 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2453 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
2454 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
2455 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
2456 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2457 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2458 ; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm0
2459 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm1
2460 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2461 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
2462 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm3
2463 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm4
2464 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2465 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
2466 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
2467 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2468 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2469 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
2470 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
2471 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
2472 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2473 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2474 ; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %xmm0
2475 ; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm1
2476 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2477 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
2478 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %xmm3
2479 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm4
2480 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2481 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
2482 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
2483 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2484 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2485 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
2486 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
2487 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
2488 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2489 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2490 ; AVX2-ONLY-NEXT: vmovaps 160(%rcx), %xmm0
2491 ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm1
2492 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2493 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
2494 ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %xmm3
2495 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm4
2496 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2497 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
2498 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
2499 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2500 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2501 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
2502 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
2503 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
2504 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2505 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2506 ; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %xmm0
2507 ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm1
2508 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2509 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
2510 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %xmm3
2511 ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm4
2512 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2513 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
2514 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
2515 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2516 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2517 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
2518 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
2519 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
2520 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2521 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2522 ; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %xmm0
2523 ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm1
2524 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2525 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
2526 ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm3
2527 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm4
2528 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2529 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3]
2530 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
2531 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2532 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2533 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
2534 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
2535 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
2536 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2537 ; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
2538 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm0
2539 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm1
2540 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
2541 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
2542 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3
2543 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm4
2544 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
2545 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3]
2546 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
2547 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2548 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
2549 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
2550 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
2551 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
2552 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2553 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2554 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm0
2555 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm1
2556 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
2557 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
2558 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3
2559 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm4
2560 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
2561 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3]
2562 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
2563 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2564 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
2565 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
2566 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
2567 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
2568 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2569 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2570 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm0
2571 ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm1
2572 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
2573 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
2574 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3
2575 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm4
2576 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
2577 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3]
2578 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
2579 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
2580 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
2581 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
2582 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
2583 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2584 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm0
2585 ; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm1
2586 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
2587 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
2588 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3
2589 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm4
2590 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
2591 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3]
2592 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7]
2593 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
2594 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
2595 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
2596 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3]
2597 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2598 ; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm3
2599 ; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm1
2600 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5]
2601 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,2,2,3]
2602 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2
2603 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm0
2604 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5]
2605 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3]
2606 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm4[2,3],ymm13[4,5],ymm4[6,7]
2607 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7]
2608 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7]
2609 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2610 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2611 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2612 ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm2
2613 ; AVX2-ONLY-NEXT: vmovaps 160(%rcx), %ymm3
2614 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
2615 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
2616 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm13
2617 ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm0
2618 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[4],ymm0[4],ymm13[5],ymm0[5]
2619 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,3,3]
2620 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7]
2621 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
2622 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[6],ymm0[6],ymm13[7],ymm0[7]
2623 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
2624 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2625 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
2626 ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm2
2627 ; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm3
2628 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
2629 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3]
2630 ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm15
2631 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm0
2632 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5]
2633 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3]
2634 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
2635 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
2636 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[6],ymm0[6],ymm15[7],ymm0[7]
2637 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
2638 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2639 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
2640 ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm2
2641 ; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %ymm3
2642 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
2643 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,2,3]
2644 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm15
2645 ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm0
2646 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5]
2647 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3]
2648 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7]
2649 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
2650 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[6],ymm0[6],ymm15[7],ymm0[7]
2651 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
2652 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
2653 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
2654 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 992(%r8)
2655 ; AVX2-ONLY-NEXT: vmovaps %ymm12, 960(%r8)
2656 ; AVX2-ONLY-NEXT: vmovaps %ymm1, 864(%r8)
2657 ; AVX2-ONLY-NEXT: vmovaps %ymm13, 832(%r8)
2658 ; AVX2-ONLY-NEXT: vmovaps %ymm4, 736(%r8)
2659 ; AVX2-ONLY-NEXT: vmovaps %ymm5, 704(%r8)
2660 ; AVX2-ONLY-NEXT: vmovaps %ymm6, 608(%r8)
2661 ; AVX2-ONLY-NEXT: vmovaps %ymm7, 576(%r8)
2662 ; AVX2-ONLY-NEXT: vmovaps %ymm8, 480(%r8)
2663 ; AVX2-ONLY-NEXT: vmovaps %ymm9, 448(%r8)
2664 ; AVX2-ONLY-NEXT: vmovaps %ymm10, 352(%r8)
2665 ; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%r8)
2666 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2667 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8)
2668 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2669 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8)
2670 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2671 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8)
2672 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2673 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8)
2674 ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2675 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%r8)
2676 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2677 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 896(%r8)
2678 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2679 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 800(%r8)
2680 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2681 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 768(%r8)
2682 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2683 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%r8)
2684 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2685 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%r8)
2686 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2687 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%r8)
2688 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2689 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 512(%r8)
2690 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2691 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%r8)
2692 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2693 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%r8)
2694 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2695 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r8)
2696 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2697 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r8)
2698 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2699 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8)
2700 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2701 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8)
2702 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2703 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8)
2704 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2705 ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8)
2706 ; AVX2-ONLY-NEXT: addq $520, %rsp # imm = 0x208
2707 ; AVX2-ONLY-NEXT: vzeroupper
2708 ; AVX2-ONLY-NEXT: retq
2710 ; AVX512F-LABEL: store_i32_stride4_vf64:
2711 ; AVX512F: # %bb.0:
2712 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
2713 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1
2714 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2
2715 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm3
2716 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm17
2717 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm23
2718 ; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm12
2719 ; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm5
2720 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm22
2721 ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm25
2722 ; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm13
2723 ; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm6
2724 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm21
2725 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm26
2726 ; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm19
2727 ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm9
2728 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23>
2729 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm8
2730 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm14, %zmm8
2731 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u>
2732 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4
2733 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm7, %zmm4
2734 ; AVX512F-NEXT: movb $-86, %al
2735 ; AVX512F-NEXT: kmovw %eax, %k1
2736 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1}
2737 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19>
2738 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm10
2739 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm16, %zmm10
2740 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u>
2741 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8
2742 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm11, %zmm8
2743 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
2744 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = <u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31>
2745 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm20
2746 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm18, %zmm20
2747 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u>
2748 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10
2749 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm15, %zmm10
2750 ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1}
2751 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27>
2752 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm20, %zmm22
2753 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u>
2754 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm21, %zmm0
2755 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1}
2756 ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm22
2757 ; AVX512F-NEXT: vpermt2d %zmm26, %zmm14, %zmm22
2758 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17
2759 ; AVX512F-NEXT: vpermt2d %zmm23, %zmm7, %zmm17
2760 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1}
2761 ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm24
2762 ; AVX512F-NEXT: vpermt2d %zmm26, %zmm16, %zmm24
2763 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22
2764 ; AVX512F-NEXT: vpermt2d %zmm23, %zmm11, %zmm22
2765 ; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1}
2766 ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm27
2767 ; AVX512F-NEXT: vpermt2d %zmm26, %zmm18, %zmm27
2768 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24
2769 ; AVX512F-NEXT: vpermt2d %zmm23, %zmm15, %zmm24
2770 ; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm24 {%k1}
2771 ; AVX512F-NEXT: vpermt2d %zmm26, %zmm20, %zmm25
2772 ; AVX512F-NEXT: vpermt2d %zmm23, %zmm21, %zmm1
2773 ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1}
2774 ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23
2775 ; AVX512F-NEXT: vpermt2d %zmm19, %zmm14, %zmm23
2776 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25
2777 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm7, %zmm25
2778 ; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1}
2779 ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23
2780 ; AVX512F-NEXT: vpermt2d %zmm19, %zmm16, %zmm23
2781 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm26
2782 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm11, %zmm26
2783 ; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
2784 ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23
2785 ; AVX512F-NEXT: vpermt2d %zmm19, %zmm18, %zmm23
2786 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm27
2787 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm15, %zmm27
2788 ; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1}
2789 ; AVX512F-NEXT: vpermt2d %zmm19, %zmm20, %zmm13
2790 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm21, %zmm2
2791 ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 {%k1}
2792 ; AVX512F-NEXT: vpermi2d %zmm9, %zmm6, %zmm14
2793 ; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm7
2794 ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
2795 ; AVX512F-NEXT: vpermi2d %zmm9, %zmm6, %zmm16
2796 ; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm11
2797 ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1}
2798 ; AVX512F-NEXT: vpermi2d %zmm9, %zmm6, %zmm18
2799 ; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm15
2800 ; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1}
2801 ; AVX512F-NEXT: vpermt2d %zmm9, %zmm20, %zmm6
2802 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm21, %zmm3
2803 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
2804 ; AVX512F-NEXT: vmovdqa64 %zmm3, 896(%r8)
2805 ; AVX512F-NEXT: vmovdqa64 %zmm15, 960(%r8)
2806 ; AVX512F-NEXT: vmovdqa64 %zmm11, 768(%r8)
2807 ; AVX512F-NEXT: vmovdqa64 %zmm7, 832(%r8)
2808 ; AVX512F-NEXT: vmovdqa64 %zmm2, 640(%r8)
2809 ; AVX512F-NEXT: vmovdqa64 %zmm27, 704(%r8)
2810 ; AVX512F-NEXT: vmovdqa64 %zmm26, 512(%r8)
2811 ; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%r8)
2812 ; AVX512F-NEXT: vmovdqa64 %zmm1, 384(%r8)
2813 ; AVX512F-NEXT: vmovdqa64 %zmm24, 448(%r8)
2814 ; AVX512F-NEXT: vmovdqa64 %zmm22, 256(%r8)
2815 ; AVX512F-NEXT: vmovdqa64 %zmm17, 320(%r8)
2816 ; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r8)
2817 ; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%r8)
2818 ; AVX512F-NEXT: vmovdqa64 %zmm8, (%r8)
2819 ; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%r8)
2820 ; AVX512F-NEXT: vzeroupper
2821 ; AVX512F-NEXT: retq
2823 ; AVX512BW-LABEL: store_i32_stride4_vf64:
2824 ; AVX512BW: # %bb.0:
2825 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2826 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
2827 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
2828 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
2829 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm17
2830 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm23
2831 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm12
2832 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm5
2833 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm22
2834 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm25
2835 ; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm13
2836 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm6
2837 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm21
2838 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26
2839 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm19
2840 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm9
2841 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23>
2842 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8
2843 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm14, %zmm8
2844 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u>
2845 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4
2846 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm4
2847 ; AVX512BW-NEXT: movb $-86, %al
2848 ; AVX512BW-NEXT: kmovd %eax, %k1
2849 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1}
2850 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19>
2851 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10
2852 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm16, %zmm10
2853 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u>
2854 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8
2855 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm11, %zmm8
2856 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
2857 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31>
2858 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20
2859 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm18, %zmm20
2860 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u>
2861 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10
2862 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm15, %zmm10
2863 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1}
2864 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27>
2865 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm20, %zmm22
2866 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u>
2867 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm0
2868 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1}
2869 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22
2870 ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm14, %zmm22
2871 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17
2872 ; AVX512BW-NEXT: vpermt2d %zmm23, %zmm7, %zmm17
2873 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1}
2874 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm24
2875 ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm16, %zmm24
2876 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22
2877 ; AVX512BW-NEXT: vpermt2d %zmm23, %zmm11, %zmm22
2878 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1}
2879 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27
2880 ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm18, %zmm27
2881 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24
2882 ; AVX512BW-NEXT: vpermt2d %zmm23, %zmm15, %zmm24
2883 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm24 {%k1}
2884 ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm25
2885 ; AVX512BW-NEXT: vpermt2d %zmm23, %zmm21, %zmm1
2886 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1}
2887 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23
2888 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm14, %zmm23
2889 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25
2890 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm7, %zmm25
2891 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1}
2892 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23
2893 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm16, %zmm23
2894 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26
2895 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm11, %zmm26
2896 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
2897 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23
2898 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm18, %zmm23
2899 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27
2900 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm15, %zmm27
2901 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1}
2902 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm20, %zmm13
2903 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm2
2904 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 {%k1}
2905 ; AVX512BW-NEXT: vpermi2d %zmm9, %zmm6, %zmm14
2906 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm7
2907 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k1}
2908 ; AVX512BW-NEXT: vpermi2d %zmm9, %zmm6, %zmm16
2909 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm11
2910 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1}
2911 ; AVX512BW-NEXT: vpermi2d %zmm9, %zmm6, %zmm18
2912 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm15
2913 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1}
2914 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm6
2915 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm21, %zmm3
2916 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
2917 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 896(%r8)
2918 ; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%r8)
2919 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 768(%r8)
2920 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 832(%r8)
2921 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%r8)
2922 ; AVX512BW-NEXT: vmovdqa64 %zmm27, 704(%r8)
2923 ; AVX512BW-NEXT: vmovdqa64 %zmm26, 512(%r8)
2924 ; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r8)
2925 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8)
2926 ; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%r8)
2927 ; AVX512BW-NEXT: vmovdqa64 %zmm22, 256(%r8)
2928 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 320(%r8)
2929 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8)
2930 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r8)
2931 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8)
2932 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8)
2933 ; AVX512BW-NEXT: vzeroupper
2934 ; AVX512BW-NEXT: retq
2935 %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64
2936 %in.vec1 = load <64 x i32>, ptr %in.vecptr1, align 64
2937 %in.vec2 = load <64 x i32>, ptr %in.vecptr2, align 64
2938 %in.vec3 = load <64 x i32>, ptr %in.vecptr3, align 64
2939 %1 = shufflevector <64 x i32> %in.vec0, <64 x i32> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
2940 %2 = shufflevector <64 x i32> %in.vec2, <64 x i32> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
2941 %3 = shufflevector <128 x i32> %1, <128 x i32> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
2942 %interleaved.vec = shufflevector <256 x i32> %3, <256 x i32> poison, <256 x i32> <i32 0, i32 64, i32 128, i32 192, i32 1, i32 65, i32 129, i32 193, i32 2, i32 66, i32 130, i32 194, i32 3, i32 67, i32 131, i32 195, i32 4, i32 68, i32 132, i32 196, i32 5, i32 69, i32 133, i32 197, i32 6, i32 70, i32 134, i32 198, i32 7, i32 71, i32 135, i32 199, i32 8, i32 72, i32 136, i32 200, i32 9, i32 73, i32 137, i32 201, i32 10, i32 74, i32 138, i32 202, i32 11, i32 75, i32 139, i32 203, i32 12, i32 76, i32 140, i32 204, i32 13, i32 77, i32 141, i32 205, i32 14, i32 78, i32 142, i32 206, i32 15, i32 79, i32 143, i32 207, i32 16, i32 80, i32 144, i32 208, i32 17, i32 81, i32 145, i32 209, i32 18, i32 82, i32 146, i32 210, i32 19, i32 83, i32 147, i32 211, i32 20, i32 84, i32 148, i32 212, i32 21, i32 85, i32 149, i32 213, i32 22, i32 86, i32 150, i32 214, i32 23, i32 87, i32 151, i32 215, i32 24, i32 88, i32 152, i32 216, i32 25, i32 89, i32 153, i32 217, i32 26, i32 90, i32 154, i32 218, i32 27, i32 91, i32 155, i32 219, i32 28, i32 92, i32 156, i32 220, i32 29, i32 93, i32 157, i32 221, i32 30, i32 94, i32 158, i32 222, i32 31, i32 95, i32 159, i32 223, i32 32, i32 96, i32 160, i32 224, i32 33, i32 97, i32 161, i32 225, i32 34, i32 98, i32 162, i32 226, i32 35, i32 99, i32 163, i32 227, i32 36, i32 100, i32 164, i32 228, i32 37, i32 101, i32 165, i32 229, i32 38, i32 102, i32 166, i32 230, i32 39, i32 103, i32 167, i32 231, i32 40, i32 104, i32 168, i32 232, i32 41, i32 105, i32 169, i32 233, i32 42, i32 106, i32 170, i32 234, i32 43, i32 107, i32 171, i32 235, i32 44, i32 108, i32 172, i32 236, i32 45, i32 109, i32 173, i32 237, i32 46, i32 110, i32 174, i32 238, i32 47, i32 111, i32 175, i32 239, i32 48, i32 112, i32 176, i32 240, i32 49, i32 113, i32 177, i32 241, i32 50, i32 114, i32 178, i32 242, i32 51, i32 115, i32 179, i32 243, i32 52, i32 116, i32 180, i32 244, i32 53, i32 117, i32 181, i32 245, i32 54, i32 118, i32 182, i32 246, i32 55, i32 119, i32 183, i32 247, i32 56, i32 120, i32 184, i32 248, i32 57, i32 121, i32 185, i32 249, i32 58, i32 122, i32 186, i32 250, i32 59, i32 123, i32 187, i32 251, i32 60, i32 124, i32 188, i32 252, i32 61, i32 125, i32 189, i32 253, i32 62, i32 126, i32 190, i32 254, i32 63, i32 127, i32 191, i32 255>
2943 store <256 x i32> %interleaved.vec, ptr %out.vec, align 64
2944 ret void
2945 }
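; The <256 x i32> shuffle above performs the stride-4 interleave: element 4*i+j of
; %interleaved.vec is element i of source vector j (mask 0,64,128,192, 1,65,129,193, ...).
; As a rough sketch only (variable names are illustrative and not part of this test),
; the scalar loop the LoopVectorizer would turn into this interleaved store is:
;   for (int i = 0; i < 64; ++i) {
;     out[4*i+0] = in0[i];
;     out[4*i+1] = in1[i];
;     out[4*i+2] = in2[i];
;     out[4*i+3] = in3[i];
;   }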
2946 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
2950 ; AVX512BW-FAST: {{.*}}
2951 ; AVX512BW-ONLY-FAST: {{.*}}
2952 ; AVX512BW-ONLY-SLOW: {{.*}}
2953 ; AVX512BW-SLOW: {{.*}}
2954 ; AVX512DQ-FAST: {{.*}}
2955 ; AVX512DQ-SLOW: {{.*}}
2956 ; AVX512DQBW-FAST: {{.*}}
2957 ; AVX512DQBW-SLOW: {{.*}}
2958 ; AVX512F-FAST: {{.*}}
2959 ; AVX512F-ONLY-FAST: {{.*}}
2960 ; AVX512F-ONLY-SLOW: {{.*}}
2961 ; AVX512F-SLOW: {{.*}}
2962 ; FALLBACK0: {{.*}}
2963 ; FALLBACK1: {{.*}}
2964 ; FALLBACK10: {{.*}}
2965 ; FALLBACK11: {{.*}}
2966 ; FALLBACK12: {{.*}}