; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
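; These tests cover strided extraction with a non-zero starting offset: each
; function loads a 256-bit vector from memory, keeps every Nth element starting
; at element 1 (or 2, 3, ...), and stores the narrowed result.
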
define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX-NEXT: vmovaps %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps (%rdi), %xmm0
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX512-NEXT: vmovaps %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,4,5,12,13]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [2,6,10,14,2,3,10,11]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,11,15,2,3,10,11]
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}