; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefixes=AVX,XOP
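
; Insert a <2 x i16> load, reinterpreted as <4 x i8>, into a <7 x i8> vector
; in memory starting at element 2; the odd-sized result is stored back as a
; dword, a word, and a byte.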
define void @insert_v7i8_v2i16_2(<7 x i8> *%a0, <2 x i16> *%a1) nounwind {
; SSE-LABEL: insert_v7i8_v2i16_2:
; SSE: # %bb.0:
; SSE-NEXT: movl (%rsi), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movq (%rdi), %rcx
; SSE-NEXT: movq %rcx, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: shrq $48, %rcx
; SSE-NEXT: movb %cl, 6(%rdi)
; SSE-NEXT: shrl $16, %eax
; SSE-NEXT: movw %ax, 4(%rdi)
; SSE-NEXT: movd %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: insert_v7i8_v2i16_2:
; AVX: # %bb.0:
; AVX-NEXT: movl (%rsi), %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: movq (%rdi), %rcx
; AVX-NEXT: vmovq %rcx, %xmm1
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: shrq $48, %rcx
; AVX-NEXT: movb %cl, 6(%rdi)
; AVX-NEXT: shrl $16, %eax
; AVX-NEXT: movw %ax, 4(%rdi)
; AVX-NEXT: vmovd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: insert_v7i8_v2i16_2:
; AVX512: # %bb.0:
; AVX512-NEXT: movl (%rsi), %eax
; AVX512-NEXT: vmovd %eax, %xmm0
; AVX512-NEXT: movq (%rdi), %rcx
; AVX512-NEXT: vmovq %rcx, %xmm1
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: shrq $48, %rcx
; AVX512-NEXT: movb %cl, 6(%rdi)
; AVX512-NEXT: shrl $16, %eax
; AVX512-NEXT: movw %ax, 4(%rdi)
; AVX512-NEXT: vmovd %xmm0, (%rdi)
; AVX512-NEXT: retq
  %1 = load <2 x i16>, <2 x i16> *%a1
  %2 = bitcast <2 x i16> %1 to <4 x i8>
  %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef>
  %4 = load <7 x i8>, <7 x i8> *%a0
  %5 = shufflevector <7 x i8> %4, <7 x i8> %3, <7 x i32> <i32 0, i32 1, i32 7, i32 8, i32 9, i32 10, i32 6>
  store <7 x i8> %5, <7 x i8>* %a0
  ret void
}

%struct.Mat4 = type { %struct.storage }
%struct.storage = type { [16 x float] }
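
; PR40815: copy a 4x4 float matrix while reversing the order of its four
; 128-bit rows. SSE/AVX lower this to four 16-byte loads and stores; AVX-512
; rebuilds a zmm register and performs a single 64-byte store.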
define void @PR40815(%struct.Mat4* nocapture readonly dereferenceable(64), %struct.Mat4* nocapture dereferenceable(64)) {
; SSE-LABEL: PR40815:
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps 48(%rdi), %xmm3
; SSE-NEXT: movaps %xmm3, (%rsi)
; SSE-NEXT: movaps %xmm2, 16(%rsi)
; SSE-NEXT: movaps %xmm1, 32(%rsi)
; SSE-NEXT: movaps %xmm0, 48(%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: PR40815:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vmovaps 16(%rdi), %xmm1
; AVX-NEXT: vmovaps 32(%rdi), %xmm2
; AVX-NEXT: vmovaps 48(%rdi), %xmm3
; AVX-NEXT: vmovaps %xmm2, 16(%rsi)
; AVX-NEXT: vmovaps %xmm3, (%rsi)
; AVX-NEXT: vmovaps %xmm0, 48(%rsi)
; AVX-NEXT: vmovaps %xmm1, 32(%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: PR40815:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps 16(%rdi), %xmm0
; AVX512-NEXT: vmovaps 48(%rdi), %xmm1
; AVX512-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
; AVX512-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1
; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: vmovups %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %3 = bitcast %struct.Mat4* %0 to <16 x float>*
  %4 = load <16 x float>, <16 x float>* %3, align 64
  %5 = shufflevector <16 x float> %4, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %6 = getelementptr inbounds %struct.Mat4, %struct.Mat4* %1, i64 0, i32 0, i32 0, i64 4
  %7 = bitcast <16 x float> %4 to <4 x i128>
  %8 = extractelement <4 x i128> %7, i32 1
  %9 = getelementptr inbounds %struct.Mat4, %struct.Mat4* %1, i64 0, i32 0, i32 0, i64 8
  %10 = bitcast <16 x float> %4 to <4 x i128>
  %11 = extractelement <4 x i128> %10, i32 2
  %12 = getelementptr inbounds %struct.Mat4, %struct.Mat4* %1, i64 0, i32 0, i32 0, i64 12
  %13 = bitcast float* %12 to <4 x float>*
  %14 = bitcast <16 x float> %4 to <4 x i128>
  %15 = extractelement <4 x i128> %14, i32 3
  %16 = bitcast %struct.Mat4* %1 to i128*
  store i128 %15, i128* %16, align 16
  %17 = bitcast float* %6 to i128*
  store i128 %11, i128* %17, align 16
  %18 = bitcast float* %9 to i128*
  store i128 %8, i128* %18, align 16
  store <4 x float> %5, <4 x float>* %13, align 16
  ret void
}
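
; PR42819: the low three elements of an <8 x i32> load end up in the top three
; lanes of an otherwise zero <16 x i32>; AVX-512 can express this as a masked
; vpexpandd.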
define <16 x i32> @PR42819(<8 x i32>* %a0) {
; SSE-LABEL: PR42819:
; SSE: # %bb.0:
; SSE-NEXT: movdqu (%rdi), %xmm3
; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11]
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: retq
;
; AVX-LABEL: PR42819:
; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,1,2]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: PR42819:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqu (%rdi), %ymm0
; AVX512-NEXT: movw $-8192, %ax # imm = 0xE000
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: retq
  %1 = load <8 x i32>, <8 x i32>* %a0, align 4
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <16 x i32> zeroinitializer, <16 x i32> %2, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
  ret <16 x i32> %3
}

@b = dso_local local_unnamed_addr global i32 0, align 4
@c = dso_local local_unnamed_addr global [49 x i32] zeroinitializer, align 16
@d = dso_local local_unnamed_addr global [49 x i32] zeroinitializer, align 16
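
; PR42833: updates @c and @d with a mix of vector adds, shifts left by one,
; and subtracts in which lane 0 of the first vector depends on the scalar @b.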
define void @PR42833() {
; SSE2-LABEL: PR42833:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa c+144(%rip), %xmm1
; SSE2-NEXT: movdqa c+128(%rip), %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: addl b(%rip), %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: movdqa d+144(%rip), %xmm4
; SSE2-NEXT: psubd %xmm1, %xmm4
; SSE2-NEXT: paddd %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: paddd %xmm0, %xmm5
; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
; SSE2-NEXT: movdqa %xmm1, c+144(%rip)
; SSE2-NEXT: movaps %xmm5, c+128(%rip)
; SSE2-NEXT: movdqa c+160(%rip), %xmm1
; SSE2-NEXT: movdqa c+176(%rip), %xmm3
; SSE2-NEXT: movdqa d+160(%rip), %xmm5
; SSE2-NEXT: movdqa d+176(%rip), %xmm6
; SSE2-NEXT: movdqa d+128(%rip), %xmm7
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE2-NEXT: psubd %xmm0, %xmm7
; SSE2-NEXT: psubd %xmm3, %xmm6
; SSE2-NEXT: psubd %xmm1, %xmm5
; SSE2-NEXT: movdqa %xmm5, d+160(%rip)
; SSE2-NEXT: movdqa %xmm6, d+176(%rip)
; SSE2-NEXT: movdqa %xmm4, d+144(%rip)
; SSE2-NEXT: movdqa %xmm7, d+128(%rip)
; SSE2-NEXT: paddd %xmm3, %xmm3
; SSE2-NEXT: paddd %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, c+160(%rip)
; SSE2-NEXT: movdqa %xmm3, c+176(%rip)
; SSE2-NEXT: retq
;
; SSE42-LABEL: PR42833:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa c+144(%rip), %xmm0
; SSE42-NEXT: movdqa c+128(%rip), %xmm1
; SSE42-NEXT: movd %xmm1, %eax
; SSE42-NEXT: addl b(%rip), %eax
; SSE42-NEXT: movd %eax, %xmm2
; SSE42-NEXT: paddd %xmm1, %xmm2
; SSE42-NEXT: movdqa d+144(%rip), %xmm3
; SSE42-NEXT: psubd %xmm0, %xmm3
; SSE42-NEXT: paddd %xmm0, %xmm0
; SSE42-NEXT: movdqa %xmm1, %xmm4
; SSE42-NEXT: paddd %xmm1, %xmm4
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
; SSE42-NEXT: movdqa %xmm0, c+144(%rip)
; SSE42-NEXT: movdqa %xmm4, c+128(%rip)
; SSE42-NEXT: movdqa c+160(%rip), %xmm0
; SSE42-NEXT: movdqa c+176(%rip), %xmm2
; SSE42-NEXT: movdqa d+160(%rip), %xmm4
; SSE42-NEXT: movdqa d+176(%rip), %xmm5
; SSE42-NEXT: movdqa d+128(%rip), %xmm6
; SSE42-NEXT: pinsrd $0, %eax, %xmm1
; SSE42-NEXT: psubd %xmm1, %xmm6
; SSE42-NEXT: psubd %xmm2, %xmm5
; SSE42-NEXT: psubd %xmm0, %xmm4
; SSE42-NEXT: movdqa %xmm4, d+160(%rip)
; SSE42-NEXT: movdqa %xmm5, d+176(%rip)
; SSE42-NEXT: movdqa %xmm3, d+144(%rip)
; SSE42-NEXT: movdqa %xmm6, d+128(%rip)
; SSE42-NEXT: paddd %xmm2, %xmm2
; SSE42-NEXT: paddd %xmm0, %xmm0
; SSE42-NEXT: movdqa %xmm0, c+160(%rip)
; SSE42-NEXT: movdqa %xmm2, c+176(%rip)
; SSE42-NEXT: retq
;
; AVX1-LABEL: PR42833:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa c+128(%rip), %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: addl b(%rip), %eax
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa c+144(%rip), %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
; AVX1-NEXT: vmovdqa d+144(%rip), %xmm2
; AVX1-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmovups %ymm1, c+128(%rip)
; AVX1-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa d+128(%rip), %xmm1
; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqa d+176(%rip), %xmm1
; AVX1-NEXT: vmovdqa c+176(%rip), %xmm3
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa d+160(%rip), %xmm4
; AVX1-NEXT: vmovdqa c+160(%rip), %xmm5
; AVX1-NEXT: vpsubd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vmovdqa %xmm2, d+144(%rip)
; AVX1-NEXT: vmovdqa %xmm4, d+160(%rip)
; AVX1-NEXT: vmovdqa %xmm1, d+176(%rip)
; AVX1-NEXT: vmovdqa %xmm0, d+128(%rip)
; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm0
; AVX1-NEXT: vpaddd %xmm5, %xmm5, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, c+160(%rip)
; AVX1-NEXT: vmovdqa %xmm0, c+176(%rip)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR42833:
; AVX2: # %bb.0:
; AVX2-NEXT: movl b(%rip), %eax
; AVX2-NEXT: vmovdqu c+128(%rip), %ymm0
; AVX2-NEXT: addl c+128(%rip), %eax
; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm3
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm2, c+128(%rip)
; AVX2-NEXT: vmovdqu c+160(%rip), %ymm2
; AVX2-NEXT: vmovdqu d+160(%rip), %ymm3
; AVX2-NEXT: vmovdqu d+128(%rip), %ymm4
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
; AVX2-NEXT: vpsubd %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpsubd %ymm2, %ymm3, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, d+160(%rip)
; AVX2-NEXT: vmovdqu %ymm0, d+128(%rip)
; AVX2-NEXT: vpaddd %ymm2, %ymm2, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, c+160(%rip)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR42833:
; AVX512: # %bb.0:
; AVX512-NEXT: movl b(%rip), %eax
; AVX512-NEXT: vmovdqu c+128(%rip), %ymm0
; AVX512-NEXT: vmovdqu64 c+128(%rip), %zmm1
; AVX512-NEXT: addl c+128(%rip), %eax
; AVX512-NEXT: vmovd %eax, %xmm2
; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm2
; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7]
; AVX512-NEXT: vmovdqa c+128(%rip), %xmm2
; AVX512-NEXT: vmovdqu %ymm0, c+128(%rip)
; AVX512-NEXT: vmovdqu c+160(%rip), %ymm0
; AVX512-NEXT: vmovdqu64 d+128(%rip), %zmm3
; AVX512-NEXT: vpinsrd $0, %eax, %xmm2, %xmm2
; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1
; AVX512-NEXT: vpsubd %zmm1, %zmm3, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, d+128(%rip)
; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, c+160(%rip)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; XOP-LABEL: PR42833:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa c+128(%rip), %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: addl b(%rip), %eax
; XOP-NEXT: vmovd %eax, %xmm1
; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpaddd %xmm0, %xmm0, %xmm2
; XOP-NEXT: vmovdqa c+144(%rip), %xmm3
; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm3
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
; XOP-NEXT: vmovdqa d+144(%rip), %xmm2
; XOP-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2
; XOP-NEXT: vmovups %ymm1, c+128(%rip)
; XOP-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0
; XOP-NEXT: vmovdqa d+128(%rip), %xmm1
; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; XOP-NEXT: vmovdqa d+176(%rip), %xmm1
; XOP-NEXT: vmovdqa c+176(%rip), %xmm3
; XOP-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; XOP-NEXT: vmovdqa d+160(%rip), %xmm4
; XOP-NEXT: vmovdqa c+160(%rip), %xmm5
; XOP-NEXT: vpsubd %xmm5, %xmm4, %xmm4
; XOP-NEXT: vmovdqa %xmm2, d+144(%rip)
; XOP-NEXT: vmovdqa %xmm4, d+160(%rip)
; XOP-NEXT: vmovdqa %xmm1, d+176(%rip)
; XOP-NEXT: vmovdqa %xmm0, d+128(%rip)
; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm0
; XOP-NEXT: vpaddd %xmm5, %xmm5, %xmm1
; XOP-NEXT: vmovdqa %xmm1, c+160(%rip)
; XOP-NEXT: vmovdqa %xmm0, c+176(%rip)
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
  %1 = load i32, i32* @b, align 4
  %2 = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 32) to <8 x i32>*), align 16
  %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <16 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = extractelement <8 x i32> %2, i32 0
  %5 = add i32 %4, %1
  %6 = insertelement <8 x i32> <i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %5, i32 0
  %7 = add <8 x i32> %2, %6
  %8 = shl <8 x i32> %2, %6
  %9 = shufflevector <8 x i32> %7, <8 x i32> %8, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <8 x i32> %9, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 32) to <8 x i32>*), align 16
  %10 = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 40) to <8 x i32>*), align 16
  %11 = shufflevector <8 x i32> %10, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %12 = load <16 x i32>, <16 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @d, i64 0, i64 32) to <16 x i32>*), align 16
  %13 = insertelement <16 x i32> %3, i32 %5, i32 0
  %14 = shufflevector <16 x i32> %13, <16 x i32> %11, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %15 = sub <16 x i32> %12, %14
  store <16 x i32> %15, <16 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @d, i64 0, i64 32) to <16 x i32>*), align 16
  %16 = shl <8 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  store <8 x i32> %16, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 40) to <8 x i32>*), align 16
  ret void
}