1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX
8 ; 32-bit SSE tests to make sure we do reasonable things.
9 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=X86-SSE,X86-SSE1
10 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86-SSE,X86-SSE41
; Loads the doubles at element indices 2 and 3 of %ptr and inserts them into
; lanes 0 and 1 of a <2 x double>. The x86-64 checks and the i686 +sse4.1
; checks expect one unaligned 16-byte load at byte offset 16; the i686 checks
; with only +sse expect two x87 fldl loads (offsets 16 and 24) instead.
12 define <2 x double> @merge_2f64_f64_23(ptr %ptr) nounwind uwtable noinline ssp {
13 ; SSE-LABEL: merge_2f64_f64_23:
15 ; SSE-NEXT: movups 16(%rdi), %xmm0
18 ; AVX-LABEL: merge_2f64_f64_23:
20 ; AVX-NEXT: vmovups 16(%rdi), %xmm0
23 ; X86-SSE1-LABEL: merge_2f64_f64_23:
25 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
26 ; X86-SSE1-NEXT: fldl 16(%eax)
27 ; X86-SSE1-NEXT: fldl 24(%eax)
28 ; X86-SSE1-NEXT: fxch %st(1)
31 ; X86-SSE41-LABEL: merge_2f64_f64_23:
33 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
34 ; X86-SSE41-NEXT: movups 16(%eax), %xmm0
35 ; X86-SSE41-NEXT: retl
36 %ptr0 = getelementptr inbounds double, ptr %ptr, i64 2
37 %ptr1 = getelementptr inbounds double, ptr %ptr, i64 3
38 %val0 = load double, ptr %ptr0
39 %val1 = load double, ptr %ptr1
40 %res0 = insertelement <2 x double> undef, double %val0, i32 0
41 %res1 = insertelement <2 x double> %res0, double %val1, i32 1
42 ret <2 x double> %res1
; Loads the i64 values at element indices 1 and 2 of %ptr and inserts them into
; lanes 0 and 1 of a <2 x i64>. The x86-64 checks and the i686 +sse4.1 checks
; expect one unaligned 16-byte load at byte offset 8. The i686 checks with only
; +sse expect four 32-bit integer loads (offsets 8..20) whose values are stored
; to offsets 0..12 of the buffer addressed by %eax.
45 define <2 x i64> @merge_2i64_i64_12(ptr %ptr) nounwind uwtable noinline ssp {
46 ; SSE-LABEL: merge_2i64_i64_12:
48 ; SSE-NEXT: movups 8(%rdi), %xmm0
51 ; AVX-LABEL: merge_2i64_i64_12:
53 ; AVX-NEXT: vmovups 8(%rdi), %xmm0
56 ; X86-SSE1-LABEL: merge_2i64_i64_12:
58 ; X86-SSE1-NEXT: pushl %edi
59 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
60 ; X86-SSE1-NEXT: pushl %esi
61 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
62 ; X86-SSE1-NEXT: .cfi_offset %esi, -12
63 ; X86-SSE1-NEXT: .cfi_offset %edi, -8
64 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
65 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
66 ; X86-SSE1-NEXT: movl 8(%ecx), %edx
67 ; X86-SSE1-NEXT: movl 12(%ecx), %esi
68 ; X86-SSE1-NEXT: movl 16(%ecx), %edi
69 ; X86-SSE1-NEXT: movl 20(%ecx), %ecx
70 ; X86-SSE1-NEXT: movl %ecx, 12(%eax)
71 ; X86-SSE1-NEXT: movl %edi, 8(%eax)
72 ; X86-SSE1-NEXT: movl %esi, 4(%eax)
73 ; X86-SSE1-NEXT: movl %edx, (%eax)
74 ; X86-SSE1-NEXT: popl %esi
75 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
76 ; X86-SSE1-NEXT: popl %edi
77 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
78 ; X86-SSE1-NEXT: retl $4
80 ; X86-SSE41-LABEL: merge_2i64_i64_12:
82 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
83 ; X86-SSE41-NEXT: movups 8(%eax), %xmm0
84 ; X86-SSE41-NEXT: retl
85 %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 1
86 %ptr1 = getelementptr inbounds i64, ptr %ptr, i64 2
87 %val0 = load i64, ptr %ptr0
88 %val1 = load i64, ptr %ptr1
89 %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
90 %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
; Loads the floats at element indices 2, 3, 4 and 5 of %ptr and inserts them
; into lanes 0..3 of a <4 x float>. Every visible check group (x86-64 and the
; shared i686 prefix) expects a single unaligned 16-byte load at byte offset 8.
94 define <4 x float> @merge_4f32_f32_2345(ptr %ptr) nounwind uwtable noinline ssp {
95 ; SSE-LABEL: merge_4f32_f32_2345:
97 ; SSE-NEXT: movups 8(%rdi), %xmm0
100 ; AVX-LABEL: merge_4f32_f32_2345:
102 ; AVX-NEXT: vmovups 8(%rdi), %xmm0
105 ; X86-SSE-LABEL: merge_4f32_f32_2345:
107 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
108 ; X86-SSE-NEXT: movups 8(%eax), %xmm0
110 %ptr0 = getelementptr inbounds float, ptr %ptr, i64 2
111 %ptr1 = getelementptr inbounds float, ptr %ptr, i64 3
112 %ptr2 = getelementptr inbounds float, ptr %ptr, i64 4
113 %ptr3 = getelementptr inbounds float, ptr %ptr, i64 5
114 %val0 = load float, ptr %ptr0
115 %val1 = load float, ptr %ptr1
116 %val2 = load float, ptr %ptr2
117 %val3 = load float, ptr %ptr3
118 %res0 = insertelement <4 x float> undef, float %val0, i32 0
119 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
120 %res2 = insertelement <4 x float> %res1, float %val2, i32 2
121 %res3 = insertelement <4 x float> %res2, float %val3, i32 3
122 ret <4 x float> %res3
; Loads the float at element index 3 of %ptr into lane 0 and inserts the
; constant 0.0 into lane 1; lanes 2 and 3 are never written and stay undef.
; Every visible check group expects a single scalar movss/vmovss load whose
; decoded shuffle comment reads mem[0],zero,zero,zero.
125 define <4 x float> @merge_4f32_f32_3zuu(ptr %ptr) nounwind uwtable noinline ssp {
126 ; SSE-LABEL: merge_4f32_f32_3zuu:
128 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
131 ; AVX-LABEL: merge_4f32_f32_3zuu:
133 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
136 ; X86-SSE-LABEL: merge_4f32_f32_3zuu:
138 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
139 ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
141 %ptr0 = getelementptr inbounds float, ptr %ptr, i64 3
142 %val0 = load float, ptr %ptr0
143 %res0 = insertelement <4 x float> undef, float %val0, i32 0
144 %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
145 ret <4 x float> %res1
; Loads the floats at element indices 3 and 4 of %ptr into lanes 0 and 1 of a
; <4 x float>; lanes 2 and 3 are never written and stay undef. The x86-64
; checks and the i686 +sse4.1 checks expect one movsd/vmovsd load; the i686
; checks with only +sse expect xorps followed by movlps.
148 define <4 x float> @merge_4f32_f32_34uu(ptr %ptr) nounwind uwtable noinline ssp {
149 ; SSE-LABEL: merge_4f32_f32_34uu:
151 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
154 ; AVX-LABEL: merge_4f32_f32_34uu:
156 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
159 ; X86-SSE1-LABEL: merge_4f32_f32_34uu:
161 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
162 ; X86-SSE1-NEXT: xorps %xmm0, %xmm0
163 ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
164 ; X86-SSE1-NEXT: retl
166 ; X86-SSE41-LABEL: merge_4f32_f32_34uu:
167 ; X86-SSE41: # %bb.0:
168 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
169 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
170 ; X86-SSE41-NEXT: retl
171 %ptr0 = getelementptr inbounds float, ptr %ptr, i64 3
172 %ptr1 = getelementptr inbounds float, ptr %ptr, i64 4
173 %val0 = load float, ptr %ptr0
174 %val1 = load float, ptr %ptr1
175 %res0 = insertelement <4 x float> undef, float %val0, i32 0
176 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
177 ret <4 x float> %res1
; Loads the floats at element indices 3, 4 and 6 of %ptr into lanes 0, 1 and 3
; of a zero-initialized <4 x float>, so lane 2 keeps the value zero. The checks
; expect a 16-byte load at byte offset 12 combined with a zeroed register:
; two shufps on the SSE2 and i686 +sse-only runs, one blendps on the SSE4.1
; runs, and a vblendps with a memory operand on the AVX runs.
180 define <4 x float> @merge_4f32_f32_34z6(ptr %ptr) nounwind uwtable noinline ssp {
181 ; SSE2-LABEL: merge_4f32_f32_34z6:
183 ; SSE2-NEXT: movups 12(%rdi), %xmm0
184 ; SSE2-NEXT: xorps %xmm1, %xmm1
185 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
186 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
189 ; SSE41-LABEL: merge_4f32_f32_34z6:
191 ; SSE41-NEXT: movups 12(%rdi), %xmm1
192 ; SSE41-NEXT: xorps %xmm0, %xmm0
193 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
196 ; AVX-LABEL: merge_4f32_f32_34z6:
198 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
199 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3]
202 ; X86-SSE1-LABEL: merge_4f32_f32_34z6:
204 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
205 ; X86-SSE1-NEXT: movups 12(%eax), %xmm0
206 ; X86-SSE1-NEXT: xorps %xmm1, %xmm1
207 ; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
208 ; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
209 ; X86-SSE1-NEXT: retl
211 ; X86-SSE41-LABEL: merge_4f32_f32_34z6:
212 ; X86-SSE41: # %bb.0:
213 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
214 ; X86-SSE41-NEXT: movups 12(%eax), %xmm1
215 ; X86-SSE41-NEXT: xorps %xmm0, %xmm0
216 ; X86-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
217 ; X86-SSE41-NEXT: retl
218 %ptr0 = getelementptr inbounds float, ptr %ptr, i64 3
219 %ptr1 = getelementptr inbounds float, ptr %ptr, i64 4
220 %ptr3 = getelementptr inbounds float, ptr %ptr, i64 6
221 %val0 = load float, ptr %ptr0
222 %val1 = load float, ptr %ptr1
223 %val3 = load float, ptr %ptr3
224 %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
225 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
226 %res3 = insertelement <4 x float> %res1, float %val3, i32 3
227 ret <4 x float> %res3
; Loads the floats at element indices 4 and 5 of %ptr into lanes 0 and 1 of a
; zero-initialized <4 x float>, so lanes 2 and 3 keep the value zero. The
; x86-64 checks and the i686 +sse4.1 checks expect one movsd/vmovsd load; the
; i686 checks with only +sse expect xorps followed by movlps.
230 define <4 x float> @merge_4f32_f32_45zz(ptr %ptr) nounwind uwtable noinline ssp {
231 ; SSE-LABEL: merge_4f32_f32_45zz:
233 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
236 ; AVX-LABEL: merge_4f32_f32_45zz:
238 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
241 ; X86-SSE1-LABEL: merge_4f32_f32_45zz:
243 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
244 ; X86-SSE1-NEXT: xorps %xmm0, %xmm0
245 ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
246 ; X86-SSE1-NEXT: retl
248 ; X86-SSE41-LABEL: merge_4f32_f32_45zz:
249 ; X86-SSE41: # %bb.0:
250 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
251 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
252 ; X86-SSE41-NEXT: retl
253 %ptr0 = getelementptr inbounds float, ptr %ptr, i64 4
254 %ptr1 = getelementptr inbounds float, ptr %ptr, i64 5
255 %val0 = load float, ptr %ptr0
256 %val1 = load float, ptr %ptr1
257 %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
258 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
259 ret <4 x float> %res1
; Loads the floats at element indices 0, 1 and 2 of %ptr into lanes 0..2 of a
; <4 x float> and explicitly inserts undef into lane 3. The checks do not
; expect a single 16-byte load here. They expect an 8-byte load for lanes 0-1
; plus a separate scalar for lane 2, combined with movlhps on the SSE2 and
; i686 +sse-only runs, or insertps/vinsertps on the SSE4.1 and AVX runs.
262 define <4 x float> @merge_4f32_f32_012u(ptr %ptr) nounwind uwtable noinline ssp {
263 ; SSE2-LABEL: merge_4f32_f32_012u:
265 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
266 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
267 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
270 ; SSE41-LABEL: merge_4f32_f32_012u:
272 ; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
273 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
276 ; AVX-LABEL: merge_4f32_f32_012u:
278 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
279 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
282 ; X86-SSE1-LABEL: merge_4f32_f32_012u:
284 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
285 ; X86-SSE1-NEXT: xorps %xmm0, %xmm0
286 ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
287 ; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
288 ; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
289 ; X86-SSE1-NEXT: retl
291 ; X86-SSE41-LABEL: merge_4f32_f32_012u:
292 ; X86-SSE41: # %bb.0:
293 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
294 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
295 ; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
296 ; X86-SSE41-NEXT: retl
297 %ptr1 = getelementptr inbounds float, ptr %ptr, i64 1
298 %ptr2 = getelementptr inbounds float, ptr %ptr, i64 2
299 %val0 = load float, ptr %ptr
300 %val1 = load float, ptr %ptr1
301 %val2 = load float, ptr %ptr2
302 %res0 = insertelement <4 x float> undef, float %val0, i32 0
303 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
304 %res2 = insertelement <4 x float> %res1, float %val2, i32 2
305 %res3 = insertelement <4 x float> %res2, float undef, i32 3
306 ret <4 x float> %res3
; Loads the floats at element indices 0, 1 and 9 of %ptr into lanes 0..2 of a
; <4 x float> and explicitly inserts undef into lane 3. Index 9 is not adjacent
; to indices 0 and 1, so only lanes 0-1 come from consecutive memory. The
; checks expect an 8-byte load for lanes 0-1 plus a separate scalar for lane 2,
; combined with movlhps on the SSE2 and i686 +sse-only runs, or
; insertps/vinsertps on the SSE4.1 and AVX runs.
309 define <4 x float> @merge_4f32_f32_019u(ptr %ptr) nounwind uwtable noinline ssp {
310 ; SSE2-LABEL: merge_4f32_f32_019u:
312 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
313 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
314 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
317 ; SSE41-LABEL: merge_4f32_f32_019u:
319 ; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
320 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
323 ; AVX-LABEL: merge_4f32_f32_019u:
325 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
326 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
329 ; X86-SSE1-LABEL: merge_4f32_f32_019u:
331 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
332 ; X86-SSE1-NEXT: xorps %xmm0, %xmm0
333 ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
334 ; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
335 ; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
336 ; X86-SSE1-NEXT: retl
338 ; X86-SSE41-LABEL: merge_4f32_f32_019u:
339 ; X86-SSE41: # %bb.0:
340 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
341 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
342 ; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
343 ; X86-SSE41-NEXT: retl
344 %ptr1 = getelementptr inbounds float, ptr %ptr, i64 1
345 %ptr2 = getelementptr inbounds float, ptr %ptr, i64 9
346 %val0 = load float, ptr %ptr
347 %val1 = load float, ptr %ptr1
348 %val2 = load float, ptr %ptr2
349 %res0 = insertelement <4 x float> undef, float %val0, i32 0
350 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
351 %res2 = insertelement <4 x float> %res1, float %val2, i32 2
352 %res3 = insertelement <4 x float> %res2, float undef, i32 3
353 ret <4 x float> %res3
; Loads the i32 values at element indices 2, 3 and 5 of %ptr into lanes 0, 1
; and 3 of a <4 x i32>; lane 2 is never written and stays undef. The x86-64
; checks and the i686 +sse4.1 checks expect one unaligned 16-byte load at byte
; offset 8. The i686 checks with only +sse expect three scalar loads (offsets
; 8, 12, 20) stored to offsets 0, 4 and 12 of the buffer addressed by %eax.
356 define <4 x i32> @merge_4i32_i32_23u5(ptr %ptr) nounwind uwtable noinline ssp {
357 ; SSE-LABEL: merge_4i32_i32_23u5:
359 ; SSE-NEXT: movups 8(%rdi), %xmm0
362 ; AVX-LABEL: merge_4i32_i32_23u5:
364 ; AVX-NEXT: vmovups 8(%rdi), %xmm0
367 ; X86-SSE1-LABEL: merge_4i32_i32_23u5:
369 ; X86-SSE1-NEXT: pushl %esi
370 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
371 ; X86-SSE1-NEXT: .cfi_offset %esi, -8
372 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
373 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
374 ; X86-SSE1-NEXT: movl 8(%ecx), %edx
375 ; X86-SSE1-NEXT: movl 12(%ecx), %esi
376 ; X86-SSE1-NEXT: movl 20(%ecx), %ecx
377 ; X86-SSE1-NEXT: movl %esi, 4(%eax)
378 ; X86-SSE1-NEXT: movl %edx, (%eax)
379 ; X86-SSE1-NEXT: movl %ecx, 12(%eax)
380 ; X86-SSE1-NEXT: popl %esi
381 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
382 ; X86-SSE1-NEXT: retl $4
384 ; X86-SSE41-LABEL: merge_4i32_i32_23u5:
385 ; X86-SSE41: # %bb.0:
386 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
387 ; X86-SSE41-NEXT: movups 8(%eax), %xmm0
388 ; X86-SSE41-NEXT: retl
389 %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 2
390 %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 3
391 %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 5
392 %val0 = load i32, ptr %ptr0
393 %val1 = load i32, ptr %ptr1
394 %val3 = load i32, ptr %ptr3
395 %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
396 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
397 %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
; Variant of the 23u5 pattern (elements 2, 3, 5 into lanes 0, 1, 3) in which
; element 2 is incremented and stored back to memory after it is loaded but
; before elements 3 and 5 are loaded. The x86-64 checks and the i686 +sse4.1
; checks still expect one 16-byte load at byte offset 8, followed by an incl
; of the dword at byte offset 8. The i686 checks with only +sse expect scalar
; loads/stores with the increment done via leal.
401 define <4 x i32> @merge_4i32_i32_23u5_inc2(ptr %ptr) nounwind uwtable noinline ssp {
402 ; SSE-LABEL: merge_4i32_i32_23u5_inc2:
404 ; SSE-NEXT: movups 8(%rdi), %xmm0
405 ; SSE-NEXT: incl 8(%rdi)
408 ; AVX-LABEL: merge_4i32_i32_23u5_inc2:
410 ; AVX-NEXT: vmovups 8(%rdi), %xmm0
411 ; AVX-NEXT: incl 8(%rdi)
414 ; X86-SSE1-LABEL: merge_4i32_i32_23u5_inc2:
416 ; X86-SSE1-NEXT: pushl %edi
417 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
418 ; X86-SSE1-NEXT: pushl %esi
419 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
420 ; X86-SSE1-NEXT: .cfi_offset %esi, -12
421 ; X86-SSE1-NEXT: .cfi_offset %edi, -8
422 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
423 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
424 ; X86-SSE1-NEXT: movl 8(%ecx), %edx
425 ; X86-SSE1-NEXT: movl 12(%ecx), %esi
426 ; X86-SSE1-NEXT: leal 1(%edx), %edi
427 ; X86-SSE1-NEXT: movl %edi, 8(%ecx)
428 ; X86-SSE1-NEXT: movl 20(%ecx), %ecx
429 ; X86-SSE1-NEXT: movl %esi, 4(%eax)
430 ; X86-SSE1-NEXT: movl %edx, (%eax)
431 ; X86-SSE1-NEXT: movl %ecx, 12(%eax)
432 ; X86-SSE1-NEXT: popl %esi
433 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
434 ; X86-SSE1-NEXT: popl %edi
435 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
436 ; X86-SSE1-NEXT: retl $4
438 ; X86-SSE41-LABEL: merge_4i32_i32_23u5_inc2:
439 ; X86-SSE41: # %bb.0:
440 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
441 ; X86-SSE41-NEXT: movups 8(%eax), %xmm0
442 ; X86-SSE41-NEXT: incl 8(%eax)
443 ; X86-SSE41-NEXT: retl
444 %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 2
445 %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 3
446 %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 5
447 %val0 = load i32, ptr %ptr0
; The store below sits between the first load and the remaining two loads; the
; vector is built from the pre-increment value %val0.
448 %inc = add i32 %val0, 1
449 store i32 %inc, ptr %ptr0
450 %val1 = load i32, ptr %ptr1
451 %val3 = load i32, ptr %ptr3
452 %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
453 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
454 %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
; Variant of the 23u5 pattern (elements 2, 3, 5 into lanes 0, 1, 3) in which
; element 3 is incremented and stored back to memory after elements 2 and 3
; are loaded but before element 5 is loaded. The x86-64 checks and the i686
; +sse4.1 checks still expect one 16-byte load at byte offset 8, followed by
; an incl of the dword at byte offset 12. The i686 checks with only +sse
; expect scalar loads/stores with the increment done via leal.
458 define <4 x i32> @merge_4i32_i32_23u5_inc3(ptr %ptr) nounwind uwtable noinline ssp {
459 ; SSE-LABEL: merge_4i32_i32_23u5_inc3:
461 ; SSE-NEXT: movups 8(%rdi), %xmm0
462 ; SSE-NEXT: incl 12(%rdi)
465 ; AVX-LABEL: merge_4i32_i32_23u5_inc3:
467 ; AVX-NEXT: vmovups 8(%rdi), %xmm0
468 ; AVX-NEXT: incl 12(%rdi)
471 ; X86-SSE1-LABEL: merge_4i32_i32_23u5_inc3:
473 ; X86-SSE1-NEXT: pushl %edi
474 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
475 ; X86-SSE1-NEXT: pushl %esi
476 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
477 ; X86-SSE1-NEXT: .cfi_offset %esi, -12
478 ; X86-SSE1-NEXT: .cfi_offset %edi, -8
479 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
480 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
481 ; X86-SSE1-NEXT: movl 8(%ecx), %edx
482 ; X86-SSE1-NEXT: movl 12(%ecx), %esi
483 ; X86-SSE1-NEXT: leal 1(%esi), %edi
484 ; X86-SSE1-NEXT: movl %edi, 12(%ecx)
485 ; X86-SSE1-NEXT: movl 20(%ecx), %ecx
486 ; X86-SSE1-NEXT: movl %esi, 4(%eax)
487 ; X86-SSE1-NEXT: movl %edx, (%eax)
488 ; X86-SSE1-NEXT: movl %ecx, 12(%eax)
489 ; X86-SSE1-NEXT: popl %esi
490 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
491 ; X86-SSE1-NEXT: popl %edi
492 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
493 ; X86-SSE1-NEXT: retl $4
495 ; X86-SSE41-LABEL: merge_4i32_i32_23u5_inc3:
496 ; X86-SSE41: # %bb.0:
497 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
498 ; X86-SSE41-NEXT: movups 8(%eax), %xmm0
499 ; X86-SSE41-NEXT: incl 12(%eax)
500 ; X86-SSE41-NEXT: retl
501 %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 2
502 %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 3
503 %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 5
504 %val0 = load i32, ptr %ptr0
505 %val1 = load i32, ptr %ptr1
; The store below sits between the first two loads and the last load; the
; vector is built from the pre-increment value %val1.
506 %inc = add i32 %val1, 1
507 store i32 %inc, ptr %ptr1
508 %val3 = load i32, ptr %ptr3
509 %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
510 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
511 %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
; Loads the i32 at element index 3 of %ptr into lane 0 of a <4 x i32> and
; inserts the constant 0 into lane 1; lanes 2 and 3 are never written and stay
; undef. The x86-64 checks and the i686 +sse4.1 checks expect one movss/vmovss
; load. The i686 checks with only +sse expect a scalar load from byte offset
; 12 stored to the buffer addressed by %eax, plus a zero store at offset 4.
515 define <4 x i32> @merge_4i32_i32_3zuu(ptr %ptr) nounwind uwtable noinline ssp {
516 ; SSE-LABEL: merge_4i32_i32_3zuu:
518 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
521 ; AVX-LABEL: merge_4i32_i32_3zuu:
523 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
526 ; X86-SSE1-LABEL: merge_4i32_i32_3zuu:
528 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
529 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
530 ; X86-SSE1-NEXT: movl 12(%ecx), %ecx
531 ; X86-SSE1-NEXT: movl %ecx, (%eax)
532 ; X86-SSE1-NEXT: movl $0, 4(%eax)
533 ; X86-SSE1-NEXT: retl $4
535 ; X86-SSE41-LABEL: merge_4i32_i32_3zuu:
536 ; X86-SSE41: # %bb.0:
537 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
538 ; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
539 ; X86-SSE41-NEXT: retl
540 %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 3
541 %val0 = load i32, ptr %ptr0
542 %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
543 %res1 = insertelement <4 x i32> %res0, i32 0, i32 1
; Loads the i32 values at element indices 3 and 4 of %ptr into lanes 0 and 1
; of a <4 x i32>; lanes 2 and 3 are never written and stay undef. The x86-64
; checks and the i686 +sse4.1 checks expect one movsd/vmovsd load. The i686
; checks with only +sse expect two scalar loads (offsets 12 and 16) stored to
; offsets 0 and 4 of the buffer addressed by %eax.
547 define <4 x i32> @merge_4i32_i32_34uu(ptr %ptr) nounwind uwtable noinline ssp {
548 ; SSE-LABEL: merge_4i32_i32_34uu:
550 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
553 ; AVX-LABEL: merge_4i32_i32_34uu:
555 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
558 ; X86-SSE1-LABEL: merge_4i32_i32_34uu:
560 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
561 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
562 ; X86-SSE1-NEXT: movl 12(%ecx), %edx
563 ; X86-SSE1-NEXT: movl 16(%ecx), %ecx
564 ; X86-SSE1-NEXT: movl %ecx, 4(%eax)
565 ; X86-SSE1-NEXT: movl %edx, (%eax)
566 ; X86-SSE1-NEXT: retl $4
568 ; X86-SSE41-LABEL: merge_4i32_i32_34uu:
569 ; X86-SSE41: # %bb.0:
570 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
571 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
572 ; X86-SSE41-NEXT: retl
573 %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 3
574 %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 4
575 %val0 = load i32, ptr %ptr0
576 %val1 = load i32, ptr %ptr1
577 %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
578 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
; Loads the i32 values at element indices 4 and 5 of %ptr into lanes 0 and 1
; of a zero-initialized <4 x i32>, so lanes 2 and 3 keep the value zero. The
; x86-64 checks and the i686 +sse4.1 checks expect one movsd/vmovsd load. The
; i686 checks with only +sse expect two scalar loads (offsets 16 and 20)
; stored to offsets 0 and 4, plus zero stores at offsets 8 and 12.
582 define <4 x i32> @merge_4i32_i32_45zz(ptr %ptr) nounwind uwtable noinline ssp {
583 ; SSE-LABEL: merge_4i32_i32_45zz:
585 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
588 ; AVX-LABEL: merge_4i32_i32_45zz:
590 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
593 ; X86-SSE1-LABEL: merge_4i32_i32_45zz:
595 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
596 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
597 ; X86-SSE1-NEXT: movl 16(%ecx), %edx
598 ; X86-SSE1-NEXT: movl 20(%ecx), %ecx
599 ; X86-SSE1-NEXT: movl %ecx, 4(%eax)
600 ; X86-SSE1-NEXT: movl %edx, (%eax)
601 ; X86-SSE1-NEXT: movl $0, 12(%eax)
602 ; X86-SSE1-NEXT: movl $0, 8(%eax)
603 ; X86-SSE1-NEXT: retl $4
605 ; X86-SSE41-LABEL: merge_4i32_i32_45zz:
606 ; X86-SSE41: # %bb.0:
607 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
608 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
609 ; X86-SSE41-NEXT: retl
610 %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 4
611 %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 5
612 %val0 = load i32, ptr %ptr0
613 %val1 = load i32, ptr %ptr1
614 %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
615 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
; Variant of the 45zz pattern (elements 4 and 5 into lanes 0 and 1 of a
; zero-initialized vector) in which element 4 is incremented and stored back
; to memory after it is loaded but before element 5 is loaded. The x86-64
; checks and the i686 +sse4.1 checks still expect one movsd/vmovsd load,
; followed by an incl of the dword at byte offset 16. The i686 checks with
; only +sse expect scalar loads/stores with the increment done via leal.
619 define <4 x i32> @merge_4i32_i32_45zz_inc4(ptr %ptr) nounwind uwtable noinline ssp {
620 ; SSE-LABEL: merge_4i32_i32_45zz_inc4:
622 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
623 ; SSE-NEXT: incl 16(%rdi)
626 ; AVX-LABEL: merge_4i32_i32_45zz_inc4:
628 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
629 ; AVX-NEXT: incl 16(%rdi)
632 ; X86-SSE1-LABEL: merge_4i32_i32_45zz_inc4:
634 ; X86-SSE1-NEXT: pushl %edi
635 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
636 ; X86-SSE1-NEXT: pushl %esi
637 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
638 ; X86-SSE1-NEXT: .cfi_offset %esi, -12
639 ; X86-SSE1-NEXT: .cfi_offset %edi, -8
640 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
641 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
642 ; X86-SSE1-NEXT: movl 16(%ecx), %edx
643 ; X86-SSE1-NEXT: movl 20(%ecx), %esi
644 ; X86-SSE1-NEXT: leal 1(%edx), %edi
645 ; X86-SSE1-NEXT: movl %edi, 16(%ecx)
646 ; X86-SSE1-NEXT: movl %esi, 4(%eax)
647 ; X86-SSE1-NEXT: movl %edx, (%eax)
648 ; X86-SSE1-NEXT: movl $0, 12(%eax)
649 ; X86-SSE1-NEXT: movl $0, 8(%eax)
650 ; X86-SSE1-NEXT: popl %esi
651 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
652 ; X86-SSE1-NEXT: popl %edi
653 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
654 ; X86-SSE1-NEXT: retl $4
656 ; X86-SSE41-LABEL: merge_4i32_i32_45zz_inc4:
657 ; X86-SSE41: # %bb.0:
658 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
659 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
660 ; X86-SSE41-NEXT: incl 16(%eax)
661 ; X86-SSE41-NEXT: retl
662 %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 4
663 %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 5
664 %val0 = load i32, ptr %ptr0
; The store below sits between the two loads; the vector is built from the
; pre-increment value %val0.
665 %inc = add i32 %val0, 1
666 store i32 %inc, ptr %ptr0
667 %val1 = load i32, ptr %ptr1
668 %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
669 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
; Variant of the 45zz pattern (elements 4 and 5 into lanes 0 and 1 of a
; zero-initialized vector) in which element 5 is incremented and stored back
; to memory after both loads. The x86-64 checks and the i686 +sse4.1 checks
; expect one movsd/vmovsd load, followed by an incl of the dword at byte
; offset 20. The i686 checks with only +sse expect scalar loads/stores with
; the increment done via leal.
673 define <4 x i32> @merge_4i32_i32_45zz_inc5(ptr %ptr) nounwind uwtable noinline ssp {
674 ; SSE-LABEL: merge_4i32_i32_45zz_inc5:
676 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
677 ; SSE-NEXT: incl 20(%rdi)
680 ; AVX-LABEL: merge_4i32_i32_45zz_inc5:
682 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
683 ; AVX-NEXT: incl 20(%rdi)
686 ; X86-SSE1-LABEL: merge_4i32_i32_45zz_inc5:
688 ; X86-SSE1-NEXT: pushl %edi
689 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
690 ; X86-SSE1-NEXT: pushl %esi
691 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
692 ; X86-SSE1-NEXT: .cfi_offset %esi, -12
693 ; X86-SSE1-NEXT: .cfi_offset %edi, -8
694 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
695 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
696 ; X86-SSE1-NEXT: movl 16(%ecx), %edx
697 ; X86-SSE1-NEXT: movl 20(%ecx), %esi
698 ; X86-SSE1-NEXT: leal 1(%esi), %edi
699 ; X86-SSE1-NEXT: movl %edi, 20(%ecx)
700 ; X86-SSE1-NEXT: movl %esi, 4(%eax)
701 ; X86-SSE1-NEXT: movl %edx, (%eax)
702 ; X86-SSE1-NEXT: movl $0, 12(%eax)
703 ; X86-SSE1-NEXT: movl $0, 8(%eax)
704 ; X86-SSE1-NEXT: popl %esi
705 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
706 ; X86-SSE1-NEXT: popl %edi
707 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
708 ; X86-SSE1-NEXT: retl $4
710 ; X86-SSE41-LABEL: merge_4i32_i32_45zz_inc5:
711 ; X86-SSE41: # %bb.0:
712 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
713 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
714 ; X86-SSE41-NEXT: incl 20(%eax)
715 ; X86-SSE41-NEXT: retl
716 %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 4
717 %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 5
718 %val0 = load i32, ptr %ptr0
719 %val1 = load i32, ptr %ptr1
; The store below follows both loads; the vector is built from the
; pre-increment value %val1.
720 %inc = add i32 %val1, 1
721 store i32 %inc, ptr %ptr1
722 %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
723 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
; Loads the i16 values at element indices 2, 3, 5, 6, 7 and 9 of %ptr into
; lanes 0, 1, 3, 4, 5 and 7 of an <8 x i16>; lanes 2 and 6 are never written
; and stay undef. The x86-64 checks and the i686 +sse4.1 checks expect one
; unaligned 16-byte load at byte offset 4. The i686 checks with only +sse
; expect a mix of 32-bit and zero-extended 16-bit scalar loads that are stored
; piecewise into the buffer addressed by %eax.
727 define <8 x i16> @merge_8i16_i16_23u567u9(ptr %ptr) nounwind uwtable noinline ssp {
728 ; SSE-LABEL: merge_8i16_i16_23u567u9:
730 ; SSE-NEXT: movups 4(%rdi), %xmm0
733 ; AVX-LABEL: merge_8i16_i16_23u567u9:
735 ; AVX-NEXT: vmovups 4(%rdi), %xmm0
738 ; X86-SSE1-LABEL: merge_8i16_i16_23u567u9:
740 ; X86-SSE1-NEXT: pushl %edi
741 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
742 ; X86-SSE1-NEXT: pushl %esi
743 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
744 ; X86-SSE1-NEXT: .cfi_offset %esi, -12
745 ; X86-SSE1-NEXT: .cfi_offset %edi, -8
746 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
747 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
748 ; X86-SSE1-NEXT: movl 4(%ecx), %edx
749 ; X86-SSE1-NEXT: movl 10(%ecx), %esi
750 ; X86-SSE1-NEXT: movzwl 14(%ecx), %edi
751 ; X86-SSE1-NEXT: movzwl 18(%ecx), %ecx
752 ; X86-SSE1-NEXT: movw %di, 10(%eax)
753 ; X86-SSE1-NEXT: movw %cx, 14(%eax)
754 ; X86-SSE1-NEXT: movl %esi, 6(%eax)
755 ; X86-SSE1-NEXT: movl %edx, (%eax)
756 ; X86-SSE1-NEXT: popl %esi
757 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
758 ; X86-SSE1-NEXT: popl %edi
759 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
760 ; X86-SSE1-NEXT: retl $4
762 ; X86-SSE41-LABEL: merge_8i16_i16_23u567u9:
763 ; X86-SSE41: # %bb.0:
764 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
765 ; X86-SSE41-NEXT: movups 4(%eax), %xmm0
766 ; X86-SSE41-NEXT: retl
767 %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 2
768 %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 3
769 %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 5
770 %ptr4 = getelementptr inbounds i16, ptr %ptr, i64 6
771 %ptr5 = getelementptr inbounds i16, ptr %ptr, i64 7
772 %ptr7 = getelementptr inbounds i16, ptr %ptr, i64 9
773 %val0 = load i16, ptr %ptr0
774 %val1 = load i16, ptr %ptr1
775 %val3 = load i16, ptr %ptr3
776 %val4 = load i16, ptr %ptr4
777 %val5 = load i16, ptr %ptr5
778 %val7 = load i16, ptr %ptr7
779 %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
780 %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
781 %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
782 %res4 = insertelement <8 x i16> %res3, i16 %val4, i32 4
783 %res5 = insertelement <8 x i16> %res4, i16 %val5, i32 5
784 %res7 = insertelement <8 x i16> %res5, i16 %val7, i32 7
; Loads the i16 values at element indices 3 and 4 of %ptr into lanes 0 and 1
; of an <8 x i16>; lanes 2..7 are never written and stay undef. The x86-64
; checks and the i686 +sse4.1 checks expect one movss/vmovss load. The i686
; checks with only +sse expect a single 32-bit scalar load from byte offset 6
; stored to the buffer addressed by %eax.
788 define <8 x i16> @merge_8i16_i16_34uuuuuu(ptr %ptr) nounwind uwtable noinline ssp {
789 ; SSE-LABEL: merge_8i16_i16_34uuuuuu:
791 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
794 ; AVX-LABEL: merge_8i16_i16_34uuuuuu:
796 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
799 ; X86-SSE1-LABEL: merge_8i16_i16_34uuuuuu:
801 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
802 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
803 ; X86-SSE1-NEXT: movl 6(%ecx), %ecx
804 ; X86-SSE1-NEXT: movl %ecx, (%eax)
805 ; X86-SSE1-NEXT: retl $4
807 ; X86-SSE41-LABEL: merge_8i16_i16_34uuuuuu:
808 ; X86-SSE41: # %bb.0:
809 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
810 ; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
811 ; X86-SSE41-NEXT: retl
812 %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 3
813 %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 4
814 %val0 = load i16, ptr %ptr0
815 %val1 = load i16, ptr %ptr1
816 %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
817 %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
; Loads the i16 values at element indices 4, 5 and 7 of %ptr into lanes 0, 1
; and 3 of an <8 x i16> and inserts the constant 0 into lanes 4..7; lane 2 is
; never written and stays undef. The x86-64 checks and the i686 +sse4.1 checks
; expect one movsd/vmovsd load. The i686 checks with only +sse expect a 32-bit
; load from byte offset 8 and a zero-extended 16-bit load from offset 14,
; stored to the buffer addressed by %eax, plus zero stores at offsets 8 and 12.
821 define <8 x i16> @merge_8i16_i16_45u7zzzz(ptr %ptr) nounwind uwtable noinline ssp {
822 ; SSE-LABEL: merge_8i16_i16_45u7zzzz:
824 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
827 ; AVX-LABEL: merge_8i16_i16_45u7zzzz:
829 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
832 ; X86-SSE1-LABEL: merge_8i16_i16_45u7zzzz:
834 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
835 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
836 ; X86-SSE1-NEXT: movl 8(%ecx), %edx
837 ; X86-SSE1-NEXT: movzwl 14(%ecx), %ecx
838 ; X86-SSE1-NEXT: movw %cx, 6(%eax)
839 ; X86-SSE1-NEXT: movl %edx, (%eax)
840 ; X86-SSE1-NEXT: movl $0, 12(%eax)
841 ; X86-SSE1-NEXT: movl $0, 8(%eax)
842 ; X86-SSE1-NEXT: retl $4
844 ; X86-SSE41-LABEL: merge_8i16_i16_45u7zzzz:
845 ; X86-SSE41: # %bb.0:
846 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
847 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
848 ; X86-SSE41-NEXT: retl
849 %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 4
850 %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 5
851 %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 7
852 %val0 = load i16, ptr %ptr0
853 %val1 = load i16, ptr %ptr1
854 %val3 = load i16, ptr %ptr3
855 %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
856 %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
857 %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
858 %res4 = insertelement <8 x i16> %res3, i16 0, i32 4
859 %res5 = insertelement <8 x i16> %res4, i16 0, i32 5
860 %res6 = insertelement <8 x i16> %res5, i16 0, i32 6
861 %res7 = insertelement <8 x i16> %res6, i16 0, i32 7
; Loads the bytes at offsets 0, 1, 3..13 and 15 of %ptr into the
; same-numbered lanes of a <16 x i8>; lanes 2 and 14 are never written and
; stay undef. The x86-64 checks and the i686 +sse4.1 checks expect one
; unaligned 16-byte load at offset 0. The i686 checks with only +sse expect
; the data to be copied with scalar 8/16/32-bit loads and stores into the
; buffer addressed by %eax, pushing and popping four registers around the copy.
865 define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(ptr %ptr) nounwind uwtable noinline ssp {
866 ; SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
868 ; SSE-NEXT: movups (%rdi), %xmm0
871 ; AVX-LABEL: merge_16i8_i8_01u3456789ABCDuF:
873 ; AVX-NEXT: vmovups (%rdi), %xmm0
876 ; X86-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
878 ; X86-SSE1-NEXT: pushl %ebp
879 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
880 ; X86-SSE1-NEXT: pushl %ebx
881 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
882 ; X86-SSE1-NEXT: pushl %edi
883 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 16
884 ; X86-SSE1-NEXT: pushl %esi
885 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 20
886 ; X86-SSE1-NEXT: .cfi_offset %esi, -20
887 ; X86-SSE1-NEXT: .cfi_offset %edi, -16
888 ; X86-SSE1-NEXT: .cfi_offset %ebx, -12
889 ; X86-SSE1-NEXT: .cfi_offset %ebp, -8
890 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
891 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
892 ; X86-SSE1-NEXT: movzwl (%ecx), %ebp
893 ; X86-SSE1-NEXT: movl 3(%ecx), %esi
894 ; X86-SSE1-NEXT: movl 7(%ecx), %edi
895 ; X86-SSE1-NEXT: movzwl 11(%ecx), %ebx
896 ; X86-SSE1-NEXT: movzbl 13(%ecx), %edx
897 ; X86-SSE1-NEXT: movzbl 15(%ecx), %ecx
898 ; X86-SSE1-NEXT: movb %dl, 13(%eax)
899 ; X86-SSE1-NEXT: movb %cl, 15(%eax)
900 ; X86-SSE1-NEXT: movw %bx, 11(%eax)
901 ; X86-SSE1-NEXT: movl %edi, 7(%eax)
902 ; X86-SSE1-NEXT: movl %esi, 3(%eax)
903 ; X86-SSE1-NEXT: movw %bp, (%eax)
904 ; X86-SSE1-NEXT: popl %esi
905 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 16
906 ; X86-SSE1-NEXT: popl %edi
907 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
908 ; X86-SSE1-NEXT: popl %ebx
909 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
910 ; X86-SSE1-NEXT: popl %ebp
911 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
912 ; X86-SSE1-NEXT: retl $4
914 ; X86-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
915 ; X86-SSE41: # %bb.0:
916 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
917 ; X86-SSE41-NEXT: movups (%eax), %xmm0
918 ; X86-SSE41-NEXT: retl
919 %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 1
920 %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 3
921 %ptr4 = getelementptr inbounds i8, ptr %ptr, i64 4
922 %ptr5 = getelementptr inbounds i8, ptr %ptr, i64 5
923 %ptr6 = getelementptr inbounds i8, ptr %ptr, i64 6
924 %ptr7 = getelementptr inbounds i8, ptr %ptr, i64 7
925 %ptr8 = getelementptr inbounds i8, ptr %ptr, i64 8
926 %ptr9 = getelementptr inbounds i8, ptr %ptr, i64 9
927 %ptrA = getelementptr inbounds i8, ptr %ptr, i64 10
928 %ptrB = getelementptr inbounds i8, ptr %ptr, i64 11
929 %ptrC = getelementptr inbounds i8, ptr %ptr, i64 12
930 %ptrD = getelementptr inbounds i8, ptr %ptr, i64 13
931 %ptrF = getelementptr inbounds i8, ptr %ptr, i64 15
932 %val0 = load i8, ptr %ptr
933 %val1 = load i8, ptr %ptr1
934 %val3 = load i8, ptr %ptr3
935 %val4 = load i8, ptr %ptr4
936 %val5 = load i8, ptr %ptr5
937 %val6 = load i8, ptr %ptr6
938 %val7 = load i8, ptr %ptr7
939 %val8 = load i8, ptr %ptr8
940 %val9 = load i8, ptr %ptr9
941 %valA = load i8, ptr %ptrA
942 %valB = load i8, ptr %ptrB
943 %valC = load i8, ptr %ptrC
944 %valD = load i8, ptr %ptrD
945 %valF = load i8, ptr %ptrF
946 %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
947 %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
948 %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
949 %res4 = insertelement <16 x i8> %res3, i8 %val4, i32 4
950 %res5 = insertelement <16 x i8> %res4, i8 %val5, i32 5
951 %res6 = insertelement <16 x i8> %res5, i8 %val6, i32 6
952 %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
953 %res8 = insertelement <16 x i8> %res7, i8 %val8, i32 8
954 %res9 = insertelement <16 x i8> %res8, i8 %val9, i32 9
955 %resA = insertelement <16 x i8> %res9, i8 %valA, i32 10
956 %resB = insertelement <16 x i8> %resA, i8 %valB, i32 11
957 %resC = insertelement <16 x i8> %resB, i8 %valC, i32 12
958 %resD = insertelement <16 x i8> %resC, i8 %valD, i32 13
959 %resF = insertelement <16 x i8> %resD, i8 %valF, i32 15
; Builds a <16 x i8> from three byte loads at %ptr offsets 0, 1 and 3 plus
; explicit zeros in lanes 6, 7, 13, 14 and 15; every other lane is left undef.
; The function name spells out the lanes (digit = loaded byte, u = undef,
; z = zero).
; The 64-bit SSE and AVX runs and the 32-bit SSE4.1 run expect a single
; (v)movss load whose upper three dwords are zero. The 32-bit SSE1-only run
; expects scalar integer loads and stores instead, ending in `retl $4`.
963 define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(ptr %ptr) nounwind uwtable noinline ssp {
964 ; SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
966 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
969 ; AVX-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
971 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
974 ; X86-SSE1-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
976 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
977 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
978 ; X86-SSE1-NEXT: movzwl (%ecx), %edx
979 ; X86-SSE1-NEXT: movzbl 3(%ecx), %ecx
980 ; X86-SSE1-NEXT: movb %cl, 3(%eax)
981 ; X86-SSE1-NEXT: movw %dx, (%eax)
982 ; X86-SSE1-NEXT: movb $0, 15(%eax)
983 ; X86-SSE1-NEXT: movw $0, 13(%eax)
984 ; X86-SSE1-NEXT: movw $0, 6(%eax)
985 ; X86-SSE1-NEXT: retl $4
987 ; X86-SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
988 ; X86-SSE41: # %bb.0:
989 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
990 ; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
991 ; X86-SSE41-NEXT: retl
992 %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 1
993 %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 3
994 %val0 = load i8, ptr %ptr
995 %val1 = load i8, ptr %ptr1
996 %val3 = load i8, ptr %ptr3
; Lanes 0, 1 and 3 come from memory; lane 2 is never written and stays undef.
997 %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
998 %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
999 %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
; Lanes 6, 7, 13, 14 and 15 are explicit zeros.
1000 %res6 = insertelement <16 x i8> %res3, i8 0, i32 6
1001 %res7 = insertelement <16 x i8> %res6, i8 0, i32 7
1002 %resD = insertelement <16 x i8> %res7, i8 0, i32 13
1003 %resE = insertelement <16 x i8> %resD, i8 0, i32 14
1004 %resF = insertelement <16 x i8> %resE, i8 0, i32 15
; Builds a <16 x i8> from six byte loads at %ptr offsets 0, 1, 2, 3, 6 and 7
; plus explicit zeros in lanes 13, 14 and 15; lanes 4, 5 and 8-12 are left
; undef. The function name spells out the lanes (digit = loaded byte,
; u = undef, z = zero).
; The 64-bit SSE and AVX runs and the 32-bit SSE4.1 run expect a single
; (v)movsd load whose upper half is zero. The 32-bit SSE1-only run expects
; scalar integer loads and stores instead, ending in `retl $4`.
1008 define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(ptr %ptr) nounwind uwtable noinline ssp {
1009 ; SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1011 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1014 ; AVX-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1016 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1019 ; X86-SSE1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1020 ; X86-SSE1: # %bb.0:
1021 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1022 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1023 ; X86-SSE1-NEXT: movl (%ecx), %edx
1024 ; X86-SSE1-NEXT: movzwl 6(%ecx), %ecx
1025 ; X86-SSE1-NEXT: movw %cx, 6(%eax)
1026 ; X86-SSE1-NEXT: movl %edx, (%eax)
1027 ; X86-SSE1-NEXT: movb $0, 15(%eax)
1028 ; X86-SSE1-NEXT: movw $0, 13(%eax)
1029 ; X86-SSE1-NEXT: retl $4
1031 ; X86-SSE41-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1032 ; X86-SSE41: # %bb.0:
1033 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1034 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1035 ; X86-SSE41-NEXT: retl
1036 %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 1
1037 %ptr2 = getelementptr inbounds i8, ptr %ptr, i64 2
1038 %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 3
1039 %ptr6 = getelementptr inbounds i8, ptr %ptr, i64 6
1040 %ptr7 = getelementptr inbounds i8, ptr %ptr, i64 7
1041 %val0 = load i8, ptr %ptr
1042 %val1 = load i8, ptr %ptr1
1043 %val2 = load i8, ptr %ptr2
1044 %val3 = load i8, ptr %ptr3
1045 %val6 = load i8, ptr %ptr6
1046 %val7 = load i8, ptr %ptr7
; Lanes 0-3 and 6-7 come from memory; lanes 4 and 5 are never written.
1047 %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
1048 %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
1049 %res2 = insertelement <16 x i8> %res1, i8 %val2, i32 2
1050 %res3 = insertelement <16 x i8> %res2, i8 %val3, i32 3
1051 %res6 = insertelement <16 x i8> %res3, i8 %val6, i32 6
1052 %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
; Lanes 13, 14 and 15 are explicit zeros.
1053 %resD = insertelement <16 x i8> %res7, i8 0, i32 13
1054 %resE = insertelement <16 x i8> %resD, i8 0, i32 14
1055 %resF = insertelement <16 x i8> %resE, i8 0, i32 15
; Loads one i32 from %src, splats it across a <4 x i32>, shifts it right by
; <0, undef, undef, undef>, masks it with <-1, 0, 0, 0> and stores the result
; to %dst. Only lane 0 survives the mask, and it is shifted by zero.
; The 64-bit SSE and AVX runs and the 32-bit SSE4.1 run therefore expect a
; single (v)movss load followed by an aligned (v)movaps store. The 32-bit
; SSE1-only run expects two movss memory operands combined with andps before
; the store.
1059 define void @merge_4i32_i32_combine(ptr %dst, ptr %src) {
1060 ; SSE-LABEL: merge_4i32_i32_combine:
1062 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1063 ; SSE-NEXT: movaps %xmm0, (%rdi)
1066 ; AVX-LABEL: merge_4i32_i32_combine:
1068 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1069 ; AVX-NEXT: vmovaps %xmm0, (%rdi)
1072 ; X86-SSE1-LABEL: merge_4i32_i32_combine:
1073 ; X86-SSE1: # %bb.0:
1074 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1075 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1076 ; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1077 ; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1078 ; X86-SSE1-NEXT: andps %xmm0, %xmm1
1079 ; X86-SSE1-NEXT: movaps %xmm1, (%eax)
1080 ; X86-SSE1-NEXT: retl
1082 ; X86-SSE41-LABEL: merge_4i32_i32_combine:
1083 ; X86-SSE41: # %bb.0:
1084 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1085 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
1086 ; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1087 ; X86-SSE41-NEXT: movaps %xmm0, (%eax)
1088 ; X86-SSE41-NEXT: retl
1089 %1 = load i32, ptr %src
1090 %2 = insertelement <4 x i32> undef, i32 %1, i32 0
; An all-zero shuffle mask broadcasts lane 0 to every lane.
1091 %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
1092 %4 = lshr <4 x i32> %3, <i32 0, i32 undef, i32 undef, i32 undef>
; The mask keeps lane 0 and forces lanes 1-3 to zero.
1093 %5 = and <4 x i32> %4, <i32 -1, i32 0, i32 0, i32 0>
1094 store <4 x i32> %5, ptr %dst
1099 ; Consecutive loads that include one or more volatile loads must not be merged into a single vector load.
; Builds a <2 x i64> from i64 elements 1 and 2 of %ptr, where both loads are
; volatile. None of the runs expects a single 16-byte vector load.
; The 64-bit SSE and AVX runs expect two (v)movsd loads joined by (v)movlhps.
; The 32-bit SSE4.1 run expects a movd followed by three pinsrd from offsets
; 12, 16 and 20. The 32-bit SSE1-only run expects four 32-bit integer loads
; (offsets 8, 12, 16, 20) and four stores, ending in `retl $4`.
1102 define <2 x i64> @merge_2i64_i64_12_volatile(ptr %ptr) nounwind uwtable noinline ssp {
1103 ; SSE-LABEL: merge_2i64_i64_12_volatile:
1105 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1106 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
1107 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1110 ; AVX-LABEL: merge_2i64_i64_12_volatile:
1112 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1113 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1114 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1117 ; X86-SSE1-LABEL: merge_2i64_i64_12_volatile:
1118 ; X86-SSE1: # %bb.0:
1119 ; X86-SSE1-NEXT: pushl %edi
1120 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
1121 ; X86-SSE1-NEXT: pushl %esi
1122 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
1123 ; X86-SSE1-NEXT: .cfi_offset %esi, -12
1124 ; X86-SSE1-NEXT: .cfi_offset %edi, -8
1125 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1126 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1127 ; X86-SSE1-NEXT: movl 8(%ecx), %edx
1128 ; X86-SSE1-NEXT: movl 12(%ecx), %esi
1129 ; X86-SSE1-NEXT: movl 16(%ecx), %edi
1130 ; X86-SSE1-NEXT: movl 20(%ecx), %ecx
1131 ; X86-SSE1-NEXT: movl %ecx, 12(%eax)
1132 ; X86-SSE1-NEXT: movl %edi, 8(%eax)
1133 ; X86-SSE1-NEXT: movl %esi, 4(%eax)
1134 ; X86-SSE1-NEXT: movl %edx, (%eax)
1135 ; X86-SSE1-NEXT: popl %esi
1136 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
1137 ; X86-SSE1-NEXT: popl %edi
1138 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
1139 ; X86-SSE1-NEXT: retl $4
1141 ; X86-SSE41-LABEL: merge_2i64_i64_12_volatile:
1142 ; X86-SSE41: # %bb.0:
1143 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1144 ; X86-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1145 ; X86-SSE41-NEXT: pinsrd $1, 12(%eax), %xmm0
1146 ; X86-SSE41-NEXT: pinsrd $2, 16(%eax), %xmm0
1147 ; X86-SSE41-NEXT: pinsrd $3, 20(%eax), %xmm0
1148 ; X86-SSE41-NEXT: retl
1149 %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 1
1150 %ptr1 = getelementptr inbounds i64, ptr %ptr, i64 2
; Both element loads are volatile.
1151 %val0 = load volatile i64, ptr %ptr0
1152 %val1 = load volatile i64, ptr %ptr1
1153 %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
1154 %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
; Builds a <4 x float> from float elements 2, 3, 4 and 5 of %ptr. Only the
; first load (element 2) is volatile; the other three are ordinary loads.
; None of the runs expects a single 16-byte vector load.
; The SSE2 run and the 32-bit SSE1-only run expect two movss loads joined by
; unpcklps, with lanes 2 and 3 then filled from one movhps memory operand.
; The SSE4.1 runs (64-bit and 32-bit) and the AVX runs expect one (v)movss
; load followed by three (v)insertps from memory.
1158 define <4 x float> @merge_4f32_f32_2345_volatile(ptr %ptr) nounwind uwtable noinline ssp {
1159 ; SSE2-LABEL: merge_4f32_f32_2345_volatile:
1161 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1162 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1163 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1164 ; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
1167 ; SSE41-LABEL: merge_4f32_f32_2345_volatile:
1169 ; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1170 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1171 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1172 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1175 ; AVX-LABEL: merge_4f32_f32_2345_volatile:
1177 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1178 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1179 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1180 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1183 ; X86-SSE1-LABEL: merge_4f32_f32_2345_volatile:
1184 ; X86-SSE1: # %bb.0:
1185 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1186 ; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1187 ; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1188 ; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1189 ; X86-SSE1-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
1190 ; X86-SSE1-NEXT: retl
1192 ; X86-SSE41-LABEL: merge_4f32_f32_2345_volatile:
1193 ; X86-SSE41: # %bb.0:
1194 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1195 ; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1196 ; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1197 ; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1198 ; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1199 ; X86-SSE41-NEXT: retl
1200 %ptr0 = getelementptr inbounds float, ptr %ptr, i64 2
1201 %ptr1 = getelementptr inbounds float, ptr %ptr, i64 3
1202 %ptr2 = getelementptr inbounds float, ptr %ptr, i64 4
1203 %ptr3 = getelementptr inbounds float, ptr %ptr, i64 5
; Only this first load is volatile.
1204 %val0 = load volatile float, ptr %ptr0
1205 %val1 = load float, ptr %ptr1
1206 %val2 = load float, ptr %ptr2
1207 %val3 = load float, ptr %ptr3
1208 %res0 = insertelement <4 x float> undef, float %val0, i32 0
1209 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
1210 %res2 = insertelement <4 x float> %res1, float %val2, i32 2
1211 %res3 = insertelement <4 x float> %res2, float %val3, i32 3
1212 ret <4 x float> %res3
1216 ; Non-consecutive test.
; Builds a <4 x float> from two scalar loads through separate pointer
; arguments: lane 0 = *%ptr0, lane 1 = 0.0, lanes 2 and 3 = *%ptr1.
; Every run expects two (v)movss loads and one (v)shufps that takes lanes 0-1
; from one register and duplicates lane 0 of the other into lanes 2-3. Both
; 32-bit runs share a single set of checks.
1219 define <4 x float> @merge_4f32_f32_X0YY(ptr %ptr0, ptr %ptr1) nounwind uwtable noinline ssp {
1220 ; SSE-LABEL: merge_4f32_f32_X0YY:
1222 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1223 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1224 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
1227 ; AVX-LABEL: merge_4f32_f32_X0YY:
1229 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1230 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1231 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
1234 ; X86-SSE-LABEL: merge_4f32_f32_X0YY:
1236 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1237 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1238 ; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1239 ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1240 ; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
1241 ; X86-SSE-NEXT: retl
1242 %val0 = load float, ptr %ptr0, align 4
1243 %val1 = load float, ptr %ptr1, align 4
1244 %res0 = insertelement <4 x float> undef, float %val0, i32 0
; Lane 1 is a constant zero, not a loaded value.
1245 %res1 = insertelement <4 x float> %res0, float 0.000000e+00, i32 1
; The second loaded value is used for both lane 2 and lane 3.
1246 %res2 = insertelement <4 x float> %res1, float %val1, i32 2
1247 %res3 = insertelement <4 x float> %res2, float %val1, i32 3
1248 ret <4 x float> %res3
1256 define <4 x i32> @load_i32_zext_i128_v4i32(ptr %ptr) {
1257 ; SSE-LABEL: load_i32_zext_i128_v4i32:
1259 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1262 ; AVX-LABEL: load_i32_zext_i128_v4i32:
1264 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1267 ; X86-SSE1-LABEL: load_i32_zext_i128_v4i32:
1268 ; X86-SSE1: # %bb.0:
1269 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1270 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1271 ; X86-SSE1-NEXT: movl (%ecx), %ecx
1272 ; X86-SSE1-NEXT: movl %ecx, (%eax)
1273 ; X86-SSE1-NEXT: movl $0, 12(%eax)
1274 ; X86-SSE1-NEXT: movl $0, 8(%eax)
1275 ; X86-SSE1-NEXT: movl $0, 4(%eax)
1276 ; X86-SSE1-NEXT: retl $4
1278 ; X86-SSE41-LABEL: load_i32_zext_i128_v4i32:
1279 ; X86-SSE41: # %bb.0:
1280 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1281 ; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1282 ; X86-SSE41-NEXT: retl
1283 %1 = load i32, ptr %ptr
1284 %2 = zext i32 %1 to i128
1285 %3 = bitcast i128 %2 to <4 x i32>