1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX
8 ; 32-bit SSE tests to make sure we do reasonable things.
9 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=X86-SSE,X86-SSE1
10 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86-SSE,X86-SSE41
; Builds a <2 x double> from two scalar loads of elements 2 and 3 of %ptr
; (lane 0 <- element 2, lane 1 <- element 3).
; The x86-64 and i686 +sse4.1 checks expect the pair to be fetched with one
; unaligned 16-byte load at byte offset 16; the i686 +sse checks expect two
; x87 fldl loads instead.
12 define <2 x double> @merge_2f64_f64_23(double* %ptr) nounwind uwtable noinline ssp {
13 ; SSE-LABEL: merge_2f64_f64_23:
15 ; SSE-NEXT: movups 16(%rdi), %xmm0
18 ; AVX-LABEL: merge_2f64_f64_23:
20 ; AVX-NEXT: vmovups 16(%rdi), %xmm0
23 ; X86-SSE1-LABEL: merge_2f64_f64_23:
25 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
26 ; X86-SSE1-NEXT: fldl 16(%eax)
27 ; X86-SSE1-NEXT: fldl 24(%eax)
28 ; X86-SSE1-NEXT: fxch %st(1)
31 ; X86-SSE41-LABEL: merge_2f64_f64_23:
33 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
34 ; X86-SSE41-NEXT: movups 16(%eax), %xmm0
35 ; X86-SSE41-NEXT: retl
36 %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
37 %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
38 %val0 = load double, double* %ptr0
39 %val1 = load double, double* %ptr1
40 %res0 = insertelement <2 x double> undef, double %val0, i32 0
41 %res1 = insertelement <2 x double> %res0, double %val1, i32 1
42 ret <2 x double> %res1
; Builds a <2 x i64> from two scalar loads of elements 1 and 2 of %ptr
; (lane 0 <- element 1, lane 1 <- element 2).
; The x86-64 and i686 +sse4.1 checks expect one unaligned 16-byte load at
; byte offset 8; the i686 +sse checks expect the 16 bytes to be copied with
; four 32-bit integer loads and four stores to the address held in %eax.
45 define <2 x i64> @merge_2i64_i64_12(i64* %ptr) nounwind uwtable noinline ssp {
46 ; SSE-LABEL: merge_2i64_i64_12:
48 ; SSE-NEXT: movups 8(%rdi), %xmm0
51 ; AVX-LABEL: merge_2i64_i64_12:
53 ; AVX-NEXT: vmovups 8(%rdi), %xmm0
56 ; X86-SSE1-LABEL: merge_2i64_i64_12:
58 ; X86-SSE1-NEXT: pushl %edi
59 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
60 ; X86-SSE1-NEXT: pushl %esi
61 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
62 ; X86-SSE1-NEXT: .cfi_offset %esi, -12
63 ; X86-SSE1-NEXT: .cfi_offset %edi, -8
64 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
65 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
66 ; X86-SSE1-NEXT: movl 8(%ecx), %edx
67 ; X86-SSE1-NEXT: movl 12(%ecx), %esi
68 ; X86-SSE1-NEXT: movl 16(%ecx), %edi
69 ; X86-SSE1-NEXT: movl 20(%ecx), %ecx
70 ; X86-SSE1-NEXT: movl %ecx, 12(%eax)
71 ; X86-SSE1-NEXT: movl %edi, 8(%eax)
72 ; X86-SSE1-NEXT: movl %esi, 4(%eax)
73 ; X86-SSE1-NEXT: movl %edx, (%eax)
74 ; X86-SSE1-NEXT: popl %esi
75 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
76 ; X86-SSE1-NEXT: popl %edi
77 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
78 ; X86-SSE1-NEXT: retl $4
80 ; X86-SSE41-LABEL: merge_2i64_i64_12:
82 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
83 ; X86-SSE41-NEXT: movups 8(%eax), %xmm0
84 ; X86-SSE41-NEXT: retl
85 %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
86 %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
87 %val0 = load i64, i64* %ptr0
88 %val1 = load i64, i64* %ptr1
89 %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
90 %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
; Builds a <4 x float> from four scalar loads of elements 2, 3, 4 and 5 of
; %ptr, inserted into lanes 0..3 in that order.
; Every checked configuration expects a single unaligned 16-byte load at
; byte offset 8.
94 define <4 x float> @merge_4f32_f32_2345(float* %ptr) nounwind uwtable noinline ssp {
95 ; SSE-LABEL: merge_4f32_f32_2345:
97 ; SSE-NEXT: movups 8(%rdi), %xmm0
100 ; AVX-LABEL: merge_4f32_f32_2345:
102 ; AVX-NEXT: vmovups 8(%rdi), %xmm0
105 ; X86-SSE-LABEL: merge_4f32_f32_2345:
107 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
108 ; X86-SSE-NEXT: movups 8(%eax), %xmm0
110 %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
111 %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
112 %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
113 %ptr3 = getelementptr inbounds float, float* %ptr, i64 5
114 %val0 = load float, float* %ptr0
115 %val1 = load float, float* %ptr1
116 %val2 = load float, float* %ptr2
117 %val3 = load float, float* %ptr3
118 %res0 = insertelement <4 x float> undef, float %val0, i32 0
119 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
120 %res2 = insertelement <4 x float> %res1, float %val2, i32 2
121 %res3 = insertelement <4 x float> %res2, float %val3, i32 3
122 ret <4 x float> %res3
; Builds a <4 x float> whose lane 0 is element 3 of %ptr and whose lane 1 is
; the constant 0.0; lanes 2 and 3 are left undef.
; Every checked configuration expects a single scalar load that zeroes the
; remaining lanes (movss / vmovss).
125 define <4 x float> @merge_4f32_f32_3zuu(float* %ptr) nounwind uwtable noinline ssp {
126 ; SSE-LABEL: merge_4f32_f32_3zuu:
128 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
131 ; AVX-LABEL: merge_4f32_f32_3zuu:
133 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
136 ; X86-SSE-LABEL: merge_4f32_f32_3zuu:
138 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
139 ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
141 %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
142 %val0 = load float, float* %ptr0
143 %res0 = insertelement <4 x float> undef, float %val0, i32 0
144 %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
145 ret <4 x float> %res1
; Builds a <4 x float> whose lanes 0 and 1 are elements 3 and 4 of %ptr;
; lanes 2 and 3 are left undef.
; The x86-64 and i686 +sse4.1 checks expect one 64-bit load (movsd / vmovsd);
; the i686 +sse checks expect xorps followed by movlps into the low half.
148 define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline ssp {
149 ; SSE-LABEL: merge_4f32_f32_34uu:
151 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
154 ; AVX-LABEL: merge_4f32_f32_34uu:
156 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
159 ; X86-SSE1-LABEL: merge_4f32_f32_34uu:
161 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
162 ; X86-SSE1-NEXT: xorps %xmm0, %xmm0
163 ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
164 ; X86-SSE1-NEXT: retl
166 ; X86-SSE41-LABEL: merge_4f32_f32_34uu:
167 ; X86-SSE41: # %bb.0:
168 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
169 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
170 ; X86-SSE41-NEXT: retl
171 %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
172 %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
173 %val0 = load float, float* %ptr0
174 %val1 = load float, float* %ptr1
175 %res0 = insertelement <4 x float> undef, float %val0, i32 0
176 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
177 ret <4 x float> %res1
; Builds a <4 x float> on top of zeroinitializer with lanes 0 and 1 taken
; from elements 3 and 4 of %ptr and lane 3 taken from element 6; lane 2 is
; never written, so it stays zero.
; The checks expect a 16-byte load at byte offset 12 combined with a zeroed
; register (two shufps without SSE4.1, one blendps / vblendps with it) so
; that lane 2 ends up zero.
180 define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline ssp {
181 ; SSE2-LABEL: merge_4f32_f32_34z6:
183 ; SSE2-NEXT: movups 12(%rdi), %xmm0
184 ; SSE2-NEXT: xorps %xmm1, %xmm1
185 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
186 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
189 ; SSE41-LABEL: merge_4f32_f32_34z6:
191 ; SSE41-NEXT: movups 12(%rdi), %xmm1
192 ; SSE41-NEXT: xorps %xmm0, %xmm0
193 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
196 ; AVX-LABEL: merge_4f32_f32_34z6:
198 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
199 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3]
202 ; X86-SSE1-LABEL: merge_4f32_f32_34z6:
204 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
205 ; X86-SSE1-NEXT: movups 12(%eax), %xmm0
206 ; X86-SSE1-NEXT: xorps %xmm1, %xmm1
207 ; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
208 ; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
209 ; X86-SSE1-NEXT: retl
211 ; X86-SSE41-LABEL: merge_4f32_f32_34z6:
212 ; X86-SSE41: # %bb.0:
213 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
214 ; X86-SSE41-NEXT: movups 12(%eax), %xmm1
215 ; X86-SSE41-NEXT: xorps %xmm0, %xmm0
216 ; X86-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
217 ; X86-SSE41-NEXT: retl
218 %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
219 %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
220 %ptr3 = getelementptr inbounds float, float* %ptr, i64 6
221 %val0 = load float, float* %ptr0
222 %val1 = load float, float* %ptr1
223 %val3 = load float, float* %ptr3
224 %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
225 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
226 %res3 = insertelement <4 x float> %res1, float %val3, i32 3
227 ret <4 x float> %res3
; Builds a <4 x float> on top of zeroinitializer with lanes 0 and 1 taken
; from elements 4 and 5 of %ptr; lanes 2 and 3 stay zero.
; The x86-64 and i686 +sse4.1 checks expect one 64-bit load that zeroes the
; upper half (movsd / vmovsd); the i686 +sse checks expect xorps + movlps.
230 define <4 x float> @merge_4f32_f32_45zz(float* %ptr) nounwind uwtable noinline ssp {
231 ; SSE-LABEL: merge_4f32_f32_45zz:
233 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
236 ; AVX-LABEL: merge_4f32_f32_45zz:
238 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
241 ; X86-SSE1-LABEL: merge_4f32_f32_45zz:
243 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
244 ; X86-SSE1-NEXT: xorps %xmm0, %xmm0
245 ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
246 ; X86-SSE1-NEXT: retl
248 ; X86-SSE41-LABEL: merge_4f32_f32_45zz:
249 ; X86-SSE41: # %bb.0:
250 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
251 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
252 ; X86-SSE41-NEXT: retl
253 %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
254 %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
255 %val0 = load float, float* %ptr0
256 %val1 = load float, float* %ptr1
257 %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
258 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
259 ret <4 x float> %res1
; Builds a <4 x float> from elements 0, 1 and 2 of %ptr in lanes 0..2, with
; lane 3 explicitly set to undef.
; The checks do not expect a single 16-byte load here. They expect a 64-bit
; load for lanes 0 and 1 plus a separate scalar load for lane 2, combined
; with movlhps (no SSE4.1) or insertps / vinsertps (SSE4.1 and later).
262 define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline ssp {
263 ; SSE2-LABEL: merge_4f32_f32_012u:
265 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
266 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
267 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
270 ; SSE41-LABEL: merge_4f32_f32_012u:
272 ; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
273 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
276 ; AVX-LABEL: merge_4f32_f32_012u:
278 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
279 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
282 ; X86-SSE1-LABEL: merge_4f32_f32_012u:
284 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
285 ; X86-SSE1-NEXT: xorps %xmm0, %xmm0
286 ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
287 ; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
288 ; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
289 ; X86-SSE1-NEXT: retl
291 ; X86-SSE41-LABEL: merge_4f32_f32_012u:
292 ; X86-SSE41: # %bb.0:
293 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
294 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
295 ; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
296 ; X86-SSE41-NEXT: retl
297 %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
298 %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
299 %ptr2 = getelementptr inbounds float, float* %ptr, i64 2
300 %val0 = load float, float* %ptr0
301 %val1 = load float, float* %ptr1
302 %val2 = load float, float* %ptr2
303 %res0 = insertelement <4 x float> undef, float %val0, i32 0
304 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
305 %res2 = insertelement <4 x float> %res1, float %val2, i32 2
306 %res3 = insertelement <4 x float> %res2, float undef, i32 3
307 ret <4 x float> %res3
; Builds a <4 x float> with lanes 0 and 1 taken from elements 0 and 1 of
; %ptr, lane 2 taken from element 9 (not adjacent to the first two), and
; lane 3 explicitly set to undef.
; The checks expect a 64-bit load for lanes 0 and 1 plus a separate scalar
; load for lane 2, combined with movlhps or insertps / vinsertps.
310 define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline ssp {
311 ; SSE2-LABEL: merge_4f32_f32_019u:
313 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
314 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
315 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
318 ; SSE41-LABEL: merge_4f32_f32_019u:
320 ; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
321 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
324 ; AVX-LABEL: merge_4f32_f32_019u:
326 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
327 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
330 ; X86-SSE1-LABEL: merge_4f32_f32_019u:
332 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
333 ; X86-SSE1-NEXT: xorps %xmm0, %xmm0
334 ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
335 ; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
336 ; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
337 ; X86-SSE1-NEXT: retl
339 ; X86-SSE41-LABEL: merge_4f32_f32_019u:
340 ; X86-SSE41: # %bb.0:
341 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
342 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
343 ; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
344 ; X86-SSE41-NEXT: retl
345 %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
346 %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
347 %ptr2 = getelementptr inbounds float, float* %ptr, i64 9
348 %val0 = load float, float* %ptr0
349 %val1 = load float, float* %ptr1
350 %val2 = load float, float* %ptr2
351 %res0 = insertelement <4 x float> undef, float %val0, i32 0
352 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
353 %res2 = insertelement <4 x float> %res1, float %val2, i32 2
354 %res3 = insertelement <4 x float> %res2, float undef, i32 3
355 ret <4 x float> %res3
; Builds a <4 x i32> with lanes 0 and 1 taken from elements 2 and 3 of %ptr
; and lane 3 taken from element 5; lane 2 is never written and stays undef.
; The x86-64 and i686 +sse4.1 checks expect one unaligned 16-byte load at
; byte offset 8; the i686 +sse checks expect three 32-bit integer loads that
; are stored to offsets 0, 4 and 12 from the address held in %eax.
358 define <4 x i32> @merge_4i32_i32_23u5(i32* %ptr) nounwind uwtable noinline ssp {
359 ; SSE-LABEL: merge_4i32_i32_23u5:
361 ; SSE-NEXT: movups 8(%rdi), %xmm0
364 ; AVX-LABEL: merge_4i32_i32_23u5:
366 ; AVX-NEXT: vmovups 8(%rdi), %xmm0
369 ; X86-SSE1-LABEL: merge_4i32_i32_23u5:
371 ; X86-SSE1-NEXT: pushl %esi
372 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
373 ; X86-SSE1-NEXT: .cfi_offset %esi, -8
374 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
375 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
376 ; X86-SSE1-NEXT: movl 8(%ecx), %edx
377 ; X86-SSE1-NEXT: movl 12(%ecx), %esi
378 ; X86-SSE1-NEXT: movl 20(%ecx), %ecx
379 ; X86-SSE1-NEXT: movl %esi, 4(%eax)
380 ; X86-SSE1-NEXT: movl %edx, (%eax)
381 ; X86-SSE1-NEXT: movl %ecx, 12(%eax)
382 ; X86-SSE1-NEXT: popl %esi
383 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
384 ; X86-SSE1-NEXT: retl $4
386 ; X86-SSE41-LABEL: merge_4i32_i32_23u5:
387 ; X86-SSE41: # %bb.0:
388 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
389 ; X86-SSE41-NEXT: movups 8(%eax), %xmm0
390 ; X86-SSE41-NEXT: retl
391 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
392 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
393 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
394 %val0 = load i32, i32* %ptr0
395 %val1 = load i32, i32* %ptr1
396 %val3 = load i32, i32* %ptr3
397 %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
398 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
399 %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
; Same lane layout as a load of elements 2, 3 and 5 into lanes 0, 1 and 3,
; but element 2 is additionally incremented and stored back to memory
; between its load and the loads of elements 3 and 5.
; The x86-64 and i686 +sse4.1 checks still expect one 16-byte load at byte
; offset 8, followed by an in-memory increment of the dword at offset 8;
; the vector must hold the value read before the increment (%val0).
403 define <4 x i32> @merge_4i32_i32_23u5_inc2(i32* %ptr) nounwind uwtable noinline ssp {
404 ; SSE-LABEL: merge_4i32_i32_23u5_inc2:
406 ; SSE-NEXT: movups 8(%rdi), %xmm0
407 ; SSE-NEXT: incl 8(%rdi)
410 ; AVX-LABEL: merge_4i32_i32_23u5_inc2:
412 ; AVX-NEXT: vmovups 8(%rdi), %xmm0
413 ; AVX-NEXT: incl 8(%rdi)
416 ; X86-SSE1-LABEL: merge_4i32_i32_23u5_inc2:
418 ; X86-SSE1-NEXT: pushl %edi
419 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
420 ; X86-SSE1-NEXT: pushl %esi
421 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
422 ; X86-SSE1-NEXT: .cfi_offset %esi, -12
423 ; X86-SSE1-NEXT: .cfi_offset %edi, -8
424 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
425 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
426 ; X86-SSE1-NEXT: movl 8(%ecx), %edx
427 ; X86-SSE1-NEXT: movl 12(%ecx), %esi
428 ; X86-SSE1-NEXT: leal 1(%edx), %edi
429 ; X86-SSE1-NEXT: movl %edi, 8(%ecx)
430 ; X86-SSE1-NEXT: movl 20(%ecx), %ecx
431 ; X86-SSE1-NEXT: movl %esi, 4(%eax)
432 ; X86-SSE1-NEXT: movl %edx, (%eax)
433 ; X86-SSE1-NEXT: movl %ecx, 12(%eax)
434 ; X86-SSE1-NEXT: popl %esi
435 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
436 ; X86-SSE1-NEXT: popl %edi
437 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
438 ; X86-SSE1-NEXT: retl $4
440 ; X86-SSE41-LABEL: merge_4i32_i32_23u5_inc2:
441 ; X86-SSE41: # %bb.0:
442 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
443 ; X86-SSE41-NEXT: movups 8(%eax), %xmm0
444 ; X86-SSE41-NEXT: incl 8(%eax)
445 ; X86-SSE41-NEXT: retl
446 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
447 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
448 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
449 %val0 = load i32, i32* %ptr0
450 %inc = add i32 %val0, 1
451 store i32 %inc, i32* %ptr0
452 %val1 = load i32, i32* %ptr1
453 %val3 = load i32, i32* %ptr3
454 %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
455 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
456 %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
; Loads elements 2, 3 and 5 of %ptr into lanes 0, 1 and 3 of a <4 x i32>,
; and additionally increments element 3 in memory after it has been loaded
; (the store sits between the load of element 3 and the load of element 5).
; The x86-64 and i686 +sse4.1 checks still expect one 16-byte load at byte
; offset 8, followed by an in-memory increment of the dword at offset 12;
; the vector must hold the value read before the increment (%val1).
460 define <4 x i32> @merge_4i32_i32_23u5_inc3(i32* %ptr) nounwind uwtable noinline ssp {
461 ; SSE-LABEL: merge_4i32_i32_23u5_inc3:
463 ; SSE-NEXT: movups 8(%rdi), %xmm0
464 ; SSE-NEXT: incl 12(%rdi)
467 ; AVX-LABEL: merge_4i32_i32_23u5_inc3:
469 ; AVX-NEXT: vmovups 8(%rdi), %xmm0
470 ; AVX-NEXT: incl 12(%rdi)
473 ; X86-SSE1-LABEL: merge_4i32_i32_23u5_inc3:
475 ; X86-SSE1-NEXT: pushl %edi
476 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
477 ; X86-SSE1-NEXT: pushl %esi
478 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
479 ; X86-SSE1-NEXT: .cfi_offset %esi, -12
480 ; X86-SSE1-NEXT: .cfi_offset %edi, -8
481 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
482 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
483 ; X86-SSE1-NEXT: movl 8(%ecx), %edx
484 ; X86-SSE1-NEXT: movl 12(%ecx), %esi
485 ; X86-SSE1-NEXT: leal 1(%esi), %edi
486 ; X86-SSE1-NEXT: movl %edi, 12(%ecx)
487 ; X86-SSE1-NEXT: movl 20(%ecx), %ecx
488 ; X86-SSE1-NEXT: movl %esi, 4(%eax)
489 ; X86-SSE1-NEXT: movl %edx, (%eax)
490 ; X86-SSE1-NEXT: movl %ecx, 12(%eax)
491 ; X86-SSE1-NEXT: popl %esi
492 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
493 ; X86-SSE1-NEXT: popl %edi
494 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
495 ; X86-SSE1-NEXT: retl $4
497 ; X86-SSE41-LABEL: merge_4i32_i32_23u5_inc3:
498 ; X86-SSE41: # %bb.0:
499 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
500 ; X86-SSE41-NEXT: movups 8(%eax), %xmm0
501 ; X86-SSE41-NEXT: incl 12(%eax)
502 ; X86-SSE41-NEXT: retl
503 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
504 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
505 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
506 %val0 = load i32, i32* %ptr0
507 %val1 = load i32, i32* %ptr1
508 %inc = add i32 %val1, 1
509 store i32 %inc, i32* %ptr1
510 %val3 = load i32, i32* %ptr3
511 %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
512 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
513 %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
; Builds a <4 x i32> whose lane 0 is element 3 of %ptr and whose lane 1 is
; the constant 0; lanes 2 and 3 are left undef.
; The x86-64 and i686 +sse4.1 checks expect a single 32-bit load that zeroes
; the other lanes (movss / vmovss); the i686 +sse checks expect one 32-bit
; integer copy plus a store of the immediate 0 at offset 4.
517 define <4 x i32> @merge_4i32_i32_3zuu(i32* %ptr) nounwind uwtable noinline ssp {
518 ; SSE-LABEL: merge_4i32_i32_3zuu:
520 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
523 ; AVX-LABEL: merge_4i32_i32_3zuu:
525 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
528 ; X86-SSE1-LABEL: merge_4i32_i32_3zuu:
530 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
531 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
532 ; X86-SSE1-NEXT: movl 12(%ecx), %ecx
533 ; X86-SSE1-NEXT: movl %ecx, (%eax)
534 ; X86-SSE1-NEXT: movl $0, 4(%eax)
535 ; X86-SSE1-NEXT: retl $4
537 ; X86-SSE41-LABEL: merge_4i32_i32_3zuu:
538 ; X86-SSE41: # %bb.0:
539 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
540 ; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
541 ; X86-SSE41-NEXT: retl
542 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
543 %val0 = load i32, i32* %ptr0
544 %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
545 %res1 = insertelement <4 x i32> %res0, i32 0, i32 1
; Builds a <4 x i32> whose lanes 0 and 1 are elements 3 and 4 of %ptr;
; lanes 2 and 3 are left undef.
; The x86-64 and i686 +sse4.1 checks expect one 64-bit load (movsd / vmovsd);
; the i686 +sse checks expect two 32-bit integer loads and two stores.
549 define <4 x i32> @merge_4i32_i32_34uu(i32* %ptr) nounwind uwtable noinline ssp {
550 ; SSE-LABEL: merge_4i32_i32_34uu:
552 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
555 ; AVX-LABEL: merge_4i32_i32_34uu:
557 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
560 ; X86-SSE1-LABEL: merge_4i32_i32_34uu:
562 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
563 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
564 ; X86-SSE1-NEXT: movl 12(%ecx), %edx
565 ; X86-SSE1-NEXT: movl 16(%ecx), %ecx
566 ; X86-SSE1-NEXT: movl %ecx, 4(%eax)
567 ; X86-SSE1-NEXT: movl %edx, (%eax)
568 ; X86-SSE1-NEXT: retl $4
570 ; X86-SSE41-LABEL: merge_4i32_i32_34uu:
571 ; X86-SSE41: # %bb.0:
572 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
573 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
574 ; X86-SSE41-NEXT: retl
575 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
576 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 4
577 %val0 = load i32, i32* %ptr0
578 %val1 = load i32, i32* %ptr1
579 %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
580 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
; Builds a <4 x i32> on top of zeroinitializer with lanes 0 and 1 taken from
; elements 4 and 5 of %ptr; lanes 2 and 3 stay zero.
; The x86-64 and i686 +sse4.1 checks expect one 64-bit load that zeroes the
; upper half (movsd / vmovsd); the i686 +sse checks expect two 32-bit
; integer copies plus two stores of the immediate 0 at offsets 8 and 12.
584 define <4 x i32> @merge_4i32_i32_45zz(i32* %ptr) nounwind uwtable noinline ssp {
585 ; SSE-LABEL: merge_4i32_i32_45zz:
587 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
590 ; AVX-LABEL: merge_4i32_i32_45zz:
592 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
595 ; X86-SSE1-LABEL: merge_4i32_i32_45zz:
597 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
598 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
599 ; X86-SSE1-NEXT: movl 16(%ecx), %edx
600 ; X86-SSE1-NEXT: movl 20(%ecx), %ecx
601 ; X86-SSE1-NEXT: movl %ecx, 4(%eax)
602 ; X86-SSE1-NEXT: movl %edx, (%eax)
603 ; X86-SSE1-NEXT: movl $0, 12(%eax)
604 ; X86-SSE1-NEXT: movl $0, 8(%eax)
605 ; X86-SSE1-NEXT: retl $4
607 ; X86-SSE41-LABEL: merge_4i32_i32_45zz:
608 ; X86-SSE41: # %bb.0:
609 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
610 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
611 ; X86-SSE41-NEXT: retl
612 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
613 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
614 %val0 = load i32, i32* %ptr0
615 %val1 = load i32, i32* %ptr1
616 %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
617 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
; Loads elements 4 and 5 of %ptr into lanes 0 and 1 of a zero-initialised
; <4 x i32>, and additionally increments element 4 in memory between its
; load and the load of element 5.
; The x86-64 and i686 +sse4.1 checks still expect one 64-bit load, followed
; by an in-memory increment of the dword at byte offset 16; the vector must
; hold the value read before the increment (%val0).
621 define <4 x i32> @merge_4i32_i32_45zz_inc4(i32* %ptr) nounwind uwtable noinline ssp {
622 ; SSE-LABEL: merge_4i32_i32_45zz_inc4:
624 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
625 ; SSE-NEXT: incl 16(%rdi)
628 ; AVX-LABEL: merge_4i32_i32_45zz_inc4:
630 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
631 ; AVX-NEXT: incl 16(%rdi)
634 ; X86-SSE1-LABEL: merge_4i32_i32_45zz_inc4:
636 ; X86-SSE1-NEXT: pushl %edi
637 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
638 ; X86-SSE1-NEXT: pushl %esi
639 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
640 ; X86-SSE1-NEXT: .cfi_offset %esi, -12
641 ; X86-SSE1-NEXT: .cfi_offset %edi, -8
642 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
643 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
644 ; X86-SSE1-NEXT: movl 16(%ecx), %edx
645 ; X86-SSE1-NEXT: movl 20(%ecx), %esi
646 ; X86-SSE1-NEXT: leal 1(%edx), %edi
647 ; X86-SSE1-NEXT: movl %edi, 16(%ecx)
648 ; X86-SSE1-NEXT: movl %esi, 4(%eax)
649 ; X86-SSE1-NEXT: movl %edx, (%eax)
650 ; X86-SSE1-NEXT: movl $0, 12(%eax)
651 ; X86-SSE1-NEXT: movl $0, 8(%eax)
652 ; X86-SSE1-NEXT: popl %esi
653 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
654 ; X86-SSE1-NEXT: popl %edi
655 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
656 ; X86-SSE1-NEXT: retl $4
658 ; X86-SSE41-LABEL: merge_4i32_i32_45zz_inc4:
659 ; X86-SSE41: # %bb.0:
660 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
661 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
662 ; X86-SSE41-NEXT: incl 16(%eax)
663 ; X86-SSE41-NEXT: retl
664 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
665 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
666 %val0 = load i32, i32* %ptr0
667 %inc = add i32 %val0, 1
668 store i32 %inc, i32* %ptr0
669 %val1 = load i32, i32* %ptr1
670 %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
671 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
; Loads elements 4 and 5 of %ptr into lanes 0 and 1 of a zero-initialised
; <4 x i32>, and additionally increments element 5 in memory after both
; loads have been performed.
; The x86-64 and i686 +sse4.1 checks still expect one 64-bit load, followed
; by an in-memory increment of the dword at byte offset 20; the vector must
; hold the value read before the increment (%val1).
675 define <4 x i32> @merge_4i32_i32_45zz_inc5(i32* %ptr) nounwind uwtable noinline ssp {
676 ; SSE-LABEL: merge_4i32_i32_45zz_inc5:
678 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
679 ; SSE-NEXT: incl 20(%rdi)
682 ; AVX-LABEL: merge_4i32_i32_45zz_inc5:
684 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
685 ; AVX-NEXT: incl 20(%rdi)
688 ; X86-SSE1-LABEL: merge_4i32_i32_45zz_inc5:
690 ; X86-SSE1-NEXT: pushl %edi
691 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
692 ; X86-SSE1-NEXT: pushl %esi
693 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
694 ; X86-SSE1-NEXT: .cfi_offset %esi, -12
695 ; X86-SSE1-NEXT: .cfi_offset %edi, -8
696 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
697 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
698 ; X86-SSE1-NEXT: movl 16(%ecx), %edx
699 ; X86-SSE1-NEXT: movl 20(%ecx), %esi
700 ; X86-SSE1-NEXT: leal 1(%esi), %edi
701 ; X86-SSE1-NEXT: movl %edi, 20(%ecx)
702 ; X86-SSE1-NEXT: movl %esi, 4(%eax)
703 ; X86-SSE1-NEXT: movl %edx, (%eax)
704 ; X86-SSE1-NEXT: movl $0, 12(%eax)
705 ; X86-SSE1-NEXT: movl $0, 8(%eax)
706 ; X86-SSE1-NEXT: popl %esi
707 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
708 ; X86-SSE1-NEXT: popl %edi
709 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
710 ; X86-SSE1-NEXT: retl $4
712 ; X86-SSE41-LABEL: merge_4i32_i32_45zz_inc5:
713 ; X86-SSE41: # %bb.0:
714 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
715 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
716 ; X86-SSE41-NEXT: incl 20(%eax)
717 ; X86-SSE41-NEXT: retl
718 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
719 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
720 %val0 = load i32, i32* %ptr0
721 %val1 = load i32, i32* %ptr1
722 %inc = add i32 %val1, 1
723 store i32 %inc, i32* %ptr1
724 %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
725 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
; Builds an <8 x i16> with lanes 0, 1, 3, 4, 5 and 7 taken from elements
; 2, 3, 5, 6, 7 and 9 of %ptr; lanes 2 and 6 are never written and stay
; undef.
; The x86-64 and i686 +sse4.1 checks expect one unaligned 16-byte load at
; byte offset 4; the i686 +sse checks expect a mix of 32-bit and 16-bit
; integer loads and stores.
729 define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline ssp {
730 ; SSE-LABEL: merge_8i16_i16_23u567u9:
732 ; SSE-NEXT: movups 4(%rdi), %xmm0
735 ; AVX-LABEL: merge_8i16_i16_23u567u9:
737 ; AVX-NEXT: vmovups 4(%rdi), %xmm0
740 ; X86-SSE1-LABEL: merge_8i16_i16_23u567u9:
742 ; X86-SSE1-NEXT: pushl %edi
743 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
744 ; X86-SSE1-NEXT: pushl %esi
745 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
746 ; X86-SSE1-NEXT: .cfi_offset %esi, -12
747 ; X86-SSE1-NEXT: .cfi_offset %edi, -8
748 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
749 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
750 ; X86-SSE1-NEXT: movl 4(%ecx), %edx
751 ; X86-SSE1-NEXT: movl 10(%ecx), %esi
752 ; X86-SSE1-NEXT: movzwl 14(%ecx), %edi
753 ; X86-SSE1-NEXT: movzwl 18(%ecx), %ecx
754 ; X86-SSE1-NEXT: movw %di, 10(%eax)
755 ; X86-SSE1-NEXT: movw %cx, 14(%eax)
756 ; X86-SSE1-NEXT: movl %esi, 6(%eax)
757 ; X86-SSE1-NEXT: movl %edx, (%eax)
758 ; X86-SSE1-NEXT: popl %esi
759 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
760 ; X86-SSE1-NEXT: popl %edi
761 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
762 ; X86-SSE1-NEXT: retl $4
764 ; X86-SSE41-LABEL: merge_8i16_i16_23u567u9:
765 ; X86-SSE41: # %bb.0:
766 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
767 ; X86-SSE41-NEXT: movups 4(%eax), %xmm0
768 ; X86-SSE41-NEXT: retl
769 %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
770 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
771 %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 5
772 %ptr4 = getelementptr inbounds i16, i16* %ptr, i64 6
773 %ptr5 = getelementptr inbounds i16, i16* %ptr, i64 7
774 %ptr7 = getelementptr inbounds i16, i16* %ptr, i64 9
775 %val0 = load i16, i16* %ptr0
776 %val1 = load i16, i16* %ptr1
777 %val3 = load i16, i16* %ptr3
778 %val4 = load i16, i16* %ptr4
779 %val5 = load i16, i16* %ptr5
780 %val7 = load i16, i16* %ptr7
781 %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
782 %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
783 %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
784 %res4 = insertelement <8 x i16> %res3, i16 %val4, i32 4
785 %res5 = insertelement <8 x i16> %res4, i16 %val5, i32 5
786 %res7 = insertelement <8 x i16> %res5, i16 %val7, i32 7
; Builds an <8 x i16> whose lanes 0 and 1 are elements 3 and 4 of %ptr;
; lanes 2..7 are left undef.
; The x86-64 and i686 +sse4.1 checks expect one 32-bit load (movss / vmovss);
; the i686 +sse checks expect a single 32-bit integer load at byte offset 6
; that is stored to the address held in %eax.
790 define <8 x i16> @merge_8i16_i16_34uuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
791 ; SSE-LABEL: merge_8i16_i16_34uuuuuu:
793 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
796 ; AVX-LABEL: merge_8i16_i16_34uuuuuu:
798 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
801 ; X86-SSE1-LABEL: merge_8i16_i16_34uuuuuu:
803 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
804 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
805 ; X86-SSE1-NEXT: movl 6(%ecx), %ecx
806 ; X86-SSE1-NEXT: movl %ecx, (%eax)
807 ; X86-SSE1-NEXT: retl $4
809 ; X86-SSE41-LABEL: merge_8i16_i16_34uuuuuu:
810 ; X86-SSE41: # %bb.0:
811 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
812 ; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
813 ; X86-SSE41-NEXT: retl
814 %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 3
815 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 4
816 %val0 = load i16, i16* %ptr0
817 %val1 = load i16, i16* %ptr1
818 %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
819 %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
; Builds an <8 x i16> with lanes 0, 1 and 3 taken from elements 4, 5 and 7
; of %ptr, lane 2 left undef, and lanes 4..7 explicitly set to 0.
; The x86-64 and i686 +sse4.1 checks expect one 64-bit load that zeroes the
; upper half (movsd / vmovsd); the i686 +sse checks expect scalar integer
; copies plus two stores of the immediate 0 at offsets 8 and 12.
823 define <8 x i16> @merge_8i16_i16_45u7zzzz(i16* %ptr) nounwind uwtable noinline ssp {
824 ; SSE-LABEL: merge_8i16_i16_45u7zzzz:
826 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
829 ; AVX-LABEL: merge_8i16_i16_45u7zzzz:
831 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
834 ; X86-SSE1-LABEL: merge_8i16_i16_45u7zzzz:
836 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
837 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
838 ; X86-SSE1-NEXT: movl 8(%ecx), %edx
839 ; X86-SSE1-NEXT: movzwl 14(%ecx), %ecx
840 ; X86-SSE1-NEXT: movw %cx, 6(%eax)
841 ; X86-SSE1-NEXT: movl %edx, (%eax)
842 ; X86-SSE1-NEXT: movl $0, 12(%eax)
843 ; X86-SSE1-NEXT: movl $0, 8(%eax)
844 ; X86-SSE1-NEXT: retl $4
846 ; X86-SSE41-LABEL: merge_8i16_i16_45u7zzzz:
847 ; X86-SSE41: # %bb.0:
848 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
849 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
850 ; X86-SSE41-NEXT: retl
851 %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
852 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
853 %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
854 %val0 = load i16, i16* %ptr0
855 %val1 = load i16, i16* %ptr1
856 %val3 = load i16, i16* %ptr3
857 %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
858 %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
859 %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
860 %res4 = insertelement <8 x i16> %res3, i16 0, i32 4
861 %res5 = insertelement <8 x i16> %res4, i16 0, i32 5
862 %res6 = insertelement <8 x i16> %res5, i16 0, i32 6
863 %res7 = insertelement <8 x i16> %res6, i16 0, i32 7
; Builds a <16 x i8> in which every written lane N holds byte N of %ptr.
; Lanes 0, 1, 3..13 and 15 are written; lanes 2 and 14 are never written
; and stay undef.
; The x86-64 and i686 +sse4.1 checks expect one unaligned 16-byte load at
; byte offset 0; the i686 +sse checks expect a sequence of 8-, 16- and
; 32-bit integer loads and stores.
867 define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(i8* %ptr) nounwind uwtable noinline ssp {
868 ; SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
870 ; SSE-NEXT: movups (%rdi), %xmm0
873 ; AVX-LABEL: merge_16i8_i8_01u3456789ABCDuF:
875 ; AVX-NEXT: vmovups (%rdi), %xmm0
878 ; X86-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
880 ; X86-SSE1-NEXT: pushl %ebp
881 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
882 ; X86-SSE1-NEXT: pushl %ebx
883 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
884 ; X86-SSE1-NEXT: pushl %edi
885 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 16
886 ; X86-SSE1-NEXT: pushl %esi
887 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 20
888 ; X86-SSE1-NEXT: .cfi_offset %esi, -20
889 ; X86-SSE1-NEXT: .cfi_offset %edi, -16
890 ; X86-SSE1-NEXT: .cfi_offset %ebx, -12
891 ; X86-SSE1-NEXT: .cfi_offset %ebp, -8
892 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
893 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
894 ; X86-SSE1-NEXT: movzwl (%ecx), %ebp
895 ; X86-SSE1-NEXT: movl 3(%ecx), %esi
896 ; X86-SSE1-NEXT: movl 7(%ecx), %edi
897 ; X86-SSE1-NEXT: movzwl 11(%ecx), %ebx
898 ; X86-SSE1-NEXT: movb 13(%ecx), %dl
899 ; X86-SSE1-NEXT: movb 15(%ecx), %cl
900 ; X86-SSE1-NEXT: movb %dl, 13(%eax)
901 ; X86-SSE1-NEXT: movb %cl, 15(%eax)
902 ; X86-SSE1-NEXT: movw %bx, 11(%eax)
903 ; X86-SSE1-NEXT: movl %edi, 7(%eax)
904 ; X86-SSE1-NEXT: movl %esi, 3(%eax)
905 ; X86-SSE1-NEXT: movw %bp, (%eax)
906 ; X86-SSE1-NEXT: popl %esi
907 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 16
908 ; X86-SSE1-NEXT: popl %edi
909 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
910 ; X86-SSE1-NEXT: popl %ebx
911 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
912 ; X86-SSE1-NEXT: popl %ebp
913 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
914 ; X86-SSE1-NEXT: retl $4
916 ; X86-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
917 ; X86-SSE41: # %bb.0:
918 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
919 ; X86-SSE41-NEXT: movups (%eax), %xmm0
920 ; X86-SSE41-NEXT: retl
921 %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
922 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
923 %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
924 %ptr4 = getelementptr inbounds i8, i8* %ptr, i64 4
925 %ptr5 = getelementptr inbounds i8, i8* %ptr, i64 5
926 %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6
927 %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7
928 %ptr8 = getelementptr inbounds i8, i8* %ptr, i64 8
929 %ptr9 = getelementptr inbounds i8, i8* %ptr, i64 9
930 %ptrA = getelementptr inbounds i8, i8* %ptr, i64 10
931 %ptrB = getelementptr inbounds i8, i8* %ptr, i64 11
932 %ptrC = getelementptr inbounds i8, i8* %ptr, i64 12
933 %ptrD = getelementptr inbounds i8, i8* %ptr, i64 13
934 %ptrF = getelementptr inbounds i8, i8* %ptr, i64 15
935 %val0 = load i8, i8* %ptr0
936 %val1 = load i8, i8* %ptr1
937 %val3 = load i8, i8* %ptr3
938 %val4 = load i8, i8* %ptr4
939 %val5 = load i8, i8* %ptr5
940 %val6 = load i8, i8* %ptr6
941 %val7 = load i8, i8* %ptr7
942 %val8 = load i8, i8* %ptr8
943 %val9 = load i8, i8* %ptr9
944 %valA = load i8, i8* %ptrA
945 %valB = load i8, i8* %ptrB
946 %valC = load i8, i8* %ptrC
947 %valD = load i8, i8* %ptrD
948 %valF = load i8, i8* %ptrF
949 %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
950 %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
951 %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
952 %res4 = insertelement <16 x i8> %res3, i8 %val4, i32 4
953 %res5 = insertelement <16 x i8> %res4, i8 %val5, i32 5
954 %res6 = insertelement <16 x i8> %res5, i8 %val6, i32 6
955 %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
956 %res8 = insertelement <16 x i8> %res7, i8 %val8, i32 8
957 %res9 = insertelement <16 x i8> %res8, i8 %val9, i32 9
958 %resA = insertelement <16 x i8> %res9, i8 %valA, i32 10
959 %resB = insertelement <16 x i8> %resA, i8 %valB, i32 11
960 %resC = insertelement <16 x i8> %resB, i8 %valC, i32 12
961 %resD = insertelement <16 x i8> %resC, i8 %valD, i32 13
962 %resF = insertelement <16 x i8> %resD, i8 %valF, i32 15
; Builds a <16 x i8> from bytes loaded at offsets 0, 1 and 3 of %ptr (placed in
; lanes 0, 1 and 3) plus constant zeros in lanes 6, 7, 13, 14 and 15. Every
; other lane is left undef. The function name spells this out lane by lane
; (digit = byte offset loaded, u = undef, z = zero).
; The x86-64 checks and the i686 SSE4.1 checks expect one 4-byte scalar load
; whose upper lanes are zero. The i686 SSE1 checks expect scalar loads and
; stores through a pointer read from the stack.
966 define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
967 ; SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
969 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
972 ; AVX-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
974 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
977 ; X86-SSE1-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
979 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
980 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
981 ; X86-SSE1-NEXT: movzwl (%ecx), %edx
982 ; X86-SSE1-NEXT: movb 3(%ecx), %cl
983 ; X86-SSE1-NEXT: movb %cl, 3(%eax)
984 ; X86-SSE1-NEXT: movw %dx, (%eax)
985 ; X86-SSE1-NEXT: movb $0, 15(%eax)
986 ; X86-SSE1-NEXT: movw $0, 13(%eax)
987 ; X86-SSE1-NEXT: movw $0, 6(%eax)
988 ; X86-SSE1-NEXT: retl $4
990 ; X86-SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
991 ; X86-SSE41: # %bb.0:
992 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
993 ; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
994 ; X86-SSE41-NEXT: retl
; Byte loads at offsets 0, 1 and 3. Offset 2 is deliberately never loaded.
995 %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
996 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
997 %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
998 %val0 = load i8, i8* %ptr0
999 %val1 = load i8, i8* %ptr1
1000 %val3 = load i8, i8* %ptr3
1001 %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
1002 %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
1003 %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
; Explicit zeros in lanes 6, 7, 13, 14 and 15. Lanes 2, 4, 5 and 8-12 stay undef.
1004 %res6 = insertelement <16 x i8> %res3, i8 0, i32 6
1005 %res7 = insertelement <16 x i8> %res6, i8 0, i32 7
1006 %resD = insertelement <16 x i8> %res7, i8 0, i32 13
1007 %resE = insertelement <16 x i8> %resD, i8 0, i32 14
1008 %resF = insertelement <16 x i8> %resE, i8 0, i32 15
; Builds a <16 x i8> from bytes loaded at offsets 0-3, 6 and 7 of %ptr (placed
; in the lanes with the same index) plus constant zeros in lanes 13, 14 and 15.
; Lanes 4, 5 and 8-12 are left undef. The function name spells this out lane by
; lane (digit = byte offset loaded, u = undef, z = zero).
; The x86-64 checks and the i686 SSE4.1 checks expect one 8-byte scalar load
; whose upper half is zero. The i686 SSE1 checks expect scalar loads and
; stores through a pointer read from the stack.
1012 define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
1013 ; SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1015 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1018 ; AVX-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1020 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1023 ; X86-SSE1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1024 ; X86-SSE1: # %bb.0:
1025 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1026 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1027 ; X86-SSE1-NEXT: movl (%ecx), %edx
1028 ; X86-SSE1-NEXT: movzwl 6(%ecx), %ecx
1029 ; X86-SSE1-NEXT: movw %cx, 6(%eax)
1030 ; X86-SSE1-NEXT: movl %edx, (%eax)
1031 ; X86-SSE1-NEXT: movb $0, 15(%eax)
1032 ; X86-SSE1-NEXT: movw $0, 13(%eax)
1033 ; X86-SSE1-NEXT: retl $4
1035 ; X86-SSE41-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1036 ; X86-SSE41: # %bb.0:
1037 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1038 ; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1039 ; X86-SSE41-NEXT: retl
; Byte loads at offsets 0-3, 6 and 7. Offsets 4 and 5 are deliberately never loaded.
1040 %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
1041 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
1042 %ptr2 = getelementptr inbounds i8, i8* %ptr, i64 2
1043 %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
1044 %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6
1045 %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7
1046 %val0 = load i8, i8* %ptr0
1047 %val1 = load i8, i8* %ptr1
1048 %val2 = load i8, i8* %ptr2
1049 %val3 = load i8, i8* %ptr3
1050 %val6 = load i8, i8* %ptr6
1051 %val7 = load i8, i8* %ptr7
1052 %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
1053 %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
1054 %res2 = insertelement <16 x i8> %res1, i8 %val2, i32 2
1055 %res3 = insertelement <16 x i8> %res2, i8 %val3, i32 3
1056 %res6 = insertelement <16 x i8> %res3, i8 %val6, i32 6
1057 %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
; Explicit zeros in lanes 13, 14 and 15. Lanes 4, 5 and 8-12 stay undef.
1058 %resD = insertelement <16 x i8> %res7, i8 0, i32 13
1059 %resE = insertelement <16 x i8> %resD, i8 0, i32 14
1060 %resF = insertelement <16 x i8> %resE, i8 0, i32 15
; Loads one i32 from %src, splats it across a <4 x i32>, applies a logical
; shift right by <0, undef, undef, undef> and an AND with <-1, 0, 0, 0>, then
; stores the result to %dst. The mask keeps lane 0 and clears lanes 1-3.
; The x86-64 checks and the i686 SSE4.1 checks expect a single scalar load with
; zeroed upper lanes followed by a movaps/vmovaps store. The i686 SSE1 checks
; expect two scalar loads combined with andps before the store.
1064 define void @merge_4i32_i32_combine(<4 x i32>* %dst, i32* %src) {
1065 ; SSE-LABEL: merge_4i32_i32_combine:
1067 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1068 ; SSE-NEXT: movaps %xmm0, (%rdi)
1071 ; AVX-LABEL: merge_4i32_i32_combine:
1073 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1074 ; AVX-NEXT: vmovaps %xmm0, (%rdi)
1077 ; X86-SSE1-LABEL: merge_4i32_i32_combine:
1078 ; X86-SSE1: # %bb.0:
1079 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1080 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1081 ; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1082 ; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1083 ; X86-SSE1-NEXT: andps %xmm0, %xmm1
1084 ; X86-SSE1-NEXT: movaps %xmm1, (%eax)
1085 ; X86-SSE1-NEXT: retl
1087 ; X86-SSE41-LABEL: merge_4i32_i32_combine:
1088 ; X86-SSE41: # %bb.0:
1089 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1090 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
1091 ; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1092 ; X86-SSE41-NEXT: movaps %xmm0, (%eax)
1093 ; X86-SSE41-NEXT: retl
1094 %1 = getelementptr i32, i32* %src, i32 0
1095 %2 = load i32, i32* %1
1096 %3 = insertelement <4 x i32> undef, i32 %2, i32 0
; An all-zero shuffle mask broadcasts lane 0 into every lane.
1097 %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
; The shift amount for lane 0 is 0, so lane 0 passes through unchanged.
1098 %5 = lshr <4 x i32> %4, <i32 0, i32 undef, i32 undef, i32 undef>
; Keep lane 0 and force lanes 1-3 to zero.
1099 %6 = and <4 x i32> %5, <i32 -1, i32 0, i32 0, i32 0>
1100 store <4 x i32> %6, <4 x i32>* %dst
1105 ; Consecutive loads that include any volatile load (some or all of them) must not be combined.
; Builds a <2 x i64> from two volatile i64 loads at element indices 1 and 2 of
; %ptr. These are the same indices used by merge_2i64_i64_12 near the top of
; this file, whose checks expect a single movups/vmovups from 8(%rdi).
; Here the x86-64 checks expect two separate 8-byte scalar loads joined with
; movlhps/vmovlhps. The i686 SSE4.1 checks expect a movd followed by three
; pinsrd instructions, and the i686 SSE1 checks expect four 32-bit loads and
; stores through a pointer read from the stack.
1108 define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinline ssp {
1109 ; SSE-LABEL: merge_2i64_i64_12_volatile:
1111 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1112 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
1113 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1116 ; AVX-LABEL: merge_2i64_i64_12_volatile:
1118 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1119 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1120 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1123 ; X86-SSE1-LABEL: merge_2i64_i64_12_volatile:
1124 ; X86-SSE1: # %bb.0:
1125 ; X86-SSE1-NEXT: pushl %edi
1126 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
1127 ; X86-SSE1-NEXT: pushl %esi
1128 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
1129 ; X86-SSE1-NEXT: .cfi_offset %esi, -12
1130 ; X86-SSE1-NEXT: .cfi_offset %edi, -8
1131 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1132 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1133 ; X86-SSE1-NEXT: movl 8(%ecx), %edx
1134 ; X86-SSE1-NEXT: movl 12(%ecx), %esi
1135 ; X86-SSE1-NEXT: movl 16(%ecx), %edi
1136 ; X86-SSE1-NEXT: movl 20(%ecx), %ecx
1137 ; X86-SSE1-NEXT: movl %ecx, 12(%eax)
1138 ; X86-SSE1-NEXT: movl %edi, 8(%eax)
1139 ; X86-SSE1-NEXT: movl %esi, 4(%eax)
1140 ; X86-SSE1-NEXT: movl %edx, (%eax)
1141 ; X86-SSE1-NEXT: popl %esi
1142 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
1143 ; X86-SSE1-NEXT: popl %edi
1144 ; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
1145 ; X86-SSE1-NEXT: retl $4
1147 ; X86-SSE41-LABEL: merge_2i64_i64_12_volatile:
1148 ; X86-SSE41: # %bb.0:
1149 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1150 ; X86-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1151 ; X86-SSE41-NEXT: pinsrd $1, 12(%eax), %xmm0
1152 ; X86-SSE41-NEXT: pinsrd $2, 16(%eax), %xmm0
1153 ; X86-SSE41-NEXT: pinsrd $3, 20(%eax), %xmm0
1154 ; X86-SSE41-NEXT: retl
1155 %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
1156 %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
; Both element loads are volatile.
1157 %val0 = load volatile i64, i64* %ptr0
1158 %val1 = load volatile i64, i64* %ptr1
1159 %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
1160 %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
; Builds a <4 x float> from four float loads at element indices 2, 3, 4 and 5
; of %ptr. Only the first load (index 2) is volatile.
; No run's checks expect a single 16-byte vector load. The SSE4.1 and AVX
; checks expect a scalar load followed by three insertps/vinsertps from
; memory. The SSE2 and i686 SSE1 checks expect two scalar loads, an unpcklps,
; and a movhps that fills the upper two lanes from memory.
1164 define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable noinline ssp {
1165 ; SSE2-LABEL: merge_4f32_f32_2345_volatile:
1167 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1168 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1169 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1170 ; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
1173 ; SSE41-LABEL: merge_4f32_f32_2345_volatile:
1175 ; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1176 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1177 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1178 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1181 ; AVX-LABEL: merge_4f32_f32_2345_volatile:
1183 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1184 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1185 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1186 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1189 ; X86-SSE1-LABEL: merge_4f32_f32_2345_volatile:
1190 ; X86-SSE1: # %bb.0:
1191 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1192 ; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1193 ; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1194 ; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1195 ; X86-SSE1-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
1196 ; X86-SSE1-NEXT: retl
1198 ; X86-SSE41-LABEL: merge_4f32_f32_2345_volatile:
1199 ; X86-SSE41: # %bb.0:
1200 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1201 ; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1202 ; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1203 ; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1204 ; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1205 ; X86-SSE41-NEXT: retl
1206 %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
1207 %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
1208 %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
1209 %ptr3 = getelementptr inbounds float, float* %ptr, i64 5
; Only this first load is volatile; the other three are ordinary loads.
1210 %val0 = load volatile float, float* %ptr0
1211 %val1 = load float, float* %ptr1
1212 %val2 = load float, float* %ptr2
1213 %val3 = load float, float* %ptr3
1214 %res0 = insertelement <4 x float> undef, float %val0, i32 0
1215 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
1216 %res2 = insertelement <4 x float> %res1, float %val2, i32 2
1217 %res3 = insertelement <4 x float> %res2, float %val3, i32 3
1218 ret <4 x float> %res3
1222 ; Non-consecutive test.
; Builds a <4 x float> from two unrelated pointers. Lane 0 is the float loaded
; from %ptr0, lane 1 is the constant 0.0, and lanes 2 and 3 both hold the float
; loaded from %ptr1.
; Every run's checks expect two scalar loads combined by one shufps/vshufps.
1225 define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwtable noinline ssp {
1226 ; SSE-LABEL: merge_4f32_f32_X0YY:
1228 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1229 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1230 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
1233 ; AVX-LABEL: merge_4f32_f32_X0YY:
1235 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1236 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1237 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
1240 ; X86-SSE-LABEL: merge_4f32_f32_X0YY:
1242 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1243 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1244 ; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1245 ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1246 ; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
1247 ; X86-SSE-NEXT: retl
1248 %val0 = load float, float* %ptr0, align 4
1249 %val1 = load float, float* %ptr1, align 4
1250 %res0 = insertelement <4 x float> undef, float %val0, i32 0
1251 %res1 = insertelement <4 x float> %res0, float 0.000000e+00, i32 1
; %val1 is reused for both upper lanes.
1252 %res2 = insertelement <4 x float> %res1, float %val1, i32 2
1253 %res3 = insertelement <4 x float> %res2, float %val1, i32 3
1254 ret <4 x float> %res3
1262 define <4 x i32> @load_i32_zext_i128_v4i32(i32* %ptr) {
1263 ; SSE-LABEL: load_i32_zext_i128_v4i32:
1265 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1268 ; AVX-LABEL: load_i32_zext_i128_v4i32:
1270 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1273 ; X86-SSE1-LABEL: load_i32_zext_i128_v4i32:
1274 ; X86-SSE1: # %bb.0:
1275 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1276 ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1277 ; X86-SSE1-NEXT: movl (%ecx), %ecx
1278 ; X86-SSE1-NEXT: movl %ecx, (%eax)
1279 ; X86-SSE1-NEXT: movl $0, 12(%eax)
1280 ; X86-SSE1-NEXT: movl $0, 8(%eax)
1281 ; X86-SSE1-NEXT: movl $0, 4(%eax)
1282 ; X86-SSE1-NEXT: retl $4
1284 ; X86-SSE41-LABEL: load_i32_zext_i128_v4i32:
1285 ; X86-SSE41: # %bb.0:
1286 ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1287 ; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1288 ; X86-SSE41-NEXT: retl
1289 %1 = load i32, i32* %ptr
1290 %2 = zext i32 %1 to i128
1291 %3 = bitcast i128 %2 to <4 x i32>