1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512F
8 ; 32-bit SSE tests to make sure we do reasonable things.
9 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=X32-SSE --check-prefix=X32-SSE1
10 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE --check-prefix=X32-SSE41
; Builds a <2 x double> from two scalar loads of %ptr elements 2 and 3 (byte
; offsets 16 and 24), placed in lanes 0 and 1.
; The x86_64 checks and the i686 +sse4.1 checks expect one unaligned 16-byte
; load at offset 16; the i686 +sse checks expect two x87 fldl loads instead.
12 define <2 x double> @merge_2f64_f64_23(double* %ptr) nounwind uwtable noinline ssp {
13 ; SSE-LABEL: merge_2f64_f64_23:
15 ; SSE-NEXT: movups 16(%rdi), %xmm0
18 ; AVX-LABEL: merge_2f64_f64_23:
20 ; AVX-NEXT: vmovups 16(%rdi), %xmm0
23 ; X32-SSE1-LABEL: merge_2f64_f64_23:
25 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
26 ; X32-SSE1-NEXT: fldl 16(%eax)
27 ; X32-SSE1-NEXT: fldl 24(%eax)
28 ; X32-SSE1-NEXT: fxch %st(1)
31 ; X32-SSE41-LABEL: merge_2f64_f64_23:
33 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
34 ; X32-SSE41-NEXT: movups 16(%eax), %xmm0
35 ; X32-SSE41-NEXT: retl
36 %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
37 %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
38 %val0 = load double, double* %ptr0
39 %val1 = load double, double* %ptr1
40 %res0 = insertelement <2 x double> undef, double %val0, i32 0
41 %res1 = insertelement <2 x double> %res0, double %val1, i32 1
42 ret <2 x double> %res1
; Builds a <2 x i64> from two scalar loads of %ptr elements 1 and 2 (byte
; offsets 8 and 16), placed in lanes 0 and 1.
; The x86_64 checks and the i686 +sse4.1 checks expect one unaligned 16-byte
; load at offset 8. The i686 +sse checks expect four 32-bit integer loads
; (offsets 8, 12, 16, 20) copied out through a pointer taken from the stack.
; NOTE(review): that pointer is presumably the indirect-return slot, given the
; "retl $4" epilogue - confirm against the i686 calling convention.
45 define <2 x i64> @merge_2i64_i64_12(i64* %ptr) nounwind uwtable noinline ssp {
46 ; SSE-LABEL: merge_2i64_i64_12:
48 ; SSE-NEXT: movups 8(%rdi), %xmm0
51 ; AVX-LABEL: merge_2i64_i64_12:
53 ; AVX-NEXT: vmovups 8(%rdi), %xmm0
56 ; X32-SSE1-LABEL: merge_2i64_i64_12:
58 ; X32-SSE1-NEXT: pushl %edi
59 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
60 ; X32-SSE1-NEXT: pushl %esi
61 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
62 ; X32-SSE1-NEXT: .cfi_offset %esi, -12
63 ; X32-SSE1-NEXT: .cfi_offset %edi, -8
64 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
65 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
66 ; X32-SSE1-NEXT: movl 8(%ecx), %edx
67 ; X32-SSE1-NEXT: movl 12(%ecx), %esi
68 ; X32-SSE1-NEXT: movl 16(%ecx), %edi
69 ; X32-SSE1-NEXT: movl 20(%ecx), %ecx
70 ; X32-SSE1-NEXT: movl %ecx, 12(%eax)
71 ; X32-SSE1-NEXT: movl %edi, 8(%eax)
72 ; X32-SSE1-NEXT: movl %esi, 4(%eax)
73 ; X32-SSE1-NEXT: movl %edx, (%eax)
74 ; X32-SSE1-NEXT: popl %esi
75 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
76 ; X32-SSE1-NEXT: popl %edi
77 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
78 ; X32-SSE1-NEXT: retl $4
80 ; X32-SSE41-LABEL: merge_2i64_i64_12:
82 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
83 ; X32-SSE41-NEXT: movups 8(%eax), %xmm0
84 ; X32-SSE41-NEXT: retl
85 %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
86 %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
87 %val0 = load i64, i64* %ptr0
88 %val1 = load i64, i64* %ptr1
89 %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
90 %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
; Builds a <4 x float> from four scalar loads of %ptr elements 2, 3, 4 and 5
; (byte offsets 8 through 20), placed in lanes 0 through 3 in that order.
; Every configuration visible here, including i686 +sse, expects a single
; unaligned 16-byte load at offset 8.
94 define <4 x float> @merge_4f32_f32_2345(float* %ptr) nounwind uwtable noinline ssp {
95 ; SSE-LABEL: merge_4f32_f32_2345:
97 ; SSE-NEXT: movups 8(%rdi), %xmm0
100 ; AVX-LABEL: merge_4f32_f32_2345:
102 ; AVX-NEXT: vmovups 8(%rdi), %xmm0
105 ; X32-SSE-LABEL: merge_4f32_f32_2345:
107 ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
108 ; X32-SSE-NEXT: movups 8(%eax), %xmm0
110 %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
111 %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
112 %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
113 %ptr3 = getelementptr inbounds float, float* %ptr, i64 5
114 %val0 = load float, float* %ptr0
115 %val1 = load float, float* %ptr1
116 %val2 = load float, float* %ptr2
117 %val3 = load float, float* %ptr3
118 %res0 = insertelement <4 x float> undef, float %val0, i32 0
119 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
120 %res2 = insertelement <4 x float> %res1, float %val2, i32 2
121 %res3 = insertelement <4 x float> %res2, float %val3, i32 3
122 ret <4 x float> %res3
; Builds a <4 x float> whose lane 0 is loaded from %ptr element 3 and whose
; lane 1 is the constant 0.0; lanes 2 and 3 are never written and stay undef.
; Every configuration visible here expects a single scalar movss/vmovss load,
; whose decoded operand comment shows the upper three lanes as zero.
125 define <4 x float> @merge_4f32_f32_3zuu(float* %ptr) nounwind uwtable noinline ssp {
126 ; SSE-LABEL: merge_4f32_f32_3zuu:
128 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
131 ; AVX-LABEL: merge_4f32_f32_3zuu:
133 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
136 ; X32-SSE-LABEL: merge_4f32_f32_3zuu:
138 ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
139 ; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
141 %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
142 %val0 = load float, float* %ptr0
143 %res0 = insertelement <4 x float> undef, float %val0, i32 0
144 %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
145 ret <4 x float> %res1
; Builds a <4 x float> whose lanes 0 and 1 are loaded from %ptr elements 3 and
; 4; lanes 2 and 3 are never written and stay undef.
; The x86_64 checks and the i686 +sse4.1 checks expect one 8-byte
; movsd/vmovsd load. The i686 +sse checks expect two movss loads combined
; with unpcklps.
148 define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline ssp {
149 ; SSE-LABEL: merge_4f32_f32_34uu:
151 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
154 ; AVX-LABEL: merge_4f32_f32_34uu:
156 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
159 ; X32-SSE1-LABEL: merge_4f32_f32_34uu:
161 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
162 ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
163 ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
164 ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
165 ; X32-SSE1-NEXT: retl
167 ; X32-SSE41-LABEL: merge_4f32_f32_34uu:
168 ; X32-SSE41: # %bb.0:
169 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
170 ; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
171 ; X32-SSE41-NEXT: retl
172 %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
173 %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
174 %val0 = load float, float* %ptr0
175 %val1 = load float, float* %ptr1
176 %res0 = insertelement <4 x float> undef, float %val0, i32 0
177 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
178 ret <4 x float> %res1
; Starts from a zero vector and inserts %ptr elements 3, 4 and 6 into lanes 0,
; 1 and 3; lane 2 is never written and therefore keeps its zero value.
; Where a load is checked separately, it is one unaligned 16-byte load at byte
; offset 12; lane 2 is then cleared against a zeroed register.
; The clearing step is a shufps pair (x86_64 +sse2 and i686 +sse) or a blendps
; (the +sse4.1 checks).
; The AVX checks expect vblendps taking the vector straight from memory.
181 define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline ssp {
182 ; SSE2-LABEL: merge_4f32_f32_34z6:
184 ; SSE2-NEXT: movups 12(%rdi), %xmm0
185 ; SSE2-NEXT: xorps %xmm1, %xmm1
186 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
187 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
190 ; SSE41-LABEL: merge_4f32_f32_34z6:
192 ; SSE41-NEXT: movups 12(%rdi), %xmm1
193 ; SSE41-NEXT: xorps %xmm0, %xmm0
194 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
197 ; AVX-LABEL: merge_4f32_f32_34z6:
199 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
200 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3]
203 ; X32-SSE1-LABEL: merge_4f32_f32_34z6:
205 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
206 ; X32-SSE1-NEXT: movups 12(%eax), %xmm0
207 ; X32-SSE1-NEXT: xorps %xmm1, %xmm1
208 ; X32-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
209 ; X32-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
210 ; X32-SSE1-NEXT: retl
212 ; X32-SSE41-LABEL: merge_4f32_f32_34z6:
213 ; X32-SSE41: # %bb.0:
214 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
215 ; X32-SSE41-NEXT: movups 12(%eax), %xmm1
216 ; X32-SSE41-NEXT: xorps %xmm0, %xmm0
217 ; X32-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
218 ; X32-SSE41-NEXT: retl
219 %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
220 %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
221 %ptr3 = getelementptr inbounds float, float* %ptr, i64 6
222 %val0 = load float, float* %ptr0
223 %val1 = load float, float* %ptr1
224 %val3 = load float, float* %ptr3
225 %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
226 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
227 %res3 = insertelement <4 x float> %res1, float %val3, i32 3
228 ret <4 x float> %res3
; Starts from a zero vector and inserts %ptr elements 4 and 5 into lanes 0 and
; 1; lanes 2 and 3 keep their zero value.
; The x86_64 checks and the i686 +sse4.1 checks expect one 8-byte
; movsd/vmovsd load, whose decoded operand comment shows the upper half as
; zero. The i686 +sse checks expect two movss loads, unpcklps, then movlhps
; with a zeroed register.
231 define <4 x float> @merge_4f32_f32_45zz(float* %ptr) nounwind uwtable noinline ssp {
232 ; SSE-LABEL: merge_4f32_f32_45zz:
234 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
237 ; AVX-LABEL: merge_4f32_f32_45zz:
239 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
242 ; X32-SSE1-LABEL: merge_4f32_f32_45zz:
244 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
245 ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
246 ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
247 ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
248 ; X32-SSE1-NEXT: xorps %xmm1, %xmm1
249 ; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
250 ; X32-SSE1-NEXT: retl
252 ; X32-SSE41-LABEL: merge_4f32_f32_45zz:
253 ; X32-SSE41: # %bb.0:
254 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
255 ; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
256 ; X32-SSE41-NEXT: retl
257 %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
258 %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
259 %val0 = load float, float* %ptr0
260 %val1 = load float, float* %ptr1
261 %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
262 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
263 ret <4 x float> %res1
; Builds a <4 x float> from %ptr elements 0, 1 and 2 in lanes 0, 1 and 2;
; lane 3 is explicitly set to undef.
; The visible checks do not expect a single 16-byte load here.
; Elements 0 and 1 come from an 8-byte movsd/vmovsd load (two movss loads plus
; unpcklps on i686 +sse).
; Element 2 is added by a separate movss + movlhps, or by insertps/vinsertps
; with a memory operand where SSE4.1 or AVX is enabled.
266 define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline ssp {
267 ; SSE2-LABEL: merge_4f32_f32_012u:
269 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
270 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
271 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
274 ; SSE41-LABEL: merge_4f32_f32_012u:
276 ; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
277 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
280 ; AVX-LABEL: merge_4f32_f32_012u:
282 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
283 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
286 ; X32-SSE1-LABEL: merge_4f32_f32_012u:
288 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
289 ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
290 ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
291 ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
292 ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
293 ; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
294 ; X32-SSE1-NEXT: retl
296 ; X32-SSE41-LABEL: merge_4f32_f32_012u:
297 ; X32-SSE41: # %bb.0:
298 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
299 ; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
300 ; X32-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
301 ; X32-SSE41-NEXT: retl
302 %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
303 %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
304 %ptr2 = getelementptr inbounds float, float* %ptr, i64 2
305 %val0 = load float, float* %ptr0
306 %val1 = load float, float* %ptr1
307 %val2 = load float, float* %ptr2
308 %res0 = insertelement <4 x float> undef, float %val0, i32 0
309 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
310 %res2 = insertelement <4 x float> %res1, float %val2, i32 2
311 %res3 = insertelement <4 x float> %res2, float undef, i32 3
312 ret <4 x float> %res3
; Builds a <4 x float> from %ptr elements 0, 1 and 9 in lanes 0, 1 and 2;
; lane 3 is explicitly set to undef. Element 9 is not adjacent to elements 0
; and 1, so only the first two loads are consecutive in memory.
; Elements 0 and 1 come from an 8-byte movsd/vmovsd load (two movss loads plus
; unpcklps on i686 +sse).
; Element 9 is added by a separate movss + movlhps, or by insertps/vinsertps
; with a memory operand where SSE4.1 or AVX is enabled.
315 define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline ssp {
316 ; SSE2-LABEL: merge_4f32_f32_019u:
318 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
319 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
320 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
323 ; SSE41-LABEL: merge_4f32_f32_019u:
325 ; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
326 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
329 ; AVX-LABEL: merge_4f32_f32_019u:
331 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
332 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
335 ; X32-SSE1-LABEL: merge_4f32_f32_019u:
337 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
338 ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
339 ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
340 ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
341 ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
342 ; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
343 ; X32-SSE1-NEXT: retl
345 ; X32-SSE41-LABEL: merge_4f32_f32_019u:
346 ; X32-SSE41: # %bb.0:
347 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
348 ; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
349 ; X32-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
350 ; X32-SSE41-NEXT: retl
351 %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
352 %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
353 %ptr2 = getelementptr inbounds float, float* %ptr, i64 9
354 %val0 = load float, float* %ptr0
355 %val1 = load float, float* %ptr1
356 %val2 = load float, float* %ptr2
357 %res0 = insertelement <4 x float> undef, float %val0, i32 0
358 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
359 %res2 = insertelement <4 x float> %res1, float %val2, i32 2
360 %res3 = insertelement <4 x float> %res2, float undef, i32 3
361 ret <4 x float> %res3
; Builds a <4 x i32> from %ptr elements 2, 3 and 5 in lanes 0, 1 and 3; lane 2
; is never written and stays undef, which leaves a one-element gap in memory.
; The x86_64 checks and the i686 +sse4.1 checks expect one unaligned 16-byte
; load at byte offset 8, covering the gap. The i686 +sse checks expect three
; 32-bit integer loads (offsets 8, 12, 20) copied out through a pointer taken
; from the stack.
364 define <4 x i32> @merge_4i32_i32_23u5(i32* %ptr) nounwind uwtable noinline ssp {
365 ; SSE-LABEL: merge_4i32_i32_23u5:
367 ; SSE-NEXT: movups 8(%rdi), %xmm0
370 ; AVX-LABEL: merge_4i32_i32_23u5:
372 ; AVX-NEXT: vmovups 8(%rdi), %xmm0
375 ; X32-SSE1-LABEL: merge_4i32_i32_23u5:
377 ; X32-SSE1-NEXT: pushl %esi
378 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
379 ; X32-SSE1-NEXT: .cfi_offset %esi, -8
380 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
381 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
382 ; X32-SSE1-NEXT: movl 8(%ecx), %edx
383 ; X32-SSE1-NEXT: movl 12(%ecx), %esi
384 ; X32-SSE1-NEXT: movl 20(%ecx), %ecx
385 ; X32-SSE1-NEXT: movl %esi, 4(%eax)
386 ; X32-SSE1-NEXT: movl %edx, (%eax)
387 ; X32-SSE1-NEXT: movl %ecx, 12(%eax)
388 ; X32-SSE1-NEXT: popl %esi
389 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
390 ; X32-SSE1-NEXT: retl $4
392 ; X32-SSE41-LABEL: merge_4i32_i32_23u5:
393 ; X32-SSE41: # %bb.0:
394 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
395 ; X32-SSE41-NEXT: movups 8(%eax), %xmm0
396 ; X32-SSE41-NEXT: retl
397 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
398 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
399 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
400 %val0 = load i32, i32* %ptr0
401 %val1 = load i32, i32* %ptr1
402 %val3 = load i32, i32* %ptr3
403 %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
404 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
405 %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
; Builds a <4 x i32> from %ptr elements 2, 3 and 5 in lanes 0, 1 and 3 (lane 2
; stays undef). It also stores element 2 plus one back to element 2, between
; the load of element 2 and the loads of elements 3 and 5.
; The x86_64 checks and the i686 +sse4.1 checks expect the 16-byte load at byte
; offset 8 first, then an in-memory incl of that same offset. The i686 +sse
; checks expect scalar loads with the increment done via leal and a movl store.
409 define <4 x i32> @merge_4i32_i32_23u5_inc2(i32* %ptr) nounwind uwtable noinline ssp {
410 ; SSE-LABEL: merge_4i32_i32_23u5_inc2:
412 ; SSE-NEXT: movups 8(%rdi), %xmm0
413 ; SSE-NEXT: incl 8(%rdi)
416 ; AVX-LABEL: merge_4i32_i32_23u5_inc2:
418 ; AVX-NEXT: vmovups 8(%rdi), %xmm0
419 ; AVX-NEXT: incl 8(%rdi)
422 ; X32-SSE1-LABEL: merge_4i32_i32_23u5_inc2:
424 ; X32-SSE1-NEXT: pushl %edi
425 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
426 ; X32-SSE1-NEXT: pushl %esi
427 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
428 ; X32-SSE1-NEXT: .cfi_offset %esi, -12
429 ; X32-SSE1-NEXT: .cfi_offset %edi, -8
430 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
431 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
432 ; X32-SSE1-NEXT: movl 8(%ecx), %edx
433 ; X32-SSE1-NEXT: movl 12(%ecx), %esi
434 ; X32-SSE1-NEXT: leal 1(%edx), %edi
435 ; X32-SSE1-NEXT: movl %edi, 8(%ecx)
436 ; X32-SSE1-NEXT: movl 20(%ecx), %ecx
437 ; X32-SSE1-NEXT: movl %esi, 4(%eax)
438 ; X32-SSE1-NEXT: movl %edx, (%eax)
439 ; X32-SSE1-NEXT: movl %ecx, 12(%eax)
440 ; X32-SSE1-NEXT: popl %esi
441 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
442 ; X32-SSE1-NEXT: popl %edi
443 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
444 ; X32-SSE1-NEXT: retl $4
446 ; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc2:
447 ; X32-SSE41: # %bb.0:
448 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
449 ; X32-SSE41-NEXT: movups 8(%eax), %xmm0
450 ; X32-SSE41-NEXT: incl 8(%eax)
451 ; X32-SSE41-NEXT: retl
452 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
453 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
454 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
455 %val0 = load i32, i32* %ptr0
; Store back to the first merged element; the vector still uses the value
; loaded before this store (%val0).
456 %inc = add i32 %val0, 1
457 store i32 %inc, i32* %ptr0
458 %val1 = load i32, i32* %ptr1
459 %val3 = load i32, i32* %ptr3
460 %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
461 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
462 %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
; Builds a <4 x i32> from %ptr elements 2, 3 and 5 in lanes 0, 1 and 3 (lane 2
; stays undef). It also stores element 3 plus one back to element 3, between
; the load of element 3 and the load of element 5.
; The x86_64 checks and the i686 +sse4.1 checks expect the 16-byte load at byte
; offset 8 first, then an in-memory incl at byte offset 12. The i686 +sse
; checks expect scalar loads with the increment done via leal and a movl store.
466 define <4 x i32> @merge_4i32_i32_23u5_inc3(i32* %ptr) nounwind uwtable noinline ssp {
467 ; SSE-LABEL: merge_4i32_i32_23u5_inc3:
469 ; SSE-NEXT: movups 8(%rdi), %xmm0
470 ; SSE-NEXT: incl 12(%rdi)
473 ; AVX-LABEL: merge_4i32_i32_23u5_inc3:
475 ; AVX-NEXT: vmovups 8(%rdi), %xmm0
476 ; AVX-NEXT: incl 12(%rdi)
479 ; X32-SSE1-LABEL: merge_4i32_i32_23u5_inc3:
481 ; X32-SSE1-NEXT: pushl %edi
482 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
483 ; X32-SSE1-NEXT: pushl %esi
484 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
485 ; X32-SSE1-NEXT: .cfi_offset %esi, -12
486 ; X32-SSE1-NEXT: .cfi_offset %edi, -8
487 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
488 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
489 ; X32-SSE1-NEXT: movl 8(%ecx), %edx
490 ; X32-SSE1-NEXT: movl 12(%ecx), %esi
491 ; X32-SSE1-NEXT: leal 1(%esi), %edi
492 ; X32-SSE1-NEXT: movl %edi, 12(%ecx)
493 ; X32-SSE1-NEXT: movl 20(%ecx), %ecx
494 ; X32-SSE1-NEXT: movl %esi, 4(%eax)
495 ; X32-SSE1-NEXT: movl %edx, (%eax)
496 ; X32-SSE1-NEXT: movl %ecx, 12(%eax)
497 ; X32-SSE1-NEXT: popl %esi
498 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
499 ; X32-SSE1-NEXT: popl %edi
500 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
501 ; X32-SSE1-NEXT: retl $4
503 ; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc3:
504 ; X32-SSE41: # %bb.0:
505 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
506 ; X32-SSE41-NEXT: movups 8(%eax), %xmm0
507 ; X32-SSE41-NEXT: incl 12(%eax)
508 ; X32-SSE41-NEXT: retl
509 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
510 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
511 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
512 %val0 = load i32, i32* %ptr0
513 %val1 = load i32, i32* %ptr1
; Store back to the second merged element; the vector still uses the value
; loaded before this store (%val1).
514 %inc = add i32 %val1, 1
515 store i32 %inc, i32* %ptr1
516 %val3 = load i32, i32* %ptr3
517 %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
518 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
519 %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
; Builds a <4 x i32> whose lane 0 is loaded from %ptr element 3 and whose lane
; 1 is the constant 0; lanes 2 and 3 are never written and stay undef.
; The x86_64 checks and the i686 +sse4.1 checks expect a single 4-byte
; movss/vmovss load. The i686 +sse checks expect a 32-bit integer load at byte
; offset 12 plus an immediate zero store for lane 1, written through a pointer
; taken from the stack.
523 define <4 x i32> @merge_4i32_i32_3zuu(i32* %ptr) nounwind uwtable noinline ssp {
524 ; SSE-LABEL: merge_4i32_i32_3zuu:
526 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
529 ; AVX-LABEL: merge_4i32_i32_3zuu:
531 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
534 ; X32-SSE1-LABEL: merge_4i32_i32_3zuu:
536 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
537 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
538 ; X32-SSE1-NEXT: movl 12(%ecx), %ecx
539 ; X32-SSE1-NEXT: movl %ecx, (%eax)
540 ; X32-SSE1-NEXT: movl $0, 4(%eax)
541 ; X32-SSE1-NEXT: retl $4
543 ; X32-SSE41-LABEL: merge_4i32_i32_3zuu:
544 ; X32-SSE41: # %bb.0:
545 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
546 ; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
547 ; X32-SSE41-NEXT: retl
548 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
549 %val0 = load i32, i32* %ptr0
550 %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
551 %res1 = insertelement <4 x i32> %res0, i32 0, i32 1
; Builds a <4 x i32> whose lanes 0 and 1 are loaded from %ptr elements 3 and 4;
; lanes 2 and 3 are never written and stay undef.
; The x86_64 checks and the i686 +sse4.1 checks expect one 8-byte
; movsd/vmovsd load. The i686 +sse checks expect two 32-bit integer loads
; (byte offsets 12 and 16) copied out through a pointer taken from the stack.
555 define <4 x i32> @merge_4i32_i32_34uu(i32* %ptr) nounwind uwtable noinline ssp {
556 ; SSE-LABEL: merge_4i32_i32_34uu:
558 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
561 ; AVX-LABEL: merge_4i32_i32_34uu:
563 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
566 ; X32-SSE1-LABEL: merge_4i32_i32_34uu:
568 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
569 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
570 ; X32-SSE1-NEXT: movl 12(%ecx), %edx
571 ; X32-SSE1-NEXT: movl 16(%ecx), %ecx
572 ; X32-SSE1-NEXT: movl %ecx, 4(%eax)
573 ; X32-SSE1-NEXT: movl %edx, (%eax)
574 ; X32-SSE1-NEXT: retl $4
576 ; X32-SSE41-LABEL: merge_4i32_i32_34uu:
577 ; X32-SSE41: # %bb.0:
578 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
579 ; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
580 ; X32-SSE41-NEXT: retl
581 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
582 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 4
583 %val0 = load i32, i32* %ptr0
584 %val1 = load i32, i32* %ptr1
585 %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
586 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
; Starts from a zero vector and inserts %ptr elements 4 and 5 into lanes 0 and
; 1; lanes 2 and 3 keep their zero value.
; The x86_64 checks and the i686 +sse4.1 checks expect one 8-byte
; movsd/vmovsd load. The i686 +sse checks expect two 32-bit integer loads
; (byte offsets 16 and 20) plus two immediate zero stores, all written through
; a pointer taken from the stack.
590 define <4 x i32> @merge_4i32_i32_45zz(i32* %ptr) nounwind uwtable noinline ssp {
591 ; SSE-LABEL: merge_4i32_i32_45zz:
593 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
596 ; AVX-LABEL: merge_4i32_i32_45zz:
598 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
601 ; X32-SSE1-LABEL: merge_4i32_i32_45zz:
603 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
604 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
605 ; X32-SSE1-NEXT: movl 16(%ecx), %edx
606 ; X32-SSE1-NEXT: movl 20(%ecx), %ecx
607 ; X32-SSE1-NEXT: movl %ecx, 4(%eax)
608 ; X32-SSE1-NEXT: movl %edx, (%eax)
609 ; X32-SSE1-NEXT: movl $0, 12(%eax)
610 ; X32-SSE1-NEXT: movl $0, 8(%eax)
611 ; X32-SSE1-NEXT: retl $4
613 ; X32-SSE41-LABEL: merge_4i32_i32_45zz:
614 ; X32-SSE41: # %bb.0:
615 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
616 ; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
617 ; X32-SSE41-NEXT: retl
618 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
619 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
620 %val0 = load i32, i32* %ptr0
621 %val1 = load i32, i32* %ptr1
622 %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
623 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
; Starts from a zero vector and inserts %ptr elements 4 and 5 into lanes 0 and
; 1 (lanes 2 and 3 stay zero). It also stores element 4 plus one back to
; element 4, between the load of element 4 and the load of element 5.
; The x86_64 checks and the i686 +sse4.1 checks expect the 8-byte movsd/vmovsd
; load first, then an in-memory incl at byte offset 16. The i686 +sse checks
; expect scalar loads with the increment done via leal and a movl store.
627 define <4 x i32> @merge_4i32_i32_45zz_inc4(i32* %ptr) nounwind uwtable noinline ssp {
628 ; SSE-LABEL: merge_4i32_i32_45zz_inc4:
630 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
631 ; SSE-NEXT: incl 16(%rdi)
634 ; AVX-LABEL: merge_4i32_i32_45zz_inc4:
636 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
637 ; AVX-NEXT: incl 16(%rdi)
640 ; X32-SSE1-LABEL: merge_4i32_i32_45zz_inc4:
642 ; X32-SSE1-NEXT: pushl %edi
643 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
644 ; X32-SSE1-NEXT: pushl %esi
645 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
646 ; X32-SSE1-NEXT: .cfi_offset %esi, -12
647 ; X32-SSE1-NEXT: .cfi_offset %edi, -8
648 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
649 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
650 ; X32-SSE1-NEXT: movl 16(%ecx), %edx
651 ; X32-SSE1-NEXT: movl 20(%ecx), %esi
652 ; X32-SSE1-NEXT: leal 1(%edx), %edi
653 ; X32-SSE1-NEXT: movl %edi, 16(%ecx)
654 ; X32-SSE1-NEXT: movl %esi, 4(%eax)
655 ; X32-SSE1-NEXT: movl %edx, (%eax)
656 ; X32-SSE1-NEXT: movl $0, 12(%eax)
657 ; X32-SSE1-NEXT: movl $0, 8(%eax)
658 ; X32-SSE1-NEXT: popl %esi
659 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
660 ; X32-SSE1-NEXT: popl %edi
661 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
662 ; X32-SSE1-NEXT: retl $4
664 ; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc4:
665 ; X32-SSE41: # %bb.0:
666 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
667 ; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
668 ; X32-SSE41-NEXT: incl 16(%eax)
669 ; X32-SSE41-NEXT: retl
670 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
671 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
672 %val0 = load i32, i32* %ptr0
; Store back to the first merged element; the vector still uses the value
; loaded before this store (%val0).
673 %inc = add i32 %val0, 1
674 store i32 %inc, i32* %ptr0
675 %val1 = load i32, i32* %ptr1
676 %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
677 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
; Starts from a zero vector and inserts %ptr elements 4 and 5 into lanes 0 and
; 1 (lanes 2 and 3 stay zero). After both loads it also stores element 5 plus
; one back to element 5.
; The x86_64 checks and the i686 +sse4.1 checks expect the 8-byte movsd/vmovsd
; load first, then an in-memory incl at byte offset 20. The i686 +sse checks
; expect scalar loads with the increment done via leal and a movl store.
681 define <4 x i32> @merge_4i32_i32_45zz_inc5(i32* %ptr) nounwind uwtable noinline ssp {
682 ; SSE-LABEL: merge_4i32_i32_45zz_inc5:
684 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
685 ; SSE-NEXT: incl 20(%rdi)
688 ; AVX-LABEL: merge_4i32_i32_45zz_inc5:
690 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
691 ; AVX-NEXT: incl 20(%rdi)
694 ; X32-SSE1-LABEL: merge_4i32_i32_45zz_inc5:
696 ; X32-SSE1-NEXT: pushl %edi
697 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
698 ; X32-SSE1-NEXT: pushl %esi
699 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
700 ; X32-SSE1-NEXT: .cfi_offset %esi, -12
701 ; X32-SSE1-NEXT: .cfi_offset %edi, -8
702 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
703 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
704 ; X32-SSE1-NEXT: movl 16(%ecx), %edx
705 ; X32-SSE1-NEXT: movl 20(%ecx), %esi
706 ; X32-SSE1-NEXT: leal 1(%esi), %edi
707 ; X32-SSE1-NEXT: movl %edi, 20(%ecx)
708 ; X32-SSE1-NEXT: movl %esi, 4(%eax)
709 ; X32-SSE1-NEXT: movl %edx, (%eax)
710 ; X32-SSE1-NEXT: movl $0, 12(%eax)
711 ; X32-SSE1-NEXT: movl $0, 8(%eax)
712 ; X32-SSE1-NEXT: popl %esi
713 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
714 ; X32-SSE1-NEXT: popl %edi
715 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
716 ; X32-SSE1-NEXT: retl $4
718 ; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc5:
719 ; X32-SSE41: # %bb.0:
720 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
721 ; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
722 ; X32-SSE41-NEXT: incl 20(%eax)
723 ; X32-SSE41-NEXT: retl
724 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
725 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
726 %val0 = load i32, i32* %ptr0
727 %val1 = load i32, i32* %ptr1
; Store back to the second merged element after both loads; the vector still
; uses the value loaded before this store (%val1).
728 %inc = add i32 %val1, 1
729 store i32 %inc, i32* %ptr1
730 %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
731 %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
; Builds an <8 x i16> from %ptr elements 2, 3, 5, 6, 7 and 9 in lanes 0, 1, 3,
; 4, 5 and 7; lanes 2 and 6 are never written and stay undef.
; The x86_64 checks and the i686 +sse4.1 checks expect one unaligned 16-byte
; load at byte offset 4, covering both gaps. The i686 +sse checks expect a mix
; of 32-bit and zero-extending 16-bit integer loads (offsets 4, 10, 14, 18)
; copied out through a pointer taken from the stack.
735 define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline ssp {
736 ; SSE-LABEL: merge_8i16_i16_23u567u9:
738 ; SSE-NEXT: movups 4(%rdi), %xmm0
741 ; AVX-LABEL: merge_8i16_i16_23u567u9:
743 ; AVX-NEXT: vmovups 4(%rdi), %xmm0
746 ; X32-SSE1-LABEL: merge_8i16_i16_23u567u9:
748 ; X32-SSE1-NEXT: pushl %edi
749 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
750 ; X32-SSE1-NEXT: pushl %esi
751 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
752 ; X32-SSE1-NEXT: .cfi_offset %esi, -12
753 ; X32-SSE1-NEXT: .cfi_offset %edi, -8
754 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
755 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
756 ; X32-SSE1-NEXT: movl 4(%ecx), %edx
757 ; X32-SSE1-NEXT: movl 10(%ecx), %esi
758 ; X32-SSE1-NEXT: movzwl 14(%ecx), %edi
759 ; X32-SSE1-NEXT: movzwl 18(%ecx), %ecx
760 ; X32-SSE1-NEXT: movw %di, 10(%eax)
761 ; X32-SSE1-NEXT: movw %cx, 14(%eax)
762 ; X32-SSE1-NEXT: movl %esi, 6(%eax)
763 ; X32-SSE1-NEXT: movl %edx, (%eax)
764 ; X32-SSE1-NEXT: popl %esi
765 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
766 ; X32-SSE1-NEXT: popl %edi
767 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
768 ; X32-SSE1-NEXT: retl $4
770 ; X32-SSE41-LABEL: merge_8i16_i16_23u567u9:
771 ; X32-SSE41: # %bb.0:
772 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
773 ; X32-SSE41-NEXT: movups 4(%eax), %xmm0
774 ; X32-SSE41-NEXT: retl
775 %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
776 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
777 %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 5
778 %ptr4 = getelementptr inbounds i16, i16* %ptr, i64 6
779 %ptr5 = getelementptr inbounds i16, i16* %ptr, i64 7
780 %ptr7 = getelementptr inbounds i16, i16* %ptr, i64 9
781 %val0 = load i16, i16* %ptr0
782 %val1 = load i16, i16* %ptr1
783 %val3 = load i16, i16* %ptr3
784 %val4 = load i16, i16* %ptr4
785 %val5 = load i16, i16* %ptr5
786 %val7 = load i16, i16* %ptr7
787 %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
788 %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
789 %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
790 %res4 = insertelement <8 x i16> %res3, i16 %val4, i32 4
791 %res5 = insertelement <8 x i16> %res4, i16 %val5, i32 5
792 %res7 = insertelement <8 x i16> %res5, i16 %val7, i32 7
; Builds an <8 x i16> whose lanes 0 and 1 are loaded from %ptr elements 3 and
; 4; the remaining six lanes are never written and stay undef.
; The x86_64 checks and the i686 +sse4.1 checks expect one 4-byte movss/vmovss
; load. The i686 +sse checks expect a single 32-bit integer load at byte
; offset 6, stored through a pointer taken from the stack.
796 define <8 x i16> @merge_8i16_i16_34uuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
797 ; SSE-LABEL: merge_8i16_i16_34uuuuuu:
799 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
802 ; AVX-LABEL: merge_8i16_i16_34uuuuuu:
804 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
807 ; X32-SSE1-LABEL: merge_8i16_i16_34uuuuuu:
809 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
810 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
811 ; X32-SSE1-NEXT: movl 6(%ecx), %ecx
812 ; X32-SSE1-NEXT: movl %ecx, (%eax)
813 ; X32-SSE1-NEXT: retl $4
815 ; X32-SSE41-LABEL: merge_8i16_i16_34uuuuuu:
816 ; X32-SSE41: # %bb.0:
817 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
818 ; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
819 ; X32-SSE41-NEXT: retl
820 %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 3
821 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 4
822 %val0 = load i16, i16* %ptr0
823 %val1 = load i16, i16* %ptr1
824 %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
825 %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
; Builds an <8 x i16> from %ptr elements 4, 5 and 7 in lanes 0, 1 and 3, then
; writes constant 0 into lanes 4 through 7; lane 2 is never written and stays
; undef.
; The x86_64 checks and the i686 +sse4.1 checks expect one 8-byte
; movsd/vmovsd load, whose decoded operand comment shows the upper half as
; zero. The i686 +sse checks expect a 32-bit load at byte offset 8, a
; zero-extending 16-bit load at offset 14, and two immediate zero stores for
; the upper half.
829 define <8 x i16> @merge_8i16_i16_45u7zzzz(i16* %ptr) nounwind uwtable noinline ssp {
830 ; SSE-LABEL: merge_8i16_i16_45u7zzzz:
832 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
835 ; AVX-LABEL: merge_8i16_i16_45u7zzzz:
837 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
840 ; X32-SSE1-LABEL: merge_8i16_i16_45u7zzzz:
842 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
843 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
844 ; X32-SSE1-NEXT: movl 8(%ecx), %edx
845 ; X32-SSE1-NEXT: movzwl 14(%ecx), %ecx
846 ; X32-SSE1-NEXT: movw %cx, 6(%eax)
847 ; X32-SSE1-NEXT: movl %edx, (%eax)
848 ; X32-SSE1-NEXT: movl $0, 12(%eax)
849 ; X32-SSE1-NEXT: movl $0, 8(%eax)
850 ; X32-SSE1-NEXT: retl $4
852 ; X32-SSE41-LABEL: merge_8i16_i16_45u7zzzz:
853 ; X32-SSE41: # %bb.0:
854 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
855 ; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
856 ; X32-SSE41-NEXT: retl
857 %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
858 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
859 %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
860 %val0 = load i16, i16* %ptr0
861 %val1 = load i16, i16* %ptr1
862 %val3 = load i16, i16* %ptr3
863 %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
864 %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
865 %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
866 %res4 = insertelement <8 x i16> %res3, i16 0, i32 4
867 %res5 = insertelement <8 x i16> %res4, i16 0, i32 5
868 %res6 = insertelement <8 x i16> %res5, i16 0, i32 6
869 %res7 = insertelement <8 x i16> %res6, i16 0, i32 7
; Builds a <16 x i8> from fourteen separate i8 loads at byte offsets
; 0, 1, 3..13 and 15 of %ptr. Lanes 2 and 14 are never inserted, so they
; stay undef (the 'u' positions in the function name line up with them).
; The checks below expect the x86-64 SSE/AVX runs and the 32-bit SSE4.1 run
; to fold everything into one unaligned 16-byte load, while the 32-bit
; SSE1-only run copies the pieces through integer registers and stores them
; through the pointer held in %eax.
; NOTE(review): that %eax pointer is presumably a hidden return slot, given
; the 'retl $4' - confirm against the i686 calling convention.
873 define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(i8* %ptr) nounwind uwtable noinline ssp {
874 ; SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
876 ; SSE-NEXT: movups (%rdi), %xmm0
879 ; AVX-LABEL: merge_16i8_i8_01u3456789ABCDuF:
881 ; AVX-NEXT: vmovups (%rdi), %xmm0
884 ; X32-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
886 ; X32-SSE1-NEXT: pushl %ebp
887 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
888 ; X32-SSE1-NEXT: pushl %ebx
889 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
890 ; X32-SSE1-NEXT: pushl %edi
891 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 16
892 ; X32-SSE1-NEXT: pushl %esi
893 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 20
894 ; X32-SSE1-NEXT: .cfi_offset %esi, -20
895 ; X32-SSE1-NEXT: .cfi_offset %edi, -16
896 ; X32-SSE1-NEXT: .cfi_offset %ebx, -12
897 ; X32-SSE1-NEXT: .cfi_offset %ebp, -8
898 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
899 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
900 ; X32-SSE1-NEXT: movzwl (%ecx), %ebp
901 ; X32-SSE1-NEXT: movl 3(%ecx), %esi
902 ; X32-SSE1-NEXT: movl 7(%ecx), %edi
903 ; X32-SSE1-NEXT: movzwl 11(%ecx), %ebx
904 ; X32-SSE1-NEXT: movb 13(%ecx), %dl
905 ; X32-SSE1-NEXT: movb 15(%ecx), %cl
906 ; X32-SSE1-NEXT: movb %dl, 13(%eax)
907 ; X32-SSE1-NEXT: movb %cl, 15(%eax)
908 ; X32-SSE1-NEXT: movw %bx, 11(%eax)
909 ; X32-SSE1-NEXT: movl %edi, 7(%eax)
910 ; X32-SSE1-NEXT: movl %esi, 3(%eax)
911 ; X32-SSE1-NEXT: movw %bp, (%eax)
912 ; X32-SSE1-NEXT: popl %esi
913 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 16
914 ; X32-SSE1-NEXT: popl %edi
915 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
916 ; X32-SSE1-NEXT: popl %ebx
917 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
918 ; X32-SSE1-NEXT: popl %ebp
919 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
920 ; X32-SSE1-NEXT: retl $4
922 ; X32-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
923 ; X32-SSE41: # %bb.0:
924 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
925 ; X32-SSE41-NEXT: movups (%eax), %xmm0
926 ; X32-SSE41-NEXT: retl
; Addresses of the loaded bytes; offsets 2 and 14 are intentionally absent.
927 %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
928 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
929 %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
930 %ptr4 = getelementptr inbounds i8, i8* %ptr, i64 4
931 %ptr5 = getelementptr inbounds i8, i8* %ptr, i64 5
932 %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6
933 %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7
934 %ptr8 = getelementptr inbounds i8, i8* %ptr, i64 8
935 %ptr9 = getelementptr inbounds i8, i8* %ptr, i64 9
936 %ptrA = getelementptr inbounds i8, i8* %ptr, i64 10
937 %ptrB = getelementptr inbounds i8, i8* %ptr, i64 11
938 %ptrC = getelementptr inbounds i8, i8* %ptr, i64 12
939 %ptrD = getelementptr inbounds i8, i8* %ptr, i64 13
940 %ptrF = getelementptr inbounds i8, i8* %ptr, i64 15
; One plain (non-volatile) i8 load per address.
941 %val0 = load i8, i8* %ptr0
942 %val1 = load i8, i8* %ptr1
943 %val3 = load i8, i8* %ptr3
944 %val4 = load i8, i8* %ptr4
945 %val5 = load i8, i8* %ptr5
946 %val6 = load i8, i8* %ptr6
947 %val7 = load i8, i8* %ptr7
948 %val8 = load i8, i8* %ptr8
949 %val9 = load i8, i8* %ptr9
950 %valA = load i8, i8* %ptrA
951 %valB = load i8, i8* %ptrB
952 %valC = load i8, i8* %ptrC
953 %valD = load i8, i8* %ptrD
954 %valF = load i8, i8* %ptrF
; Each byte goes into the lane equal to its source offset, starting from undef.
955 %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
956 %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
957 %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
958 %res4 = insertelement <16 x i8> %res3, i8 %val4, i32 4
959 %res5 = insertelement <16 x i8> %res4, i8 %val5, i32 5
960 %res6 = insertelement <16 x i8> %res5, i8 %val6, i32 6
961 %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
962 %res8 = insertelement <16 x i8> %res7, i8 %val8, i32 8
963 %res9 = insertelement <16 x i8> %res8, i8 %val9, i32 9
964 %resA = insertelement <16 x i8> %res9, i8 %valA, i32 10
965 %resB = insertelement <16 x i8> %resA, i8 %valB, i32 11
966 %resC = insertelement <16 x i8> %resB, i8 %valC, i32 12
967 %resD = insertelement <16 x i8> %resC, i8 %valD, i32 13
968 %resF = insertelement <16 x i8> %resD, i8 %valF, i32 15
; Builds a <16 x i8> from three i8 loads (byte offsets 0, 1 and 3 of %ptr)
; plus constant zero bytes in lanes 6, 7, 13, 14 and 15; every other lane is
; left undef. In the function name the 'z' positions line up with the zero
; lanes and the 'u' positions with the undef lanes.
; The checks expect the x86-64 SSE/AVX runs and the 32-bit SSE4.1 run to use
; a single scalar 4-byte load that zeroes the upper lanes, while the 32-bit
; SSE1-only run writes the loaded bytes and the zeros with integer stores.
972 define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
973 ; SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
975 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
978 ; AVX-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
980 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
983 ; X32-SSE1-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
985 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
986 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
987 ; X32-SSE1-NEXT: movzwl (%ecx), %edx
988 ; X32-SSE1-NEXT: movb 3(%ecx), %cl
989 ; X32-SSE1-NEXT: movb %cl, 3(%eax)
990 ; X32-SSE1-NEXT: movw %dx, (%eax)
991 ; X32-SSE1-NEXT: movb $0, 15(%eax)
992 ; X32-SSE1-NEXT: movw $0, 13(%eax)
993 ; X32-SSE1-NEXT: movw $0, 6(%eax)
994 ; X32-SSE1-NEXT: retl $4
996 ; X32-SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
997 ; X32-SSE41: # %bb.0:
998 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
999 ; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1000 ; X32-SSE41-NEXT: retl
; Only bytes 0, 1 and 3 are read from memory.
1001 %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
1002 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
1003 %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
1004 %val0 = load i8, i8* %ptr0
1005 %val1 = load i8, i8* %ptr1
1006 %val3 = load i8, i8* %ptr3
; Loaded bytes land in lanes 0, 1 and 3; the remaining inserts are literal zeros.
1007 %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
1008 %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
1009 %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
1010 %res6 = insertelement <16 x i8> %res3, i8 0, i32 6
1011 %res7 = insertelement <16 x i8> %res6, i8 0, i32 7
1012 %resD = insertelement <16 x i8> %res7, i8 0, i32 13
1013 %resE = insertelement <16 x i8> %resD, i8 0, i32 14
1014 %resF = insertelement <16 x i8> %resE, i8 0, i32 15
; Builds a <16 x i8> from six i8 loads (byte offsets 0-3, 6 and 7 of %ptr)
; plus constant zero bytes in lanes 13, 14 and 15; lanes 4, 5 and 8-12 are
; left undef.
; The checks expect the x86-64 SSE/AVX runs and the 32-bit SSE4.1 run to use
; a single scalar 8-byte load that zeroes the upper half, while the 32-bit
; SSE1-only run copies a dword and a word through integer registers and then
; stores the trailing zeros.
1018 define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
1019 ; SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1021 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1024 ; AVX-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1026 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1029 ; X32-SSE1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1030 ; X32-SSE1: # %bb.0:
1031 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1032 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1033 ; X32-SSE1-NEXT: movl (%ecx), %edx
1034 ; X32-SSE1-NEXT: movzwl 6(%ecx), %ecx
1035 ; X32-SSE1-NEXT: movw %cx, 6(%eax)
1036 ; X32-SSE1-NEXT: movl %edx, (%eax)
1037 ; X32-SSE1-NEXT: movb $0, 15(%eax)
1038 ; X32-SSE1-NEXT: movw $0, 13(%eax)
1039 ; X32-SSE1-NEXT: retl $4
1041 ; X32-SSE41-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
1042 ; X32-SSE41: # %bb.0:
1043 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1044 ; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1045 ; X32-SSE41-NEXT: retl
; Bytes 0-3 and 6-7 are read; offsets 4 and 5 are skipped.
1046 %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
1047 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
1048 %ptr2 = getelementptr inbounds i8, i8* %ptr, i64 2
1049 %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
1050 %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6
1051 %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7
1052 %val0 = load i8, i8* %ptr0
1053 %val1 = load i8, i8* %ptr1
1054 %val2 = load i8, i8* %ptr2
1055 %val3 = load i8, i8* %ptr3
1056 %val6 = load i8, i8* %ptr6
1057 %val7 = load i8, i8* %ptr7
; Loaded bytes keep their source offset as lane index; lanes 13-15 get literal zeros.
1058 %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
1059 %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
1060 %res2 = insertelement <16 x i8> %res1, i8 %val2, i32 2
1061 %res3 = insertelement <16 x i8> %res2, i8 %val3, i32 3
1062 %res6 = insertelement <16 x i8> %res3, i8 %val6, i32 6
1063 %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
1064 %resD = insertelement <16 x i8> %res7, i8 0, i32 13
1065 %resE = insertelement <16 x i8> %resD, i8 0, i32 14
1066 %resF = insertelement <16 x i8> %resE, i8 0, i32 15
; Loads one i32 from %src, splats it across a <4 x i32>, shifts it right by
; <0, undef, undef, undef>, masks with <-1, 0, 0, 0> so that only lane 0
; survives, and stores the result to %dst.
; The checks expect the x86-64 runs and the 32-bit SSE4.1 run to reduce this
; to one zero-extending scalar load followed by an aligned 16-byte store.
; The 32-bit SSE1-only run is expected to keep an explicit andps between two
; scalar loads before the store.
1070 define void @merge_4i32_i32_combine(<4 x i32>* %dst, i32* %src) {
1071 ; SSE-LABEL: merge_4i32_i32_combine:
1073 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1074 ; SSE-NEXT: movaps %xmm0, (%rdi)
1077 ; AVX-LABEL: merge_4i32_i32_combine:
1079 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1080 ; AVX-NEXT: vmovaps %xmm0, (%rdi)
1083 ; X32-SSE1-LABEL: merge_4i32_i32_combine:
1084 ; X32-SSE1: # %bb.0:
1085 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1086 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1087 ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1088 ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1089 ; X32-SSE1-NEXT: andps %xmm0, %xmm1
1090 ; X32-SSE1-NEXT: movaps %xmm1, (%eax)
1091 ; X32-SSE1-NEXT: retl
1093 ; X32-SSE41-LABEL: merge_4i32_i32_combine:
1094 ; X32-SSE41: # %bb.0:
1095 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1096 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
1097 ; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1098 ; X32-SSE41-NEXT: movaps %xmm0, (%eax)
1099 ; X32-SSE41-NEXT: retl
; Scalar load of element 0 of %src.
1100 %1 = getelementptr i32, i32* %src, i32 0
1101 %2 = load i32, i32* %1
; Insert into lane 0, then broadcast lane 0 to all four lanes (all-zero shuffle mask).
1102 %3 = insertelement <4 x i32> undef, i32 %2, i32 0
1103 %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
; Lane 0 is shifted by 0 (unchanged); the other shift amounts are undef.
1104 %5 = lshr <4 x i32> %4, <i32 0, i32 undef, i32 undef, i32 undef>
; Keep lane 0, force lanes 1-3 to zero.
1105 %6 = and <4 x i32> %5, <i32 -1, i32 0, i32 0, i32 0>
1106 store <4 x i32> %6, <4 x i32>* %dst
1111 ; Consecutive loads that include any volatile load must not be combined.
; Builds a <2 x i64> from two adjacent i64 loads (elements 1 and 2 of %ptr),
; both marked volatile.
; Unlike the non-volatile merge tests, the checks here expect the loads to
; stay separate: the x86-64 runs use two 8-byte scalar loads joined with
; movlhps, and the 32-bit runs move the value as four 4-byte pieces (integer
; movl pairs for SSE1-only, movd plus three pinsrd for SSE4.1).
1114 define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinline ssp {
1115 ; SSE-LABEL: merge_2i64_i64_12_volatile:
1117 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1118 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
1119 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1122 ; AVX-LABEL: merge_2i64_i64_12_volatile:
1124 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1125 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1126 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1129 ; X32-SSE1-LABEL: merge_2i64_i64_12_volatile:
1130 ; X32-SSE1: # %bb.0:
1131 ; X32-SSE1-NEXT: pushl %edi
1132 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
1133 ; X32-SSE1-NEXT: pushl %esi
1134 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
1135 ; X32-SSE1-NEXT: .cfi_offset %esi, -12
1136 ; X32-SSE1-NEXT: .cfi_offset %edi, -8
1137 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1138 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1139 ; X32-SSE1-NEXT: movl 8(%ecx), %edx
1140 ; X32-SSE1-NEXT: movl 12(%ecx), %esi
1141 ; X32-SSE1-NEXT: movl 16(%ecx), %edi
1142 ; X32-SSE1-NEXT: movl 20(%ecx), %ecx
1143 ; X32-SSE1-NEXT: movl %ecx, 12(%eax)
1144 ; X32-SSE1-NEXT: movl %edi, 8(%eax)
1145 ; X32-SSE1-NEXT: movl %esi, 4(%eax)
1146 ; X32-SSE1-NEXT: movl %edx, (%eax)
1147 ; X32-SSE1-NEXT: popl %esi
1148 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
1149 ; X32-SSE1-NEXT: popl %edi
1150 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
1151 ; X32-SSE1-NEXT: retl $4
1153 ; X32-SSE41-LABEL: merge_2i64_i64_12_volatile:
1154 ; X32-SSE41: # %bb.0:
1155 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1156 ; X32-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1157 ; X32-SSE41-NEXT: pinsrd $1, 12(%eax), %xmm0
1158 ; X32-SSE41-NEXT: pinsrd $2, 16(%eax), %xmm0
1159 ; X32-SSE41-NEXT: pinsrd $3, 20(%eax), %xmm0
1160 ; X32-SSE41-NEXT: retl
; Adjacent i64 elements 1 and 2 (byte offsets 8 and 16).
1161 %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
1162 %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
; Both loads are volatile, which is what this test is about.
1163 %val0 = load volatile i64, i64* %ptr0
1164 %val1 = load volatile i64, i64* %ptr1
1165 %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
1166 %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
; Builds a <4 x float> from four adjacent float loads (elements 2-5 of %ptr)
; where only the first load is volatile.
; The checks expect no single 16-byte vector load on any run. SSE2 expects
; two scalar loads for lanes 0-1 and one 8-byte load for lanes 2-3; the
; SSE4.1 and AVX runs expect one scalar load plus three insertps from
; memory; the 32-bit SSE1-only run expects four scalar loads combined with
; unpcklps and movlhps.
1170 define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable noinline ssp {
1171 ; SSE2-LABEL: merge_4f32_f32_2345_volatile:
1173 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1174 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1175 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1176 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
1177 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1180 ; SSE41-LABEL: merge_4f32_f32_2345_volatile:
1182 ; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1183 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1184 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1185 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1188 ; AVX-LABEL: merge_4f32_f32_2345_volatile:
1190 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1191 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1192 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1193 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1196 ; X32-SSE1-LABEL: merge_4f32_f32_2345_volatile:
1197 ; X32-SSE1: # %bb.0:
1198 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1199 ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1200 ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1201 ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1202 ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1203 ; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1204 ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1205 ; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1206 ; X32-SSE1-NEXT: retl
1208 ; X32-SSE41-LABEL: merge_4f32_f32_2345_volatile:
1209 ; X32-SSE41: # %bb.0:
1210 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1211 ; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1212 ; X32-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1213 ; X32-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1214 ; X32-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1215 ; X32-SSE41-NEXT: retl
; Four adjacent float elements, indices 2 through 5.
1216 %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
1217 %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
1218 %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
1219 %ptr3 = getelementptr inbounds float, float* %ptr, i64 5
; Only the first load is volatile; the other three are ordinary loads.
1220 %val0 = load volatile float, float* %ptr0
1221 %val1 = load float, float* %ptr1
1222 %val2 = load float, float* %ptr2
1223 %val3 = load float, float* %ptr3
1224 %res0 = insertelement <4 x float> undef, float %val0, i32 0
1225 %res1 = insertelement <4 x float> %res0, float %val1, i32 1
1226 %res2 = insertelement <4 x float> %res1, float %val2, i32 2
1227 %res3 = insertelement <4 x float> %res2, float %val3, i32 3
1228 ret <4 x float> %res3
1232 ; Non-consecutive test.
; Builds a <4 x float> from two unrelated pointers: lane 0 is the float at
; %ptr0, lane 1 is the constant 0.0, and lanes 2 and 3 both hold the float
; at %ptr1.
; Every run is expected to emit two zero-extending scalar loads followed by
; a single shufps (vshufps for AVX) that selects elements [0,1] of one
; register and [0,0] of the other.
1235 define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwtable noinline ssp {
1236 ; SSE-LABEL: merge_4f32_f32_X0YY:
1238 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1239 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1240 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
1243 ; AVX-LABEL: merge_4f32_f32_X0YY:
1245 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1246 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1247 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
1250 ; X32-SSE-LABEL: merge_4f32_f32_X0YY:
1252 ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1253 ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
1254 ; X32-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1255 ; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1256 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
1257 ; X32-SSE-NEXT: retl
; One float from each pointer argument, each with 4-byte alignment.
1258 %val0 = load float, float* %ptr0, align 4
1259 %val1 = load float, float* %ptr1, align 4
; Lane layout is { *ptr0, 0.0, *ptr1, *ptr1 }.
1260 %res0 = insertelement <4 x float> undef, float %val0, i32 0
1261 %res1 = insertelement <4 x float> %res0, float 0.000000e+00, i32 1
1262 %res2 = insertelement <4 x float> %res1, float %val1, i32 2
1263 %res3 = insertelement <4 x float> %res2, float %val1, i32 3
1264 ret <4 x float> %res3
1272 define <4 x i32> @load_i32_zext_i128_v4i32(i32* %ptr) {
1273 ; SSE-LABEL: load_i32_zext_i128_v4i32:
1275 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1278 ; AVX-LABEL: load_i32_zext_i128_v4i32:
1280 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1283 ; X32-SSE1-LABEL: load_i32_zext_i128_v4i32:
1284 ; X32-SSE1: # %bb.0:
1285 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1286 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1287 ; X32-SSE1-NEXT: movl (%ecx), %ecx
1288 ; X32-SSE1-NEXT: movl %ecx, (%eax)
1289 ; X32-SSE1-NEXT: movl $0, 12(%eax)
1290 ; X32-SSE1-NEXT: movl $0, 8(%eax)
1291 ; X32-SSE1-NEXT: movl $0, 4(%eax)
1292 ; X32-SSE1-NEXT: retl $4
1294 ; X32-SSE41-LABEL: load_i32_zext_i128_v4i32:
1295 ; X32-SSE41: # %bb.0:
1296 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1297 ; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1298 ; X32-SSE41-NEXT: retl
1299 %1 = load i32, i32* %ptr
1300 %2 = zext i32 %1 to i128
1301 %3 = bitcast i128 %2 to <4 x i32>