1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32
3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64
5 @g16 = external global i16
7 define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
10 ; X32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
13 ; X64-LABEL: pinsrd_1:
15 ; X64-NEXT: pinsrd $1, %edi, %xmm0
17 %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
21 define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
22 ; X32-LABEL: pinsrb_1:
24 ; X32-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
27 ; X64-LABEL: pinsrb_1:
29 ; X64-NEXT: pinsrb $1, %edi, %xmm0
31 %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
35 define <2 x i64> @pmovzxbq_1() nounwind {
36 ; X32-LABEL: pmovzxbq_1:
37 ; X32: ## BB#0: ## %entry
38 ; X32-NEXT: movl L_g16$non_lazy_ptr, %eax
39 ; X32-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
42 ; X64-LABEL: pmovzxbq_1:
43 ; X64: ## BB#0: ## %entry
44 ; X64-NEXT: movq _g16@{{.*}}(%rip), %rax
45 ; X64-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
48 %0 = load i16, i16* @g16, align 2 ; <i16> [#uses=1]
49 %1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1]
50 %2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1]
51 %3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1]
55 declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
57 define i32 @extractps_1(<4 x float> %v) nounwind {
58 ; X32-LABEL: extractps_1:
60 ; X32-NEXT: extractps $3, %xmm0, %eax
63 ; X64-LABEL: extractps_1:
65 ; X64-NEXT: extractps $3, %xmm0, %eax
67 %s = extractelement <4 x float> %v, i32 3
68 %i = bitcast float %s to i32
71 define i32 @extractps_2(<4 x float> %v) nounwind {
72 ; X32-LABEL: extractps_2:
74 ; X32-NEXT: extractps $3, %xmm0, %eax
77 ; X64-LABEL: extractps_2:
79 ; X64-NEXT: extractps $3, %xmm0, %eax
81 %t = bitcast <4 x float> %v to <4 x i32>
82 %s = extractelement <4 x i32> %t, i32 3
87 ; The non-store form of extractps puts its result into a GPR.
88 ; This makes it suitable for an extract from a <4 x float> that
89 ; is bitcasted to i32, but unsuitable for much of anything else.
91 define float @ext_1(<4 x float> %v) nounwind {
94 ; X32-NEXT: pushl %eax
95 ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
96 ; X32-NEXT: addss LCPI5_0, %xmm0
97 ; X32-NEXT: movss %xmm0, (%esp)
98 ; X32-NEXT: flds (%esp)
104 ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
105 ; X64-NEXT: addss {{.*}}(%rip), %xmm0
107 %s = extractelement <4 x float> %v, i32 3
108 %t = fadd float %s, 1.0
112 define float @ext_2(<4 x float> %v) nounwind {
115 ; X32-NEXT: pushl %eax
116 ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
117 ; X32-NEXT: movss %xmm0, (%esp)
118 ; X32-NEXT: flds (%esp)
119 ; X32-NEXT: popl %eax
124 ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
126 %s = extractelement <4 x float> %v, i32 3
130 define i32 @ext_3(<4 x i32> %v) nounwind {
133 ; X32-NEXT: extractps $3, %xmm0, %eax
138 ; X64-NEXT: extractps $3, %xmm0, %eax
140 %i = extractelement <4 x i32> %v, i32 3
144 define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
145 ; X32-LABEL: insertps_1:
147 ; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
150 ; X64-LABEL: insertps_1:
152 ; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
154 %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 21) nounwind readnone
155 ret <4 x float> %tmp1
158 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
160 ; When optimizing for speed, prefer blendps over insertps even if it means we have to
161 ; generate a separate movss to load the scalar operand.
162 define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
163 ; X32-LABEL: blendps_not_insertps_1:
165 ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
166 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
169 ; X64-LABEL: blendps_not_insertps_1:
171 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
173 %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
174 ret <4 x float> %tmp1
177 ; When optimizing for size, generate an insertps if there's a load fold opportunity.
178 ; The difference between i386 and x86-64 ABIs for the float operand means we should
179 ; generate an insertps for X32 but not for X64!
180 define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
181 ; X32-LABEL: insertps_or_blendps:
183 ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
184 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
187 ; X64-LABEL: insertps_or_blendps:
189 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
191 %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
192 ret <4 x float> %tmp1
195 ; An insert into the low 32-bits of a vector from the low 32-bits of another vector
196 ; is always just a blendps because blendps is never more expensive than insertps.
197 define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
198 ; X32-LABEL: blendps_not_insertps_2:
200 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
203 ; X64-LABEL: blendps_not_insertps_2:
205 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
207 %tmp2 = extractelement <4 x float> %t2, i32 0
208 %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
209 ret <4 x float> %tmp1
212 define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
213 ; X32-LABEL: ptestz_1:
215 ; X32-NEXT: xorl %eax, %eax
216 ; X32-NEXT: ptest %xmm1, %xmm0
220 ; X64-LABEL: ptestz_1:
222 ; X64-NEXT: xorl %eax, %eax
223 ; X64-NEXT: ptest %xmm1, %xmm0
226 %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
230 define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
231 ; X32-LABEL: ptestz_2:
233 ; X32-NEXT: xorl %eax, %eax
234 ; X32-NEXT: ptest %xmm1, %xmm0
238 ; X64-LABEL: ptestz_2:
240 ; X64-NEXT: xorl %eax, %eax
241 ; X64-NEXT: ptest %xmm1, %xmm0
244 %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
248 define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
249 ; X32-LABEL: ptestz_3:
251 ; X32-NEXT: xorl %eax, %eax
252 ; X32-NEXT: ptest %xmm1, %xmm0
256 ; X64-LABEL: ptestz_3:
258 ; X64-NEXT: xorl %eax, %eax
259 ; X64-NEXT: ptest %xmm1, %xmm0
262 %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
266 declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
267 declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
268 declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
270 ; This used to compile to insertps $0 + insertps $16. insertps $0 is always
272 define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
273 ; X32-LABEL: buildvector:
274 ; X32: ## BB#0: ## %entry
275 ; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
276 ; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
277 ; X32-NEXT: addss %xmm2, %xmm3
278 ; X32-NEXT: addss %xmm1, %xmm0
279 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
282 ; X64-LABEL: buildvector:
283 ; X64: ## BB#0: ## %entry
284 ; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
285 ; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
286 ; X64-NEXT: addss %xmm2, %xmm3
287 ; X64-NEXT: addss %xmm1, %xmm0
288 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
291 %tmp7 = extractelement <2 x float> %A, i32 0
292 %tmp5 = extractelement <2 x float> %A, i32 1
293 %tmp3 = extractelement <2 x float> %B, i32 0
294 %tmp1 = extractelement <2 x float> %B, i32 1
295 %add.r = fadd float %tmp7, %tmp3
296 %add.i = fadd float %tmp5, %tmp1
297 %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
298 %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
299 ret <2 x float> %tmp9
302 define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
303 ; X32-LABEL: insertps_from_shufflevector_1:
304 ; X32: ## BB#0: ## %entry
305 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
306 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
309 ; X64-LABEL: insertps_from_shufflevector_1:
310 ; X64: ## BB#0: ## %entry
311 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
314 %0 = load <4 x float>, <4 x float>* %pb, align 16
315 %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
316 ret <4 x float> %vecinit6
319 define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
320 ; X32-LABEL: insertps_from_shufflevector_2:
321 ; X32: ## BB#0: ## %entry
322 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
325 ; X64-LABEL: insertps_from_shufflevector_2:
326 ; X64: ## BB#0: ## %entry
327 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
330 %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
331 ret <4 x float> %vecinit6
334 ; For loading an i32 from memory into an xmm register we use pinsrd
335 ; instead of insertps
336 define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
337 ; X32-LABEL: pinsrd_from_shufflevector_i32:
338 ; X32: ## BB#0: ## %entry
339 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
340 ; X32-NEXT: pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
341 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
344 ; X64-LABEL: pinsrd_from_shufflevector_i32:
345 ; X64: ## BB#0: ## %entry
346 ; X64-NEXT: pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
347 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
350 %0 = load <4 x i32>, <4 x i32>* %pb, align 16
351 %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
352 ret <4 x i32> %vecinit6
355 define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
356 ; X32-LABEL: insertps_from_shufflevector_i32_2:
357 ; X32: ## BB#0: ## %entry
358 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
359 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
362 ; X64-LABEL: insertps_from_shufflevector_i32_2:
363 ; X64: ## BB#0: ## %entry
364 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
365 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
368 %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
369 ret <4 x i32> %vecinit6
372 define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
373 ; X32-LABEL: insertps_from_load_ins_elt_undef:
375 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
376 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
379 ; X64-LABEL: insertps_from_load_ins_elt_undef:
381 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
383 %1 = load float, float* %b, align 4
384 %2 = insertelement <4 x float> undef, float %1, i32 0
385 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
386 ret <4 x float> %result
389 ; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
390 define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
391 ; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
393 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
394 ; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
395 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
396 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
399 ; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
401 ; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
402 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
403 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
405 %1 = load i32, i32* %b, align 4
406 %2 = insertelement <4 x i32> undef, i32 %1, i32 0
407 %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
408 ret <4 x i32> %result
411 ;;;;;; Shuffles optimizable with a single insertps or blend instruction
412 define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
413 ; X32-LABEL: shuf_XYZ0:
415 ; X32-NEXT: xorps %xmm1, %xmm1
416 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
419 ; X64-LABEL: shuf_XYZ0:
421 ; X64-NEXT: xorps %xmm1, %xmm1
422 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
424 %vecext = extractelement <4 x float> %x, i32 0
425 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
426 %vecext1 = extractelement <4 x float> %x, i32 1
427 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
428 %vecext3 = extractelement <4 x float> %x, i32 2
429 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
430 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
431 ret <4 x float> %vecinit5
434 define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
435 ; X32-LABEL: shuf_XY00:
437 ; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
440 ; X64-LABEL: shuf_XY00:
442 ; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
444 %vecext = extractelement <4 x float> %x, i32 0
445 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
446 %vecext1 = extractelement <4 x float> %x, i32 1
447 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
448 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
449 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
450 ret <4 x float> %vecinit4
453 define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
454 ; X32-LABEL: shuf_XYY0:
456 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
459 ; X64-LABEL: shuf_XYY0:
461 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
463 %vecext = extractelement <4 x float> %x, i32 0
464 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
465 %vecext1 = extractelement <4 x float> %x, i32 1
466 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
467 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
468 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
469 ret <4 x float> %vecinit5
472 define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
473 ; X32-LABEL: shuf_XYW0:
475 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
478 ; X64-LABEL: shuf_XYW0:
480 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
482 %vecext = extractelement <4 x float> %x, i32 0
483 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
484 %vecext1 = extractelement <4 x float> %x, i32 1
485 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
486 %vecext2 = extractelement <4 x float> %x, i32 3
487 %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
488 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
489 ret <4 x float> %vecinit4
492 define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
493 ; X32-LABEL: shuf_W00W:
495 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
498 ; X64-LABEL: shuf_W00W:
500 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
502 %vecext = extractelement <4 x float> %x, i32 3
503 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
504 %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
505 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
506 %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
507 ret <4 x float> %vecinit4
510 define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
511 ; X32-LABEL: shuf_X00A:
513 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
516 ; X64-LABEL: shuf_X00A:
518 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
520 %vecext = extractelement <4 x float> %x, i32 0
521 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
522 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
523 %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
524 %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
525 ret <4 x float> %vecinit4
528 define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
529 ; X32-LABEL: shuf_X00X:
531 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
534 ; X64-LABEL: shuf_X00X:
536 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
538 %vecext = extractelement <4 x float> %x, i32 0
539 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
540 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
541 %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
542 %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
543 ret <4 x float> %vecinit4
546 define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
547 ; X32-LABEL: shuf_X0YC:
549 ; X32-NEXT: xorps %xmm2, %xmm2
550 ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
551 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
554 ; X64-LABEL: shuf_X0YC:
556 ; X64-NEXT: xorps %xmm2, %xmm2
557 ; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
558 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
560 %vecext = extractelement <4 x float> %x, i32 0
561 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
562 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
563 %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
564 %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
565 ret <4 x float> %vecinit5
568 define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
569 ; X32-LABEL: i32_shuf_XYZ0:
571 ; X32-NEXT: pxor %xmm1, %xmm1
572 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
575 ; X64-LABEL: i32_shuf_XYZ0:
577 ; X64-NEXT: pxor %xmm1, %xmm1
578 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
580 %vecext = extractelement <4 x i32> %x, i32 0
581 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
582 %vecext1 = extractelement <4 x i32> %x, i32 1
583 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
584 %vecext3 = extractelement <4 x i32> %x, i32 2
585 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
586 %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
587 ret <4 x i32> %vecinit5
590 define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
591 ; X32-LABEL: i32_shuf_XY00:
593 ; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
596 ; X64-LABEL: i32_shuf_XY00:
598 ; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
600 %vecext = extractelement <4 x i32> %x, i32 0
601 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
602 %vecext1 = extractelement <4 x i32> %x, i32 1
603 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
604 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
605 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
606 ret <4 x i32> %vecinit4
609 define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
610 ; X32-LABEL: i32_shuf_XYY0:
612 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
613 ; X32-NEXT: pxor %xmm0, %xmm0
614 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
617 ; X64-LABEL: i32_shuf_XYY0:
619 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
620 ; X64-NEXT: pxor %xmm0, %xmm0
621 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
623 %vecext = extractelement <4 x i32> %x, i32 0
624 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
625 %vecext1 = extractelement <4 x i32> %x, i32 1
626 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
627 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
628 %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
629 ret <4 x i32> %vecinit5
632 define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
633 ; X32-LABEL: i32_shuf_XYW0:
635 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
636 ; X32-NEXT: pxor %xmm0, %xmm0
637 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
640 ; X64-LABEL: i32_shuf_XYW0:
642 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
643 ; X64-NEXT: pxor %xmm0, %xmm0
644 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
646 %vecext = extractelement <4 x i32> %x, i32 0
647 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
648 %vecext1 = extractelement <4 x i32> %x, i32 1
649 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
650 %vecext2 = extractelement <4 x i32> %x, i32 3
651 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
652 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
653 ret <4 x i32> %vecinit4
656 define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
657 ; X32-LABEL: i32_shuf_W00W:
659 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
660 ; X32-NEXT: pxor %xmm0, %xmm0
661 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
664 ; X64-LABEL: i32_shuf_W00W:
666 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
667 ; X64-NEXT: pxor %xmm0, %xmm0
668 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
670 %vecext = extractelement <4 x i32> %x, i32 3
671 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
672 %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
673 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
674 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
675 ret <4 x i32> %vecinit4
678 define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
679 ; X32-LABEL: i32_shuf_X00A:
681 ; X32-NEXT: pxor %xmm2, %xmm2
682 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
683 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
684 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
687 ; X64-LABEL: i32_shuf_X00A:
689 ; X64-NEXT: pxor %xmm2, %xmm2
690 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
691 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
692 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
694 %vecext = extractelement <4 x i32> %x, i32 0
695 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
696 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
697 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
698 %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
699 ret <4 x i32> %vecinit4
702 define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
703 ; X32-LABEL: i32_shuf_X00X:
705 ; X32-NEXT: pxor %xmm1, %xmm1
706 ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
707 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
710 ; X64-LABEL: i32_shuf_X00X:
712 ; X64-NEXT: pxor %xmm1, %xmm1
713 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
714 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
716 %vecext = extractelement <4 x i32> %x, i32 0
717 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
718 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
719 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
720 %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
721 ret <4 x i32> %vecinit4
724 define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
725 ; X32-LABEL: i32_shuf_X0YC:
727 ; X32-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
728 ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
729 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
732 ; X64-LABEL: i32_shuf_X0YC:
734 ; X64-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
735 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
736 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
738 %vecext = extractelement <4 x i32> %x, i32 0
739 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
740 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
741 %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
742 %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
743 ret <4 x i32> %vecinit5
746 ;; Test for a bug in the first implementation of LowerBuildVectorv4x32
747 define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
748 ; X32-LABEL: test_insertps_no_undef:
750 ; X32-NEXT: xorps %xmm1, %xmm1
751 ; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
752 ; X32-NEXT: maxps %xmm1, %xmm0
755 ; X64-LABEL: test_insertps_no_undef:
757 ; X64-NEXT: xorps %xmm1, %xmm1
758 ; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
759 ; X64-NEXT: maxps %xmm1, %xmm0
761 %vecext = extractelement <4 x float> %x, i32 0
762 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
763 %vecext1 = extractelement <4 x float> %x, i32 1
764 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
765 %vecext3 = extractelement <4 x float> %x, i32 2
766 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
767 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
768 %mask = fcmp olt <4 x float> %vecinit5, %x
769 %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
773 define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
774 ; X32-LABEL: blendvb_fallback:
776 ; X32-NEXT: psllw $15, %xmm0
777 ; X32-NEXT: psraw $15, %xmm0
778 ; X32-NEXT: pblendvb %xmm0, %xmm1, %xmm2
779 ; X32-NEXT: movdqa %xmm2, %xmm0
782 ; X64-LABEL: blendvb_fallback:
784 ; X64-NEXT: psllw $15, %xmm0
785 ; X64-NEXT: psraw $15, %xmm0
786 ; X64-NEXT: pblendvb %xmm0, %xmm1, %xmm2
787 ; X64-NEXT: movdqa %xmm2, %xmm0
789 %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
793 ; On X32, account for the argument's move to registers
794 define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
795 ; X32-LABEL: insertps_from_vector_load:
797 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
798 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
801 ; X64-LABEL: insertps_from_vector_load:
803 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
805 %1 = load <4 x float>, <4 x float>* %pb, align 16
806 %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
810 ;; Use a non-zero CountS for insertps
811 ;; Try to match a bit more of the instr, since we need the load's offset.
812 define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
813 ; X32-LABEL: insertps_from_vector_load_offset:
815 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
816 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
819 ; X64-LABEL: insertps_from_vector_load_offset:
821 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
823 %1 = load <4 x float>, <4 x float>* %pb, align 16
824 %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
828 ;; Try to match a bit more of the instr, since we need the load's offset.
829 define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
830 ; X32-LABEL: insertps_from_vector_load_offset_2:
832 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
833 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
834 ; X32-NEXT: shll $4, %ecx
835 ; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
838 ; X64-LABEL: insertps_from_vector_load_offset_2:
840 ; X64-NEXT: shlq $4, %rsi
841 ; X64-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
843 %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
844 %2 = load <4 x float>, <4 x float>* %1, align 16
845 %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
849 define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
850 ; X32-LABEL: insertps_from_broadcast_loadf32:
852 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
853 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
854 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
857 ; X64-LABEL: insertps_from_broadcast_loadf32:
859 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
861 %1 = getelementptr inbounds float, float* %fb, i64 %index
862 %2 = load float, float* %1, align 4
863 %3 = insertelement <4 x float> undef, float %2, i32 0
864 %4 = insertelement <4 x float> %3, float %2, i32 1
865 %5 = insertelement <4 x float> %4, float %2, i32 2
866 %6 = insertelement <4 x float> %5, float %2, i32 3
867 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
871 define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
872 ; X32-LABEL: insertps_from_broadcast_loadv4f32:
874 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
875 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
878 ; X64-LABEL: insertps_from_broadcast_loadv4f32:
880 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
882 %1 = load <4 x float>, <4 x float>* %b, align 4
883 %2 = extractelement <4 x float> %1, i32 0
884 %3 = insertelement <4 x float> undef, float %2, i32 0
885 %4 = insertelement <4 x float> %3, float %2, i32 1
886 %5 = insertelement <4 x float> %4, float %2, i32 2
887 %6 = insertelement <4 x float> %5, float %2, i32 3
888 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
892 define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
893 ; X32-LABEL: insertps_from_broadcast_multiple_use:
895 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
896 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
897 ; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
898 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
899 ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
900 ; X32-NEXT: addps %xmm1, %xmm0
901 ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
902 ; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
903 ; X32-NEXT: addps %xmm2, %xmm3
904 ; X32-NEXT: addps %xmm3, %xmm0
907 ; X64-LABEL: insertps_from_broadcast_multiple_use:
909 ; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
910 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
911 ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
912 ; X64-NEXT: addps %xmm1, %xmm0
913 ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
914 ; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
915 ; X64-NEXT: addps %xmm2, %xmm3
916 ; X64-NEXT: addps %xmm3, %xmm0
918 %1 = getelementptr inbounds float, float* %fb, i64 %index
919 %2 = load float, float* %1, align 4
920 %3 = insertelement <4 x float> undef, float %2, i32 0
921 %4 = insertelement <4 x float> %3, float %2, i32 1
922 %5 = insertelement <4 x float> %4, float %2, i32 2
923 %6 = insertelement <4 x float> %5, float %2, i32 3
924 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
925 %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
926 %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
927 %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
928 %11 = fadd <4 x float> %7, %8
929 %12 = fadd <4 x float> %9, %10
930 %13 = fadd <4 x float> %11, %12
; Shuffle mask <4, undef, 0, 7> places the loaded scalar in lane 0 and %a[0]
; in lane 2; both targets lower this to movss + movlhps (with a final movaps
; to return the result in xmm0) rather than to an insertps.
934 define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
935 ; X32-LABEL: insertps_with_undefs:
937 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
938 ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
939 ; X32-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
940 ; X32-NEXT: movaps %xmm1, %xmm0
943 ; X64-LABEL: insertps_with_undefs:
945 ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
946 ; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
947 ; X64-NEXT: movaps %xmm1, %xmm0
949 %1 = load float, float* %b, align 4
950 %2 = insertelement <4 x float> undef, float %1, i32 0
951 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
952 ret <4 x float> %result
955 ; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
956 ; the destination index to change the load, instead of the source index.
; Both targets must fold the load into a single insertps that zeroes lane 1
; and takes a memory element for lane 3. NOTE(review): the CHECK shows mem[0]
; even though the mask selects %load element 2 — presumably the load is
; narrowed to just the selected element, so mem[0] is that element; confirm
; against the generated assembly if regenerating.
957 define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
958 ; X32-LABEL: pr20087:
960 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
961 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
964 ; X64-LABEL: pr20087:
966 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
968 %load = load <4 x float> , <4 x float> *%ptr
969 %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
973 ; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
; The i32 shuffle stays in the integer domain: pshufd rotates the second
; operand so its element 3 lines up with lane 1, pblendw merges it into the
; first operand, and the align-4 store is emitted as an unaligned movdqu.
974 define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
975 ; X32-LABEL: insertps_pr20411:
977 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
978 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
979 ; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
980 ; X32-NEXT: movdqu %xmm1, (%eax)
983 ; X64-LABEL: insertps_pr20411:
985 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
986 ; X64-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
987 ; X64-NEXT: movdqu %xmm1, (%rdi)
989 %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef>
990 %ptrcast = bitcast i32* %RET to <4 x i32>*
991 store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
; Build-vector <A[0], 0.0, B[2], 0.0>: the entire insert chain should fold
; into one insertps (B[2] into lane 2, zero-mask clearing lanes 1 and 3).
995 define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
996 ; X32-LABEL: insertps_4:
998 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
1001 ; X64-LABEL: insertps_4:
1003 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
1005 %vecext = extractelement <4 x float> %A, i32 0
1006 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1007 %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
1008 %vecext2 = extractelement <4 x float> %B, i32 2
1009 %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
1010 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1011 ret <4 x float> %vecinit4
; Build-vector <A[0], B[1], 0.0, 0.0>: folds to one insertps
; (B[1] into lane 1, zero-mask clearing lanes 2 and 3).
1014 define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
1015 ; X32-LABEL: insertps_5:
1017 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
1020 ; X64-LABEL: insertps_5:
1022 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
1024 %vecext = extractelement <4 x float> %A, i32 0
1025 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1026 %vecext1 = extractelement <4 x float> %B, i32 1
1027 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
1028 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
1029 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1030 ret <4 x float> %vecinit4
; Build-vector <0.0, A[1], B[2], 0.0>: folds to one insertps that keeps
; A[1] in place, inserts B[2] into lane 2, and zeroes lanes 0 and 3.
1033 define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
1034 ; X32-LABEL: insertps_6:
1036 ; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
1039 ; X64-LABEL: insertps_6:
1041 ; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
1043 %vecext = extractelement <4 x float> %A, i32 1
1044 %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
1045 %vecext1 = extractelement <4 x float> %B, i32 2
1046 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
1047 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
1048 ret <4 x float> %vecinit3
; Build-vector <A[0], 0.0, B[1], 0.0>: folds to one insertps
; (B[1] into lane 2, zero-mask clearing lanes 1 and 3).
1051 define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
1052 ; X32-LABEL: insertps_7:
1054 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
1057 ; X64-LABEL: insertps_7:
1059 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
1061 %vecext = extractelement <4 x float> %A, i32 0
1062 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1063 %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
1064 %vecext2 = extractelement <4 x float> %B, i32 1
1065 %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
1066 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1067 ret <4 x float> %vecinit4
; Build-vector <A[0], B[0], 0.0, 0.0>: folds to one insertps
; (B[0] into lane 1, zero-mask clearing lanes 2 and 3).
1070 define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
1071 ; X32-LABEL: insertps_8:
1073 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1076 ; X64-LABEL: insertps_8:
1078 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1080 %vecext = extractelement <4 x float> %A, i32 0
1081 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1082 %vecext1 = extractelement <4 x float> %B, i32 0
1083 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
1084 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
1085 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1086 ret <4 x float> %vecinit4
; Build-vector <0.0, A[0], B[2], 0.0>: here the insertps is performed into
; xmm1 (inserting A[0] into its lane 1, keeping B[2], zeroing lanes 0 and 3),
; so an extra movaps is needed to return the result in xmm0.
1089 define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
1090 ; X32-LABEL: insertps_9:
1092 ; X32-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
1093 ; X32-NEXT: movaps %xmm1, %xmm0
1096 ; X64-LABEL: insertps_9:
1098 ; X64-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
1099 ; X64-NEXT: movaps %xmm1, %xmm0
1101 %vecext = extractelement <4 x float> %A, i32 0
1102 %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
1103 %vecext1 = extractelement <4 x float> %B, i32 2
1104 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
1105 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
1106 ret <4 x float> %vecinit3
; A[0] placed in lanes 0 and 2 of an otherwise-zero vector: folds to a single
; insertps that uses xmm0 as both source and destination.
1109 define <4 x float> @insertps_10(<4 x float> %A) {
1110 ; X32-LABEL: insertps_10:
1112 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
1115 ; X64-LABEL: insertps_10:
1117 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
1119 %vecext = extractelement <4 x float> %A, i32 0
1120 %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0
1121 %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2
1122 ret <4 x float> %vecbuild2
; Build-vector + shuffle that keeps A[1] and A[3] and zeroes lanes 0 and 2:
; should lower to xorps (materialize zero) + blendps rather than insertps.
1125 define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
1126 ; X32-LABEL: build_vector_to_shuffle_1:
1128 ; X32-NEXT: xorps %xmm1, %xmm1
1129 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1132 ; X64-LABEL: build_vector_to_shuffle_1:
1134 ; X64-NEXT: xorps %xmm1, %xmm1
1135 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1137 %vecext = extractelement <4 x float> %A, i32 1
1138 %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
1139 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
1140 %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1141 ret <4 x float> %vecinit3
; Build-vector that keeps only A[1] (lanes 0, 2 and 3 zero): should lower to
; xorps + blendps with the zero vector supplying the other three lanes.
1144 define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
1145 ; X32-LABEL: build_vector_to_shuffle_2:
1147 ; X32-NEXT: xorps %xmm1, %xmm1
1148 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1151 ; X64-LABEL: build_vector_to_shuffle_2:
1153 ; X64-NEXT: xorps %xmm1, %xmm1
1154 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1156 %vecext = extractelement <4 x float> %A, i32 1
1157 %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
1158 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
1159 ret <4 x float> %vecinit1