1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
7 ; Insert a scalar loaded from memory into element 0 of an SSE (128-bit) vector register.
; Loads a float from %s.addr and inserts it into element 0 of %x.
; Expected lowering per the checks below: a scalar movss load followed by a
; movss merge with SSE2, or by a blendps/vblendps merge with SSE4.1 and AVX.
9 define <4 x float> @insert_f32_firstelt(<4 x float> %x, float* %s.addr) {
10 ; SSE2-LABEL: insert_f32_firstelt:
12 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
13 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
16 ; SSE41-LABEL: insert_f32_firstelt:
18 ; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
19 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
22 ; AVX-LABEL: insert_f32_firstelt:
24 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
25 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
27 %s = load float, float* %s.addr
28 %i0 = insertelement <4 x float> %x, float %s, i32 0
; Loads a double from %s.addr and inserts it into element 0 of %x.
; The checks expect the load to be folded into a single movlps/vmovlps that
; replaces the low half of xmm0 directly from memory.
32 define <2 x double> @insert_f64_firstelt(<2 x double> %x, double* %s.addr) {
33 ; SSE-LABEL: insert_f64_firstelt:
35 ; SSE-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
38 ; AVX-LABEL: insert_f64_firstelt:
40 ; AVX-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
42 %s = load double, double* %s.addr
43 %i0 = insertelement <2 x double> %x, double %s, i32 0
; Loads an i8 from %s.addr and inserts it into element 0 of %x.
; With SSE2 the checks expect a mask-and-merge sequence: a constant mask
; clears byte 0 of the vector (pand), the byte is loaded zero-extended
; (movzbl) and moved into an xmm register (movd), then combined via pandn/por.
; With SSE4.1 and AVX the checks expect one pinsrb/vpinsrb with a memory operand.
47 define <16 x i8> @insert_i8_firstelt(<16 x i8> %x, i8* %s.addr) {
48 ; SSE2-LABEL: insert_i8_firstelt:
50 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
51 ; SSE2-NEXT: pand %xmm1, %xmm0
52 ; SSE2-NEXT: movzbl (%rdi), %eax
53 ; SSE2-NEXT: movd %eax, %xmm2
54 ; SSE2-NEXT: pandn %xmm2, %xmm1
55 ; SSE2-NEXT: por %xmm1, %xmm0
58 ; SSE41-LABEL: insert_i8_firstelt:
60 ; SSE41-NEXT: pinsrb $0, (%rdi), %xmm0
63 ; AVX-LABEL: insert_i8_firstelt:
65 ; AVX-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0
67 %s = load i8, i8* %s.addr
68 %i0 = insertelement <16 x i8> %x, i8 %s, i32 0
; Loads an i16 from %s.addr and inserts it into element 0 of %x.
; The checks expect a single pinsrw/vpinsrw with the load folded into its
; memory operand on every tested subtarget.
72 define <8 x i16> @insert_i16_firstelt(<8 x i16> %x, i16* %s.addr) {
73 ; SSE-LABEL: insert_i16_firstelt:
75 ; SSE-NEXT: pinsrw $0, (%rdi), %xmm0
78 ; AVX-LABEL: insert_i16_firstelt:
80 ; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
82 %s = load i16, i16* %s.addr
83 %i0 = insertelement <8 x i16> %x, i16 %s, i32 0
; Loads an i32 from %s.addr and inserts it into element 0 of %x.
; With SSE2 the checks expect a movss load followed by a movss merge.
; With SSE4.1 and AVX they expect one pinsrd/vpinsrd with a memory operand.
87 define <4 x i32> @insert_i32_firstelt(<4 x i32> %x, i32* %s.addr) {
88 ; SSE2-LABEL: insert_i32_firstelt:
90 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
91 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
94 ; SSE41-LABEL: insert_i32_firstelt:
96 ; SSE41-NEXT: pinsrd $0, (%rdi), %xmm0
99 ; AVX-LABEL: insert_i32_firstelt:
101 ; AVX-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
103 %s = load i32, i32* %s.addr
104 %i0 = insertelement <4 x i32> %x, i32 %s, i32 0
; Loads an i64 from %s.addr and inserts it into element 0 of %x.
; With SSE2 the checks expect a movlps that replaces the low half of xmm0
; from memory. With SSE4.1 and AVX they expect pinsrq/vpinsrq with a memory
; operand.
108 define <2 x i64> @insert_i64_firstelt(<2 x i64> %x, i64* %s.addr) {
109 ; SSE2-LABEL: insert_i64_firstelt:
111 ; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
114 ; SSE41-LABEL: insert_i64_firstelt:
116 ; SSE41-NEXT: pinsrq $0, (%rdi), %xmm0
119 ; AVX-LABEL: insert_i64_firstelt:
121 ; AVX-NEXT: vpinsrq $0, (%rdi), %xmm0, %xmm0
123 %s = load i64, i64* %s.addr
124 %i0 = insertelement <2 x i64> %x, i64 %s, i32 0
128 ; Insert a scalar loaded from memory into element 1.
; Loads a float from %s.addr and inserts it into element 1 of %x.
; With SSE2 the checks expect a movss load, a movlhps + shufps shuffle pair
; that builds the result in xmm1, and a movaps copy back into xmm0.
; With SSE4.1 and AVX they expect one insertps/vinsertps reading from memory.
130 define <4 x float> @insert_f32_secondelt(<4 x float> %x, float* %s.addr) {
131 ; SSE2-LABEL: insert_f32_secondelt:
133 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
134 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
135 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
136 ; SSE2-NEXT: movaps %xmm1, %xmm0
139 ; SSE41-LABEL: insert_f32_secondelt:
141 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
144 ; AVX-LABEL: insert_f32_secondelt:
146 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
148 %s = load float, float* %s.addr
149 %i0 = insertelement <4 x float> %x, float %s, i32 1
; Loads a double from %s.addr and inserts it into element 1 of %x.
; The checks expect the load to be folded into a single movhps/vmovhps that
; replaces the high half of xmm0 directly from memory.
153 define <2 x double> @insert_f64_secondelt(<2 x double> %x, double* %s.addr) {
154 ; SSE-LABEL: insert_f64_secondelt:
156 ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
159 ; AVX-LABEL: insert_f64_secondelt:
161 ; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
163 %s = load double, double* %s.addr
164 %i0 = insertelement <2 x double> %x, double %s, i32 1
; Loads an i8 from %s.addr and inserts it into element 1 of %x.
; With SSE2 the checks expect the same mask-and-merge shape as the element-0
; case, but the constant mask clears byte 1 and the loaded byte is shifted
; left by 8 bits (psllw $8) before the pandn/por merge.
; With SSE4.1 and AVX they expect one pinsrb/vpinsrb with a memory operand.
168 define <16 x i8> @insert_i8_secondelt(<16 x i8> %x, i8* %s.addr) {
169 ; SSE2-LABEL: insert_i8_secondelt:
171 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
172 ; SSE2-NEXT: pand %xmm1, %xmm0
173 ; SSE2-NEXT: movzbl (%rdi), %eax
174 ; SSE2-NEXT: movd %eax, %xmm2
175 ; SSE2-NEXT: psllw $8, %xmm2
176 ; SSE2-NEXT: pandn %xmm2, %xmm1
177 ; SSE2-NEXT: por %xmm1, %xmm0
180 ; SSE41-LABEL: insert_i8_secondelt:
182 ; SSE41-NEXT: pinsrb $1, (%rdi), %xmm0
185 ; AVX-LABEL: insert_i8_secondelt:
187 ; AVX-NEXT: vpinsrb $1, (%rdi), %xmm0, %xmm0
189 %s = load i8, i8* %s.addr
190 %i0 = insertelement <16 x i8> %x, i8 %s, i32 1
; Loads an i16 from %s.addr and inserts it into element 1 of %x.
; The checks expect a single pinsrw/vpinsrw with immediate 1 and the load
; folded into its memory operand on every tested subtarget.
194 define <8 x i16> @insert_i16_secondelt(<8 x i16> %x, i16* %s.addr) {
195 ; SSE-LABEL: insert_i16_secondelt:
197 ; SSE-NEXT: pinsrw $1, (%rdi), %xmm0
200 ; AVX-LABEL: insert_i16_secondelt:
202 ; AVX-NEXT: vpinsrw $1, (%rdi), %xmm0, %xmm0
204 %s = load i16, i16* %s.addr
205 %i0 = insertelement <8 x i16> %x, i16 %s, i32 1
; Loads an i32 from %s.addr and inserts it into element 1 of %x.
; With SSE2 the checks expect a movss load, a movlhps + shufps shuffle pair
; that builds the result in xmm1, and a movaps copy back into xmm0.
; With SSE4.1 and AVX they expect one pinsrd/vpinsrd with a memory operand.
209 define <4 x i32> @insert_i32_secondelt(<4 x i32> %x, i32* %s.addr) {
210 ; SSE2-LABEL: insert_i32_secondelt:
212 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
213 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
214 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
215 ; SSE2-NEXT: movaps %xmm1, %xmm0
218 ; SSE41-LABEL: insert_i32_secondelt:
220 ; SSE41-NEXT: pinsrd $1, (%rdi), %xmm0
223 ; AVX-LABEL: insert_i32_secondelt:
225 ; AVX-NEXT: vpinsrd $1, (%rdi), %xmm0, %xmm0
227 %s = load i32, i32* %s.addr
228 %i0 = insertelement <4 x i32> %x, i32 %s, i32 1
; Loads an i64 from %s.addr and inserts it into element 1 of %x.
; With SSE2 the checks expect a movsd load into xmm1 followed by a movlhps
; that places it in the high half of xmm0. With SSE4.1 and AVX they expect
; pinsrq/vpinsrq with a memory operand.
232 define <2 x i64> @insert_i64_secondelt(<2 x i64> %x, i64* %s.addr) {
233 ; SSE2-LABEL: insert_i64_secondelt:
235 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
236 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
239 ; SSE41-LABEL: insert_i64_secondelt:
241 ; SSE41-NEXT: pinsrq $1, (%rdi), %xmm0
244 ; AVX-LABEL: insert_i64_secondelt:
246 ; AVX-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm0
248 %s = load i64, i64* %s.addr
249 %i0 = insertelement <2 x i64> %x, i64 %s, i32 1
253 ; Insert the same loaded scalar into two elements (0 and 1) of one vector.
; Loads one float from %s.addr and inserts it into elements 0 and 1 of %x.
; The checks expect a single scalar load followed by one shufps/vshufps that
; takes the loaded value twice and keeps elements 2 and 3 of xmm0. The
; non-AVX runs additionally copy the result back with movaps.
255 define <4 x float> @insert_f32_two_elts(<4 x float> %x, float* %s.addr) {
256 ; SSE-LABEL: insert_f32_two_elts:
258 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
259 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
260 ; SSE-NEXT: movaps %xmm1, %xmm0
263 ; AVX-LABEL: insert_f32_two_elts:
265 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
266 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[2,3]
268 %s = load float, float* %s.addr
269 %i0 = insertelement <4 x float> %x, float %s, i32 0
270 %i1 = insertelement <4 x float> %i0, float %s, i32 1
; Loads one double from %s.addr and inserts it into elements 0 and 1 of %x.
; Both elements of the two-element vector are overwritten, so the expected
; code never reads the incoming %x: SSE2 uses a movsd load plus movlhps to
; duplicate it, while SSE4.1 and AVX use a movddup/vmovddup from memory.
274 define <2 x double> @insert_f64_two_elts(<2 x double> %x, double* %s.addr) {
275 ; SSE2-LABEL: insert_f64_two_elts:
277 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
278 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
281 ; SSE41-LABEL: insert_f64_two_elts:
283 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
286 ; AVX-LABEL: insert_f64_two_elts:
288 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
290 %s = load double, double* %s.addr
291 %i0 = insertelement <2 x double> %x, double %s, i32 0
292 %i1 = insertelement <2 x double> %i0, double %s, i32 1
; Loads one i8 from %s.addr and inserts it into elements 0 and 1 of %x.
; With SSE2 the checks expect two back-to-back mask-and-merge rounds: the
; first clears and fills byte 0, the second clears byte 1 and fills it with
; the same value shifted left by 8 bits (psllw $8). The byte is loaded once.
; With SSE4.1 and AVX they expect one movzbl load and two pinsrb/vpinsrb
; instructions that take the value from %eax rather than from memory.
296 define <16 x i8> @insert_i8_two_elts(<16 x i8> %x, i8* %s.addr) {
297 ; SSE2-LABEL: insert_i8_two_elts:
299 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
300 ; SSE2-NEXT: pand %xmm1, %xmm0
301 ; SSE2-NEXT: movzbl (%rdi), %eax
302 ; SSE2-NEXT: movd %eax, %xmm2
303 ; SSE2-NEXT: pandn %xmm2, %xmm1
304 ; SSE2-NEXT: por %xmm1, %xmm0
305 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
306 ; SSE2-NEXT: pand %xmm1, %xmm0
307 ; SSE2-NEXT: psllw $8, %xmm2
308 ; SSE2-NEXT: pandn %xmm2, %xmm1
309 ; SSE2-NEXT: por %xmm1, %xmm0
312 ; SSE41-LABEL: insert_i8_two_elts:
314 ; SSE41-NEXT: movzbl (%rdi), %eax
315 ; SSE41-NEXT: pinsrb $0, %eax, %xmm0
316 ; SSE41-NEXT: pinsrb $1, %eax, %xmm0
319 ; AVX-LABEL: insert_i8_two_elts:
321 ; AVX-NEXT: movzbl (%rdi), %eax
322 ; AVX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
323 ; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
325 %s = load i8, i8* %s.addr
326 %i0 = insertelement <16 x i8> %x, i8 %s, i32 0
327 %i1 = insertelement <16 x i8> %i0, i8 %s, i32 1
; Loads one i16 from %s.addr and inserts it into elements 0 and 1 of %x.
; The checks expect a single zero-extending movzwl load into %eax followed by
; two pinsrw/vpinsrw instructions that take the value from the register.
331 define <8 x i16> @insert_i16_two_elts(<8 x i16> %x, i16* %s.addr) {
332 ; SSE-LABEL: insert_i16_two_elts:
334 ; SSE-NEXT: movzwl (%rdi), %eax
335 ; SSE-NEXT: pinsrw $0, %eax, %xmm0
336 ; SSE-NEXT: pinsrw $1, %eax, %xmm0
339 ; AVX-LABEL: insert_i16_two_elts:
341 ; AVX-NEXT: movzwl (%rdi), %eax
342 ; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
343 ; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
345 %s = load i16, i16* %s.addr
346 %i0 = insertelement <8 x i16> %x, i16 %s, i32 0
347 %i1 = insertelement <8 x i16> %i0, i16 %s, i32 1
; Loads one i32 from %s.addr and inserts it into elements 0 and 1 of %x.
; With SSE2 the checks expect one movl load, two movd transfers of the same
; GPR value into xmm registers, a punpcklqdq + shufps shuffle pair, and a
; movaps copy back into xmm0.
; With SSE4.1 and AVX they expect one movl load and two pinsrd/vpinsrd
; instructions that take the value from %eax.
351 define <4 x i32> @insert_i32_two_elts(<4 x i32> %x, i32* %s.addr) {
352 ; SSE2-LABEL: insert_i32_two_elts:
354 ; SSE2-NEXT: movl (%rdi), %eax
355 ; SSE2-NEXT: movd %eax, %xmm2
356 ; SSE2-NEXT: movd %eax, %xmm1
357 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
358 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
359 ; SSE2-NEXT: movaps %xmm1, %xmm0
362 ; SSE41-LABEL: insert_i32_two_elts:
364 ; SSE41-NEXT: movl (%rdi), %eax
365 ; SSE41-NEXT: pinsrd $0, %eax, %xmm0
366 ; SSE41-NEXT: pinsrd $1, %eax, %xmm0
369 ; AVX-LABEL: insert_i32_two_elts:
371 ; AVX-NEXT: movl (%rdi), %eax
372 ; AVX-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0
373 ; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
375 %s = load i32, i32* %s.addr
376 %i0 = insertelement <4 x i32> %x, i32 %s, i32 0
377 %i1 = insertelement <4 x i32> %i0, i32 %s, i32 1
; Loads one i64 from %s.addr and inserts it into elements 0 and 1 of %x,
; which overwrites every element of the two-element vector.
; With SSE2 the checks expect one movq load into %rax, two movq transfers
; into xmm registers, and a punpcklqdq that joins them.
; With SSE4.1 and AVX they expect one movq load and two pinsrq/vpinsrq
; instructions that take the value from %rax.
381 define <2 x i64> @insert_i64_two_elts(<2 x i64> %x, i64* %s.addr) {
382 ; SSE2-LABEL: insert_i64_two_elts:
384 ; SSE2-NEXT: movq (%rdi), %rax
385 ; SSE2-NEXT: movq %rax, %xmm0
386 ; SSE2-NEXT: movq %rax, %xmm1
387 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
390 ; SSE41-LABEL: insert_i64_two_elts:
392 ; SSE41-NEXT: movq (%rdi), %rax
393 ; SSE41-NEXT: pinsrq $0, %rax, %xmm0
394 ; SSE41-NEXT: pinsrq $1, %rax, %xmm0
397 ; AVX-LABEL: insert_i64_two_elts:
399 ; AVX-NEXT: movq (%rdi), %rax
400 ; AVX-NEXT: vpinsrq $0, %rax, %xmm0, %xmm0
401 ; AVX-NEXT: vpinsrq $1, %rax, %xmm0, %xmm0
403 %s = load i64, i64* %s.addr
404 %i0 = insertelement <2 x i64> %x, i64 %s, i32 0
405 %i1 = insertelement <2 x i64> %i0, i64 %s, i32 1
; Loads one i32 from %s.addr and inserts it into two different vectors:
; element 0 of %x and element 1 of %y. Both results are stored, %x's to
; %x.out.addr and %y's to %y.out.addr.
; All runs are expected to load the scalar once into %eax. SSE2 then moves it
; into an xmm register twice (movd) and merges with movss for %x and with
; punpcklqdq + shufps for %y. SSE4.1 and AVX use one pinsrd/vpinsrd per
; vector, taking the value from %eax.
411 define void @insert_i32_two_elts_into_different_vectors(<4 x i32> %x, <4 x i32> %y, i32* %s.addr, <4 x i32>* %x.out.addr, <4 x i32>* %y.out.addr) {
412 ; SSE2-LABEL: insert_i32_two_elts_into_different_vectors:
414 ; SSE2-NEXT: movl (%rdi), %eax
415 ; SSE2-NEXT: movd %eax, %xmm2
416 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
417 ; SSE2-NEXT: movd %eax, %xmm2
418 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
419 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
420 ; SSE2-NEXT: movaps %xmm0, (%rsi)
421 ; SSE2-NEXT: movaps %xmm2, (%rdx)
424 ; SSE41-LABEL: insert_i32_two_elts_into_different_vectors:
426 ; SSE41-NEXT: movl (%rdi), %eax
427 ; SSE41-NEXT: pinsrd $0, %eax, %xmm0
428 ; SSE41-NEXT: pinsrd $1, %eax, %xmm1
429 ; SSE41-NEXT: movdqa %xmm0, (%rsi)
430 ; SSE41-NEXT: movdqa %xmm1, (%rdx)
433 ; AVX-LABEL: insert_i32_two_elts_into_different_vectors:
435 ; AVX-NEXT: movl (%rdi), %eax
436 ; AVX-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0
437 ; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
438 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
439 ; AVX-NEXT: vmovdqa %xmm1, (%rdx)
441 %s = load i32, i32* %s.addr
442 %i0 = insertelement <4 x i32> %x, i32 %s, i32 0
443 %i1 = insertelement <4 x i32> %y, i32 %s, i32 1
444 store <4 x i32> %i0, <4 x i32>* %x.out.addr
445 store <4 x i32> %i1, <4 x i32>* %y.out.addr
; Loads one float from %s.addr, stores that scalar to %s.out as an extra use,
; and inserts it into elements 0 and 1 of %x.
; The checks expect a single scalar load whose register is used both for the
; scalar store (movss/vmovss to (%rsi)) and for the shufps/vshufps that
; builds the vector. The non-AVX runs also copy the result back with movaps.
449 define <4 x float> @insert_f32_two_elts_extrause_of_scalar(<4 x float> %x, float* %s.addr, float* %s.out) {
450 ; SSE-LABEL: insert_f32_two_elts_extrause_of_scalar:
452 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
453 ; SSE-NEXT: movss %xmm1, (%rsi)
454 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
455 ; SSE-NEXT: movaps %xmm1, %xmm0
458 ; AVX-LABEL: insert_f32_two_elts_extrause_of_scalar:
460 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
461 ; AVX-NEXT: vmovss %xmm1, (%rsi)
462 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[2,3]
464 %s = load float, float* %s.addr
465 store float %s, float* %s.out
466 %i0 = insertelement <4 x float> %x, float %s, i32 0
467 %i1 = insertelement <4 x float> %i0, float %s, i32 1