; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2
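
; Inserting a scalar into an undef vector at a variable index defines only one
; lane and leaves the rest undef, so it is legal to lower these as a splat
; (broadcast) of the scalar instead of going through memory.
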
define <16 x i8> @arg_i8_v16i8(i8 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i8_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movd %edi, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pshufb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i8_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: arg_i8_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: retq
  %ins = insertelement <16 x i8> undef, i8 %x, i32 %y
  ret <16 x i8> %ins
}

define <8 x i16> @arg_i16_v8i16(i16 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i16_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movd %edi, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i16_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: arg_i16_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: retq
  %ins = insertelement <8 x i16> undef, i16 %x, i32 %y
  ret <8 x i16> %ins
}

define <4 x i32> @arg_i32_v4i32(i32 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i32_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movd %edi, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i32_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: arg_i32_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT: retq
  %ins = insertelement <4 x i32> undef, i32 %x, i32 %y
  ret <4 x i32> %ins
}

define <2 x i64> @arg_i64_v2i64(i64 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i64_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i64_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: arg_i64_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rdi, %xmm0
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: retq
  %ins = insertelement <2 x i64> undef, i64 %x, i32 %y
  ret <2 x i64> %ins
}

define <4 x float> @arg_f32_v4f32(float %x, i32 %y) nounwind {
; SSE-LABEL: arg_f32_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_f32_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: arg_f32_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
; AVX2-NEXT: retq
  %ins = insertelement <4 x float> undef, float %x, i32 %y
  ret <4 x float> %ins
}

define <2 x double> @arg_f64_v2f64(double %x, i32 %y) nounwind {
; SSE-LABEL: arg_f64_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: arg_f64_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: retq
  %ins = insertelement <2 x double> undef, double %x, i32 %y
  ret <2 x double> %ins
}

define <16 x i8> @load_i8_v16i8(i8* %p, i32 %y) nounwind {
; SSE-LABEL: load_i8_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movzbl (%rdi), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pshufb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i8_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: movzbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i8_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
; AVX2-NEXT: retq
  %x = load i8, i8* %p
  %ins = insertelement <16 x i8> undef, i8 %x, i32 %y
  ret <16 x i8> %ins
}

define <8 x i16> @load_i16_v8i16(i16* %p, i32 %y) nounwind {
; SSE-LABEL: load_i16_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movzwl (%rdi), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i16_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: movzwl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i16_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
; AVX2-NEXT: retq
  %x = load i16, i16* %p
  %ins = insertelement <8 x i16> undef, i16 %x, i32 %y
  ret <8 x i16> %ins
}

define <4 x i32> @load_i32_v4i32(i32* %p, i32 %y) nounwind {
; SSE-LABEL: load_i32_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss (%rdi), %xmm0
; AVX-NEXT: retq
  %x = load i32, i32* %p
  %ins = insertelement <4 x i32> undef, i32 %x, i32 %y
  ret <4 x i32> %ins
}

define <2 x i64> @load_i64_v2i64(i64* %p, i32 %y) nounwind {
; SSE-LABEL: load_i64_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: load_i64_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT: retq
  %x = load i64, i64* %p
  %ins = insertelement <2 x i64> undef, i64 %x, i32 %y
  ret <2 x i64> %ins
}

define <4 x float> @load_f32_v4f32(float* %p, i32 %y) nounwind {
; SSE-LABEL: load_f32_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: load_f32_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss (%rdi), %xmm0
; AVX-NEXT: retq
  %x = load float, float* %p
  %ins = insertelement <4 x float> undef, float %x, i32 %y
  ret <4 x float> %ins
}

define <2 x double> @load_f64_v2f64(double* %p, i32 %y) nounwind {
; SSE-LABEL: load_f64_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: load_f64_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT: retq
  %x = load double, double* %p
  %ins = insertelement <2 x double> undef, double %x, i32 %y
  ret <2 x double> %ins
}

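; 256-bit insertions: without AVX the scalar is stored into a 32-byte aligned
; stack slot and the vector is reloaded, while AVX1/AVX2 still lower these as
; register or memory broadcasts.
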
define <32 x i8> @arg_i8_v32i8(i8 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i8_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: andl $31, %esi
; SSE-NEXT: movb %dil, (%rsp,%rsi)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: movq %rbp, %rsp
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i8_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: arg_i8_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: retq
  %ins = insertelement <32 x i8> undef, i8 %x, i32 %y
  ret <32 x i8> %ins
}

define <16 x i16> @arg_i16_v16i16(i16 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i16_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: andl $15, %esi
; SSE-NEXT: movw %di, (%rsp,%rsi,2)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: movq %rbp, %rsp
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i16_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: arg_i16_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: retq
  %ins = insertelement <16 x i16> undef, i16 %x, i32 %y
  ret <16 x i16> %ins
}

define <8 x i32> @arg_i32_v8i32(i32 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i32_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: andl $7, %esi
; SSE-NEXT: movl %edi, (%rsp,%rsi,4)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: movq %rbp, %rsp
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i32_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: arg_i32_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT: retq
  %ins = insertelement <8 x i32> undef, i32 %x, i32 %y
  ret <8 x i32> %ins
}

define <4 x i64> @arg_i64_v4i64(i64 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i64_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: andl $3, %esi
; SSE-NEXT: movq %rdi, (%rsp,%rsi,8)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: movq %rbp, %rsp
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_i64_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: arg_i64_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rdi, %xmm0
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-NEXT: retq
  %ins = insertelement <4 x i64> undef, i64 %x, i32 %y
  ret <4 x i64> %ins
}

define <8 x float> @arg_f32_v8f32(float %x, i32 %y) nounwind {
; SSE-LABEL: arg_f32_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: andl $7, %edi
; SSE-NEXT: movss %xmm0, (%rsp,%rdi,4)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: movq %rbp, %rsp
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_f32_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: arg_f32_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2-NEXT: retq
  %ins = insertelement <8 x float> undef, float %x, i32 %y
  ret <8 x float> %ins
}

define <4 x double> @arg_f64_v4f64(double %x, i32 %y) nounwind {
; SSE-LABEL: arg_f64_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: andl $3, %edi
; SSE-NEXT: movsd %xmm0, (%rsp,%rdi,8)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: movq %rbp, %rsp
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX1-LABEL: arg_f64_v4f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: arg_f64_v4f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT: retq
  %ins = insertelement <4 x double> undef, double %x, i32 %y
  ret <4 x double> %ins
}

define <32 x i8> @load_i8_v32i8(i8* %p, i32 %y) nounwind {
; SSE-LABEL: load_i8_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movb (%rdi), %al
; SSE-NEXT: andl $31, %esi
; SSE-NEXT: movb %al, (%rsp,%rsi)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: movq %rbp, %rsp
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i8_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: movzbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i8_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX2-NEXT: retq
  %x = load i8, i8* %p
  %ins = insertelement <32 x i8> undef, i8 %x, i32 %y
  ret <32 x i8> %ins
}

define <16 x i16> @load_i16_v16i16(i16* %p, i32 %y) nounwind {
; SSE-LABEL: load_i16_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movzwl (%rdi), %eax
; SSE-NEXT: andl $15, %esi
; SSE-NEXT: movw %ax, (%rsp,%rsi,2)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: movq %rbp, %rsp
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i16_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: movzwl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i16_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX2-NEXT: retq
  %x = load i16, i16* %p
  %ins = insertelement <16 x i16> undef, i16 %x, i32 %y
  ret <16 x i16> %ins
}

define <8 x i32> @load_i32_v8i32(i32* %p, i32 %y) nounwind {
; SSE-LABEL: load_i32_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movl (%rdi), %eax
; SSE-NEXT: andl $7, %esi
; SSE-NEXT: movl %eax, (%rsp,%rsi,4)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: movq %rbp, %rsp
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_v8i32:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss (%rdi), %ymm0
; AVX-NEXT: retq
  %x = load i32, i32* %p
  %ins = insertelement <8 x i32> undef, i32 %x, i32 %y
  ret <8 x i32> %ins
}

define <4 x i64> @load_i64_v4i64(i64* %p, i32 %y) nounwind {
; SSE-LABEL: load_i64_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movq (%rdi), %rax
; SSE-NEXT: andl $3, %esi
; SSE-NEXT: movq %rax, (%rsp,%rsi,8)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: movq %rbp, %rsp
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX-LABEL: load_i64_v4i64:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX-NEXT: retq
  %x = load i64, i64* %p
  %ins = insertelement <4 x i64> undef, i64 %x, i32 %y
  ret <4 x i64> %ins
}

define <8 x float> @load_f32_v8f32(float* %p, i32 %y) nounwind {
; SSE-LABEL: load_f32_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: andl $7, %esi
; SSE-NEXT: movss %xmm0, (%rsp,%rsi,4)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: movq %rbp, %rsp
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX-LABEL: load_f32_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss (%rdi), %ymm0
; AVX-NEXT: retq
  %x = load float, float* %p
  %ins = insertelement <8 x float> undef, float %x, i32 %y
  ret <8 x float> %ins
}

define <4 x double> @load_f64_v4f64(double* %p, i32 %y) nounwind {
; SSE-LABEL: load_f64_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: # kill: def $esi killed $esi def $rsi
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: andl $3, %esi
; SSE-NEXT: movsd %xmm0, (%rsp,%rsi,8)
; SSE-NEXT: movaps (%rsp), %xmm0
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: movq %rbp, %rsp
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX-LABEL: load_f64_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX-NEXT: retq
  %x = load double, double* %p
  %ins = insertelement <4 x double> undef, double %x, i32 %y
  ret <4 x double> %ins
}