; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
; Test codegen for under-aligned nontemporal vector loads
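;
; Background note (a sketch of the expected lowering, not a CHECK line):
; MOVNTDQA-family nontemporal loads require their natural alignment — 16
; bytes for an XMM load, 32 for YMM (VMOVNTDQA ymm), 64 for ZMM — so an
; under-aligned !nontemporal load cannot use them directly. For align-1
; sources we therefore expect a plain unaligned load instead, e.g.
;
;   %v = load <4 x float>, <4 x float>* %p, align 1, !nontemporal !1
;
; is expected to lower to "movups (%rdi), %xmm0" rather than a MOVNTDQA.
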
; XMM versions.

define <2 x double> @test_v2f64_align1(<2 x double>* %src) nounwind {
; SSE-LABEL: test_v2f64_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_align1:
; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
  %1 = load <2 x double>, <2 x double>* %src, align 1, !nontemporal !1
  ret <2 x double> %1
}

define <4 x float> @test_v4f32_align1(<4 x float>* %src) nounwind {
; SSE-LABEL: test_v4f32_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f32_align1:
; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
  %1 = load <4 x float>, <4 x float>* %src, align 1, !nontemporal !1
  ret <4 x float> %1
}

define <2 x i64> @test_v2i64_align1(<2 x i64>* %src) nounwind {
; SSE-LABEL: test_v2i64_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64_align1:
; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
  %1 = load <2 x i64>, <2 x i64>* %src, align 1, !nontemporal !1
  ret <2 x i64> %1
}

define <4 x i32> @test_v4i32_align1(<4 x i32>* %src) nounwind {
; SSE-LABEL: test_v4i32_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4i32_align1:
; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
  %1 = load <4 x i32>, <4 x i32>* %src, align 1, !nontemporal !1
  ret <4 x i32> %1
}

define <8 x i16> @test_v8i16_align1(<8 x i16>* %src) nounwind {
; SSE-LABEL: test_v8i16_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8i16_align1:
; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
  %1 = load <8 x i16>, <8 x i16>* %src, align 1, !nontemporal !1
  ret <8 x i16> %1
}

define <16 x i8> @test_v16i8_align1(<16 x i8>* %src) nounwind {
; SSE-LABEL: test_v16i8_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16i8_align1:
; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
  %1 = load <16 x i8>, <16 x i8>* %src, align 1, !nontemporal !1
  ret <16 x i8> %1
}

; YMM versions.

define <4 x double> @test_v4f64_align1(<4 x double>* %src) nounwind {
; SSE-LABEL: test_v4f64_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_align1:
; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: retq
  %1 = load <4 x double>, <4 x double>* %src, align 1, !nontemporal !1
  ret <4 x double> %1
}

define <8 x float> @test_v8f32_align1(<8 x float>* %src) nounwind {
; SSE-LABEL: test_v8f32_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f32_align1:
; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: retq
  %1 = load <8 x float>, <8 x float>* %src, align 1, !nontemporal !1
  ret <8 x float> %1
}

define <4 x i64> @test_v4i64_align1(<4 x i64>* %src) nounwind {
; SSE-LABEL: test_v4i64_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4i64_align1:
; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: retq
  %1 = load <4 x i64>, <4 x i64>* %src, align 1, !nontemporal !1
  ret <4 x i64> %1
}

define <8 x i32> @test_v8i32_align1(<8 x i32>* %src) nounwind {
; SSE-LABEL: test_v8i32_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8i32_align1:
; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: retq
  %1 = load <8 x i32>, <8 x i32>* %src, align 1, !nontemporal !1
  ret <8 x i32> %1
}

define <16 x i16> @test_v16i16_align1(<16 x i16>* %src) nounwind {
; SSE-LABEL: test_v16i16_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16i16_align1:
; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: retq
  %1 = load <16 x i16>, <16 x i16>* %src, align 1, !nontemporal !1
  ret <16 x i16> %1
}

define <32 x i8> @test_v32i8_align1(<32 x i8>* %src) nounwind {
; SSE-LABEL: test_v32i8_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_v32i8_align1:
; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: retq
  %1 = load <32 x i8>, <32 x i8>* %src, align 1, !nontemporal !1
  ret <32 x i8> %1
}

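; Note on the align-16 tests below: 16 bytes is still enough alignment for
; 16-byte MOVNTDQA chunks (SSE4.1+). To build a wider YMM result from those
; chunks, the backend stores them to a stack slot that it realigns to 32
; bytes (the "pushq %rbp / andq $-32, %rsp" frame sequence) and then reloads
; the full vector with one aligned vmovaps, as the CHECK lines show.
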
define <4 x double> @test_v4f64_align16(<4 x double>* %src) nounwind {
; SSE2-LABEL: test_v4f64_align16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f64_align16:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f64_align16:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbp
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
; AVX-NEXT: subq $64, %rsp
; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsp)
; AVX-NEXT: vmovaps (%rsp), %ymm0
; AVX-NEXT: movq %rbp, %rsp
; AVX-NEXT: popq %rbp
; AVX-NEXT: retq
  %1 = load <4 x double>, <4 x double>* %src, align 16, !nontemporal !1
  ret <4 x double> %1
}

define <8 x float> @test_v8f32_align16(<8 x float>* %src) nounwind {
; SSE2-LABEL: test_v8f32_align16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_align16:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32_align16:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbp
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
; AVX-NEXT: subq $64, %rsp
; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsp)
; AVX-NEXT: vmovaps (%rsp), %ymm0
; AVX-NEXT: movq %rbp, %rsp
; AVX-NEXT: popq %rbp
; AVX-NEXT: retq
  %1 = load <8 x float>, <8 x float>* %src, align 16, !nontemporal !1
  ret <8 x float> %1
}

define <4 x i64> @test_v4i64_align16(<4 x i64>* %src) nounwind {
; SSE2-LABEL: test_v4i64_align16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i64_align16:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i64_align16:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbp
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
; AVX-NEXT: subq $64, %rsp
; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsp)
; AVX-NEXT: vmovaps (%rsp), %ymm0
; AVX-NEXT: movq %rbp, %rsp
; AVX-NEXT: popq %rbp
; AVX-NEXT: retq
  %1 = load <4 x i64>, <4 x i64>* %src, align 16, !nontemporal !1
  ret <4 x i64> %1
}

define <8 x i32> @test_v8i32_align16(<8 x i32>* %src) nounwind {
; SSE2-LABEL: test_v8i32_align16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i32_align16:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8i32_align16:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbp
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
; AVX-NEXT: subq $64, %rsp
; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsp)
; AVX-NEXT: vmovaps (%rsp), %ymm0
; AVX-NEXT: movq %rbp, %rsp
; AVX-NEXT: popq %rbp
; AVX-NEXT: retq
  %1 = load <8 x i32>, <8 x i32>* %src, align 16, !nontemporal !1
  ret <8 x i32> %1
}

define <16 x i16> @test_v16i16_align16(<16 x i16>* %src) nounwind {
; SSE2-LABEL: test_v16i16_align16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i16_align16:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16i16_align16:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbp
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
; AVX-NEXT: subq $64, %rsp
; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsp)
; AVX-NEXT: vmovaps (%rsp), %ymm0
; AVX-NEXT: movq %rbp, %rsp
; AVX-NEXT: popq %rbp
; AVX-NEXT: retq
  %1 = load <16 x i16>, <16 x i16>* %src, align 16, !nontemporal !1
  ret <16 x i16> %1
}

define <32 x i8> @test_v32i8_align16(<32 x i8>* %src) nounwind {
; SSE2-LABEL: test_v32i8_align16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i8_align16:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v32i8_align16:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbp
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
; AVX-NEXT: subq $64, %rsp
; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsp)
; AVX-NEXT: vmovaps (%rsp), %ymm0
; AVX-NEXT: movq %rbp, %rsp
; AVX-NEXT: popq %rbp
; AVX-NEXT: retq
  %1 = load <32 x i8>, <32 x i8>* %src, align 16, !nontemporal !1
  ret <32 x i8> %1
}

; ZMM versions.

define <8 x double> @test_v8f64_align1(<8 x double>* %src) nounwind {
; SSE-LABEL: test_v8f64_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
; SSE-NEXT: movups 48(%rdi), %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8f64_align1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovups 32(%rdi), %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8f64_align1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovups (%rdi), %ymm0
; AVX2-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %zmm0
; AVX512-NEXT: retq
  %1 = load <8 x double>, <8 x double>* %src, align 1, !nontemporal !1
  ret <8 x double> %1
}

define <16 x float> @test_v16f32_align1(<16 x float>* %src) nounwind {
; SSE-LABEL: test_v16f32_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
; SSE-NEXT: movups 48(%rdi), %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16f32_align1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovups 32(%rdi), %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16f32_align1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovups (%rdi), %ymm0
; AVX2-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f32_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %zmm0
; AVX512-NEXT: retq
  %1 = load <16 x float>, <16 x float>* %src, align 1, !nontemporal !1
  ret <16 x float> %1
}

define <8 x i64> @test_v8i64_align1(<8 x i64>* %src) nounwind {
; SSE-LABEL: test_v8i64_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
; SSE-NEXT: movups 48(%rdi), %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i64_align1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovups 32(%rdi), %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i64_align1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovups (%rdi), %ymm0
; AVX2-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i64_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %zmm0
; AVX512-NEXT: retq
  %1 = load <8 x i64>, <8 x i64>* %src, align 1, !nontemporal !1
  ret <8 x i64> %1
}

define <16 x i32> @test_v16i32_align1(<16 x i32>* %src) nounwind {
; SSE-LABEL: test_v16i32_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
; SSE-NEXT: movups 48(%rdi), %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i32_align1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovups 32(%rdi), %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i32_align1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovups (%rdi), %ymm0
; AVX2-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i32_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %zmm0
; AVX512-NEXT: retq
  %1 = load <16 x i32>, <16 x i32>* %src, align 1, !nontemporal !1
  ret <16 x i32> %1
}

define <32 x i16> @test_v32i16_align1(<32 x i16>* %src) nounwind {
; SSE-LABEL: test_v32i16_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
; SSE-NEXT: movups 48(%rdi), %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v32i16_align1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovups 32(%rdi), %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i16_align1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovups (%rdi), %ymm0
; AVX2-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512DQ-LABEL: test_v32i16_align1:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovups (%rdi), %ymm0
; AVX512DQ-NEXT: vmovups 32(%rdi), %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_v32i16_align1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovups (%rdi), %zmm0
; AVX512BW-NEXT: retq
  %1 = load <32 x i16>, <32 x i16>* %src, align 1, !nontemporal !1
  ret <32 x i16> %1
}

define <64 x i8> @test_v64i8_align1(<64 x i8>* %src) nounwind {
; SSE-LABEL: test_v64i8_align1:
; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
; SSE-NEXT: movups 48(%rdi), %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v64i8_align1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovups 32(%rdi), %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v64i8_align1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovups (%rdi), %ymm0
; AVX2-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512DQ-LABEL: test_v64i8_align1:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovups (%rdi), %ymm0
; AVX512DQ-NEXT: vmovups 32(%rdi), %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_v64i8_align1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovups (%rdi), %zmm0
; AVX512BW-NEXT: retq
  %1 = load <64 x i8>, <64 x i8>* %src, align 1, !nontemporal !1
  ret <64 x i8> %1
}

define <8 x double> @test_v8f64_align16(<8 x double>* %src) nounwind {
; SSE2-LABEL: test_v8f64_align16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
; SSE2-NEXT: movaps 48(%rdi), %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f64_align16:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v8f64_align16:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $96, %rsp
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rsp)
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovaps (%rsp), %ymm0
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8f64_align16:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $96, %rsp
; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovntdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rsp)
; AVX2-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: movq %rsp, %rbp
; AVX512-NEXT: andq $-64, %rsp
; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rsp)
; AVX512-NEXT: vmovaps (%rsp), %zmm0
; AVX512-NEXT: movq %rbp, %rsp
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
  %1 = load <8 x double>, <8 x double>* %src, align 16, !nontemporal !1
  ret <8 x double> %1
}

define <16 x float> @test_v16f32_align16(<16 x float>* %src) nounwind {
; SSE2-LABEL: test_v16f32_align16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
; SSE2-NEXT: movaps 48(%rdi), %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_align16:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v16f32_align16:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $96, %rsp
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rsp)
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovaps (%rsp), %ymm0
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16f32_align16:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $96, %rsp
; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovntdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rsp)
; AVX2-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f32_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: movq %rsp, %rbp
; AVX512-NEXT: andq $-64, %rsp
; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rsp)
; AVX512-NEXT: vmovaps (%rsp), %zmm0
; AVX512-NEXT: movq %rbp, %rsp
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
  %1 = load <16 x float>, <16 x float>* %src, align 16, !nontemporal !1
  ret <16 x float> %1
}

define <8 x i64> @test_v8i64_align16(<8 x i64>* %src) nounwind {
; SSE2-LABEL: test_v8i64_align16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
; SSE2-NEXT: movaps 48(%rdi), %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i64_align16:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v8i64_align16:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $96, %rsp
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rsp)
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovaps (%rsp), %ymm0
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i64_align16:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $96, %rsp
; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovntdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rsp)
; AVX2-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i64_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: movq %rsp, %rbp
; AVX512-NEXT: andq $-64, %rsp
; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rsp)
; AVX512-NEXT: vmovaps (%rsp), %zmm0
; AVX512-NEXT: movq %rbp, %rsp
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
  %1 = load <8 x i64>, <8 x i64>* %src, align 16, !nontemporal !1
  ret <8 x i64> %1
}

define <16 x i32> @test_v16i32_align16(<16 x i32>* %src) nounwind {
; SSE2-LABEL: test_v16i32_align16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
; SSE2-NEXT: movaps 48(%rdi), %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i32_align16:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v16i32_align16:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $96, %rsp
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rsp)
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovaps (%rsp), %ymm0
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i32_align16:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $96, %rsp
; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovntdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rsp)
; AVX2-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i32_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: movq %rsp, %rbp
; AVX512-NEXT: andq $-64, %rsp
; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rsp)
; AVX512-NEXT: vmovaps (%rsp), %zmm0
; AVX512-NEXT: movq %rbp, %rsp
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
  %1 = load <16 x i32>, <16 x i32>* %src, align 16, !nontemporal !1
  ret <16 x i32> %1
}

define <32 x i16> @test_v32i16_align16(<32 x i16>* %src) nounwind {
; SSE2-LABEL: test_v32i16_align16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
; SSE2-NEXT: movaps 48(%rdi), %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i16_align16:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v32i16_align16:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $96, %rsp
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rsp)
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovaps (%rsp), %ymm0
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i16_align16:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $96, %rsp
; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovntdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rsp)
; AVX2-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512DQ-LABEL: test_v32i16_align16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: pushq %rbp
; AVX512DQ-NEXT: movq %rsp, %rbp
; AVX512DQ-NEXT: andq $-32, %rsp
; AVX512DQ-NEXT: subq $96, %rsp
; AVX512DQ-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsp)
; AVX512DQ-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT: vmovaps (%rsp), %ymm0
; AVX512DQ-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX512DQ-NEXT: movq %rbp, %rsp
; AVX512DQ-NEXT: popq %rbp
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_v32i16_align16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: pushq %rbp
; AVX512BW-NEXT: movq %rsp, %rbp
; AVX512BW-NEXT: andq $-64, %rsp
; AVX512BW-NEXT: subq $128, %rsp
; AVX512BW-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsp)
; AVX512BW-NEXT: vmovaps (%rsp), %zmm0
; AVX512BW-NEXT: movq %rbp, %rsp
; AVX512BW-NEXT: popq %rbp
; AVX512BW-NEXT: retq
  %1 = load <32 x i16>, <32 x i16>* %src, align 16, !nontemporal !1
  ret <32 x i16> %1
}

define <64 x i8> @test_v64i8_align16(<64 x i8>* %src) nounwind {
; SSE2-LABEL: test_v64i8_align16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
; SSE2-NEXT: movaps 48(%rdi), %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v64i8_align16:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v64i8_align16:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $96, %rsp
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rsp)
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovaps (%rsp), %ymm0
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v64i8_align16:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $96, %rsp
; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovntdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rsp)
; AVX2-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512DQ-LABEL: test_v64i8_align16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: pushq %rbp
; AVX512DQ-NEXT: movq %rsp, %rbp
; AVX512DQ-NEXT: andq $-32, %rsp
; AVX512DQ-NEXT: subq $96, %rsp
; AVX512DQ-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsp)
; AVX512DQ-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT: vmovaps (%rsp), %ymm0
; AVX512DQ-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX512DQ-NEXT: movq %rbp, %rsp
; AVX512DQ-NEXT: popq %rbp
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_v64i8_align16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: pushq %rbp
; AVX512BW-NEXT: movq %rsp, %rbp
; AVX512BW-NEXT: andq $-64, %rsp
; AVX512BW-NEXT: subq $128, %rsp
; AVX512BW-NEXT: vmovntdqa 48(%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsp)
; AVX512BW-NEXT: vmovaps (%rsp), %zmm0
; AVX512BW-NEXT: movq %rbp, %rsp
; AVX512BW-NEXT: popq %rbp
; AVX512BW-NEXT: retq
  %1 = load <64 x i8>, <64 x i8>* %src, align 16, !nontemporal !1
  ret <64 x i8> %1
}

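; Note on the align-32 tests below: with 32-byte alignment, AVX2's 256-bit
; VMOVNTDQA can be used directly, and AVX1 (which lacks it) builds each YMM
; value from two 16-byte MOVNTDQA chunks via vinsertf128. Only the AVX512
; ZMM cases still go through the realigned stack slot, since VMOVNTDQA with
; a ZMM destination requires 64-byte alignment.
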
define <8 x double> @test_v8f64_align32(<8 x double>* %src) nounwind {
; SSE2-LABEL: test_v8f64_align32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
; SSE2-NEXT: movaps 48(%rdi), %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f64_align32:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v8f64_align32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8f64_align32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64_align32:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: movq %rsp, %rbp
; AVX512-NEXT: andq $-64, %rsp
; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rsp)
; AVX512-NEXT: vmovaps (%rsp), %zmm0
; AVX512-NEXT: movq %rbp, %rsp
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
  %1 = load <8 x double>, <8 x double>* %src, align 32, !nontemporal !1
  ret <8 x double> %1
}

define <16 x float> @test_v16f32_align32(<16 x float>* %src) nounwind {
; SSE2-LABEL: test_v16f32_align32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
; SSE2-NEXT: movaps 48(%rdi), %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_align32:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v16f32_align32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16f32_align32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f32_align32:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: movq %rsp, %rbp
; AVX512-NEXT: andq $-64, %rsp
; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rsp)
; AVX512-NEXT: vmovaps (%rsp), %zmm0
; AVX512-NEXT: movq %rbp, %rsp
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
  %1 = load <16 x float>, <16 x float>* %src, align 32, !nontemporal !1
  ret <16 x float> %1
}

define <8 x i64> @test_v8i64_align32(<8 x i64>* %src) nounwind {
; SSE2-LABEL: test_v8i64_align32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
; SSE2-NEXT: movaps 48(%rdi), %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i64_align32:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v8i64_align32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i64_align32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i64_align32:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: movq %rsp, %rbp
; AVX512-NEXT: andq $-64, %rsp
; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rsp)
; AVX512-NEXT: vmovaps (%rsp), %zmm0
; AVX512-NEXT: movq %rbp, %rsp
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
  %1 = load <8 x i64>, <8 x i64>* %src, align 32, !nontemporal !1
  ret <8 x i64> %1
}

define <16 x i32> @test_v16i32_align32(<16 x i32>* %src) nounwind {
; SSE2-LABEL: test_v16i32_align32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
; SSE2-NEXT: movaps 48(%rdi), %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i32_align32:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v16i32_align32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i32_align32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i32_align32:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: movq %rsp, %rbp
; AVX512-NEXT: andq $-64, %rsp
; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rsp)
; AVX512-NEXT: vmovaps (%rsp), %zmm0
; AVX512-NEXT: movq %rbp, %rsp
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
  %1 = load <16 x i32>, <16 x i32>* %src, align 32, !nontemporal !1
  ret <16 x i32> %1
}

define <32 x i16> @test_v32i16_align32(<32 x i16>* %src) nounwind {
; SSE2-LABEL: test_v32i16_align32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
; SSE2-NEXT: movaps 48(%rdi), %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i16_align32:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v32i16_align32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i16_align32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512DQ-LABEL: test_v32i16_align32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_v32i16_align32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: pushq %rbp
; AVX512BW-NEXT: movq %rsp, %rbp
; AVX512BW-NEXT: andq $-64, %rsp
; AVX512BW-NEXT: subq $128, %rsp
; AVX512BW-NEXT: vmovntdqa 32(%rdi), %ymm0
; AVX512BW-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsp)
; AVX512BW-NEXT: vmovaps (%rsp), %zmm0
; AVX512BW-NEXT: movq %rbp, %rsp
; AVX512BW-NEXT: popq %rbp
; AVX512BW-NEXT: retq
  %1 = load <32 x i16>, <32 x i16>* %src, align 32, !nontemporal !1
  ret <32 x i16> %1
}

define <64 x i8> @test_v64i8_align32(<64 x i8>* %src) nounwind {
; SSE2-LABEL: test_v64i8_align32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
; SSE2-NEXT: movaps 48(%rdi), %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v64i8_align32:
; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v64i8_align32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v64i8_align32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512DQ-LABEL: test_v64i8_align32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_v64i8_align32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: pushq %rbp
; AVX512BW-NEXT: movq %rsp, %rbp
; AVX512BW-NEXT: andq $-64, %rsp
; AVX512BW-NEXT: subq $128, %rsp
; AVX512BW-NEXT: vmovntdqa 32(%rdi), %ymm0
; AVX512BW-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsp)
; AVX512BW-NEXT: vmovaps (%rsp), %zmm0
; AVX512BW-NEXT: movq %rbp, %rsp
; AVX512BW-NEXT: popq %rbp
; AVX512BW-NEXT: retq
  %1 = load <64 x i8>, <64 x i8>* %src, align 32, !nontemporal !1
  ret <64 x i8> %1
}

!1 = !{i32 1}