; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW

; Test codegen for under-aligned nontemporal vector loads
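;
; MOVNTDQA (SSE4.1) and its VEX/EVEX forms are the only nontemporal vector
; load instructions, and they fault unless the memory operand is aligned to
; the full vector width (16/32/64 bytes). Every load below carries
; !nontemporal metadata but is under-aligned for its type, so codegen must
; either fall back to ordinary (temporal) loads or assemble the wide vector
; from narrower nontemporal chunks that the alignment does permit.
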
; XMM versions.

define <2 x double> @test_v2f64_align1(<2 x double>* %src) nounwind {
; SSE-LABEL: test_v2f64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <2 x double>, <2 x double>* %src, align 1, !nontemporal !1
  ret <2 x double> %1
}

define <4 x float> @test_v4f32_align1(<4 x float>* %src) nounwind {
; SSE-LABEL: test_v4f32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %src, align 1, !nontemporal !1
  ret <4 x float> %1
}

define <2 x i64> @test_v2i64_align1(<2 x i64>* %src) nounwind {
; SSE-LABEL: test_v2i64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64>* %src, align 1, !nontemporal !1
  ret <2 x i64> %1
}

define <4 x i32> @test_v4i32_align1(<4 x i32>* %src) nounwind {
; SSE-LABEL: test_v4i32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %src, align 1, !nontemporal !1
  ret <4 x i32> %1
}

define <8 x i16> @test_v8i16_align1(<8 x i16>* %src) nounwind {
; SSE-LABEL: test_v8i16_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i16_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %src, align 1, !nontemporal !1
  ret <8 x i16> %1
}

define <16 x i8> @test_v16i8_align1(<16 x i8>* %src) nounwind {
; SSE-LABEL: test_v16i8_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i8_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8>* %src, align 1, !nontemporal !1
  ret <16 x i8> %1
}

; YMM versions.
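;
; With align 1 no nontemporal load is usable at any width: SSE splits each
; 32-byte load into two unaligned movups, while AVX can use a single
; unaligned vmovups %ymm.
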
define <4 x double> @test_v4f64_align1(<4 x double>* %src) nounwind {
; SSE-LABEL: test_v4f64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <4 x double>, <4 x double>* %src, align 1, !nontemporal !1
  ret <4 x double> %1
}

define <8 x float> @test_v8f32_align1(<8 x float>* %src) nounwind {
; SSE-LABEL: test_v8f32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <8 x float>, <8 x float>* %src, align 1, !nontemporal !1
  ret <8 x float> %1
}

define <4 x i64> @test_v4i64_align1(<4 x i64>* %src) nounwind {
; SSE-LABEL: test_v4i64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i64_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <4 x i64>, <4 x i64>* %src, align 1, !nontemporal !1
  ret <4 x i64> %1
}

define <8 x i32> @test_v8i32_align1(<8 x i32>* %src) nounwind {
; SSE-LABEL: test_v8i32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i32_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32>* %src, align 1, !nontemporal !1
  ret <8 x i32> %1
}

define <16 x i16> @test_v16i16_align1(<16 x i16>* %src) nounwind {
; SSE-LABEL: test_v16i16_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i16_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %src, align 1, !nontemporal !1
  ret <16 x i16> %1
}

define <32 x i8> @test_v32i8_align1(<32 x i8>* %src) nounwind {
; SSE-LABEL: test_v32i8_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v32i8_align1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %src, align 1, !nontemporal !1
  ret <32 x i8> %1
}

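; At align 16 each 16-byte chunk is aligned enough for MOVNTDQA, so SSE4.1
; uses it directly (SSE2 predates MOVNTDQA and keeps plain movaps). A
; nontemporal %ymm load would need 32-byte alignment, so the AVX codegen
; below loads the two halves with vmovntdqa, spills them to a 32-byte
; aligned stack slot, and reloads the pair as a single %ymm.
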
define <4 x double> @test_v4f64_align16(<4 x double>* %src) nounwind {
; SSE2-LABEL: test_v4f64_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4f64_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4f64_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <4 x double>, <4 x double>* %src, align 16, !nontemporal !1
  ret <4 x double> %1
}

define <8 x float> @test_v8f32_align16(<8 x float>* %src) nounwind {
; SSE2-LABEL: test_v8f32_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f32_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8f32_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <8 x float>, <8 x float>* %src, align 16, !nontemporal !1
  ret <8 x float> %1
}

define <4 x i64> @test_v4i64_align16(<4 x i64>* %src) nounwind {
; SSE2-LABEL: test_v4i64_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i64_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4i64_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <4 x i64>, <4 x i64>* %src, align 16, !nontemporal !1
  ret <4 x i64> %1
}

define <8 x i32> @test_v8i32_align16(<8 x i32>* %src) nounwind {
; SSE2-LABEL: test_v8i32_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i32_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8i32_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32>* %src, align 16, !nontemporal !1
  ret <8 x i32> %1
}

define <16 x i16> @test_v16i16_align16(<16 x i16>* %src) nounwind {
; SSE2-LABEL: test_v16i16_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i16_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16i16_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16>* %src, align 16, !nontemporal !1
  ret <16 x i16> %1
}

define <32 x i8> @test_v32i8_align16(<32 x i8>* %src) nounwind {
; SSE2-LABEL: test_v32i8_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i8_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v32i8_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    movq %rsp, %rbp
; AVX-NEXT:    andq $-32, %rsp
; AVX-NEXT:    subq $64, %rsp
; AVX-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX-NEXT:    vmovaps (%rsp), %ymm0
; AVX-NEXT:    movq %rbp, %rsp
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %src, align 16, !nontemporal !1
  ret <32 x i8> %1
}

; ZMM versions.
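;
; The 64-byte tests repeat the pattern at ZMM width. With align 1 everything
; lowers to plain unaligned loads at the widest available register: four
; movups on SSE, two vmovups %ymm on AVX1/AVX2, one vmovups %zmm on AVX512.
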
define <8 x double> @test_v8f64_align1(<8 x double>* %src) nounwind {
; SSE-LABEL: test_v8f64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v8f64_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8f64_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <8 x double>, <8 x double>* %src, align 1, !nontemporal !1
  ret <8 x double> %1
}

define <16 x float> @test_v16f32_align1(<16 x float>* %src) nounwind {
; SSE-LABEL: test_v16f32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16f32_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16f32_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <16 x float>, <16 x float>* %src, align 1, !nontemporal !1
  ret <16 x float> %1
}

define <8 x i64> @test_v8i64_align1(<8 x i64>* %src) nounwind {
; SSE-LABEL: test_v8i64_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v8i64_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <8 x i64>, <8 x i64>* %src, align 1, !nontemporal !1
  ret <8 x i64> %1
}

define <16 x i32> @test_v16i32_align1(<16 x i32>* %src) nounwind {
; SSE-LABEL: test_v16i32_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16i32_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i32_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <16 x i32>, <16 x i32>* %src, align 1, !nontemporal !1
  ret <16 x i32> %1
}

define <32 x i16> @test_v32i16_align1(<32 x i16>* %src) nounwind {
; SSE-LABEL: test_v32i16_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i16_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %src, align 1, !nontemporal !1
  ret <32 x i16> %1
}

define <64 x i8> @test_v64i8_align1(<64 x i8>* %src) nounwind {
; SSE-LABEL: test_v64i8_align1:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v64i8_align1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i8_align1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i8_align1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %src, align 1, !nontemporal !1
  ret <64 x i8> %1
}

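; With only 16-byte alignment the AVX targets assemble 64 bytes from four
; vmovntdqa %xmm chunks via an aligned stack slot. Note the AVX512DQ runs
; for v32i16/v64i8 below: those element types only become legal in 512-bit
; registers with AVX512BW, so without it the result is built as two %ymm
; halves joined by vinsertf64x4 instead of one stack-reloaded %zmm.
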
define <8 x double> @test_v8f64_align16(<8 x double>* %src) nounwind {
; SSE2-LABEL: test_v8f64_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f64_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8f64_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8f64_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <8 x double>, <8 x double>* %src, align 16, !nontemporal !1
  ret <8 x double> %1
}

define <16 x float> @test_v16f32_align16(<16 x float>* %src) nounwind {
; SSE2-LABEL: test_v16f32_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16f32_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16f32_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <16 x float>, <16 x float>* %src, align 16, !nontemporal !1
  ret <16 x float> %1
}

define <8 x i64> @test_v8i64_align16(<8 x i64>* %src) nounwind {
; SSE2-LABEL: test_v8i64_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i64_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8i64_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <8 x i64>, <8 x i64>* %src, align 16, !nontemporal !1
  ret <8 x i64> %1
}

define <16 x i32> @test_v16i32_align16(<16 x i32>* %src) nounwind {
; SSE2-LABEL: test_v16i32_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i32_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i32_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i32_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <16 x i32>, <16 x i32>* %src, align 16, !nontemporal !1
  ret <16 x i32> %1
}

define <32 x i16> @test_v32i16_align16(<32 x i16>* %src) nounwind {
; SSE2-LABEL: test_v32i16_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i16_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v32i16_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512DQ-LABEL: test_v32i16_align16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    pushq %rbp
; AVX512DQ-NEXT:    movq %rsp, %rbp
; AVX512DQ-NEXT:    andq $-32, %rsp
; AVX512DQ-NEXT:    subq $96, %rsp
; AVX512DQ-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512DQ-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT:    vmovaps (%rsp), %ymm0
; AVX512DQ-NEXT:    vinsertf64x4 $1, {{[0-9]+}}(%rsp), %zmm0, %zmm0
; AVX512DQ-NEXT:    movq %rbp, %rsp
; AVX512DQ-NEXT:    popq %rbp
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: test_v32i16_align16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    pushq %rbp
; AVX512BW-NEXT:    movq %rsp, %rbp
; AVX512BW-NEXT:    andq $-64, %rsp
; AVX512BW-NEXT:    subq $128, %rsp
; AVX512BW-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512BW-NEXT:    vmovaps (%rsp), %zmm0
; AVX512BW-NEXT:    movq %rbp, %rsp
; AVX512BW-NEXT:    popq %rbp
; AVX512BW-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %src, align 16, !nontemporal !1
  ret <32 x i16> %1
}

define <64 x i8> @test_v64i8_align16(<64 x i8>* %src) nounwind {
; SSE2-LABEL: test_v64i8_align16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v64i8_align16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v64i8_align16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $96, %rsp
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovaps (%rsp), %ymm0
; AVX1-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i8_align16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $96, %rsp
; AVX2-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX2-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovaps (%rsp), %ymm0
; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
;
; AVX512DQ-LABEL: test_v64i8_align16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    pushq %rbp
; AVX512DQ-NEXT:    movq %rsp, %rbp
; AVX512DQ-NEXT:    andq $-32, %rsp
; AVX512DQ-NEXT:    subq $96, %rsp
; AVX512DQ-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512DQ-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT:    vmovaps (%rsp), %ymm0
; AVX512DQ-NEXT:    vinsertf64x4 $1, {{[0-9]+}}(%rsp), %zmm0, %zmm0
; AVX512DQ-NEXT:    movq %rbp, %rsp
; AVX512DQ-NEXT:    popq %rbp
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: test_v64i8_align16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    pushq %rbp
; AVX512BW-NEXT:    movq %rsp, %rbp
; AVX512BW-NEXT:    andq $-64, %rsp
; AVX512BW-NEXT:    subq $128, %rsp
; AVX512BW-NEXT:    vmovntdqa 48(%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT:    vmovntdqa 32(%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT:    vmovntdqa 16(%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsp)
; AVX512BW-NEXT:    vmovaps (%rsp), %zmm0
; AVX512BW-NEXT:    movq %rbp, %rsp
; AVX512BW-NEXT:    popq %rbp
; AVX512BW-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %src, align 16, !nontemporal !1
  ret <64 x i8> %1
}

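; At align 32 a nontemporal %ymm load is finally possible, but only on
; AVX2: VMOVNTDQA %ymm does not exist on AVX1, which instead concatenates
; two %xmm chunks with vinsertf128. AVX512 still lacks the 64-byte
; alignment for VMOVNTDQA %zmm, so it round-trips two %ymm chunks through
; the stack (again except AVX512DQ for i16/i8, which uses vinserti64x4).
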
define <8 x double> @test_v8f64_align32(<8 x double>* %src) nounwind {
; SSE2-LABEL: test_v8f64_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8f64_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8f64_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8f64_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <8 x double>, <8 x double>* %src, align 32, !nontemporal !1
  ret <8 x double> %1
}

define <16 x float> @test_v16f32_align32(<16 x float>* %src) nounwind {
; SSE2-LABEL: test_v16f32_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16f32_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16f32_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16f32_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <16 x float>, <16 x float>* %src, align 32, !nontemporal !1
  ret <16 x float> %1
}

define <8 x i64> @test_v8i64_align32(<8 x i64>* %src) nounwind {
; SSE2-LABEL: test_v8i64_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i64_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8i64_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <8 x i64>, <8 x i64>* %src, align 32, !nontemporal !1
  ret <8 x i64> %1
}

define <16 x i32> @test_v16i32_align32(<16 x i32>* %src) nounwind {
; SSE2-LABEL: test_v16i32_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i32_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i32_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i32_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $128, %rsp
; AVX512-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %zmm0
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
  %1 = load <16 x i32>, <16 x i32>* %src, align 32, !nontemporal !1
  ret <16 x i32> %1
}

define <32 x i16> @test_v32i16_align32(<32 x i16>* %src) nounwind {
; SSE2-LABEL: test_v32i16_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i16_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v32i16_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512DQ-LABEL: test_v32i16_align32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: test_v32i16_align32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    pushq %rbp
; AVX512BW-NEXT:    movq %rsp, %rbp
; AVX512BW-NEXT:    andq $-64, %rsp
; AVX512BW-NEXT:    subq $128, %rsp
; AVX512BW-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512BW-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512BW-NEXT:    vmovaps (%rsp), %zmm0
; AVX512BW-NEXT:    movq %rbp, %rsp
; AVX512BW-NEXT:    popq %rbp
; AVX512BW-NEXT:    retq
  %1 = load <32 x i16>, <32 x i16>* %src, align 32, !nontemporal !1
  ret <32 x i16> %1
}

define <64 x i8> @test_v64i8_align32(<64 x i8>* %src) nounwind {
; SSE2-LABEL: test_v64i8_align32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps 16(%rdi), %xmm1
; SSE2-NEXT:    movaps 32(%rdi), %xmm2
; SSE2-NEXT:    movaps 48(%rdi), %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v64i8_align32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movntdqa (%rdi), %xmm0
; SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; SSE41-NEXT:    movntdqa 32(%rdi), %xmm2
; SSE41-NEXT:    movntdqa 48(%rdi), %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v64i8_align32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovntdqa 32(%rdi), %xmm1
; AVX1-NEXT:    vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i8_align32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT:    retq
;
; AVX512DQ-LABEL: test_v64i8_align32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512DQ-NEXT:    vmovntdqa 32(%rdi), %ymm1
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: test_v64i8_align32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    pushq %rbp
; AVX512BW-NEXT:    movq %rsp, %rbp
; AVX512BW-NEXT:    andq $-64, %rsp
; AVX512BW-NEXT:    subq $128, %rsp
; AVX512BW-NEXT:    vmovntdqa 32(%rdi), %ymm0
; AVX512BW-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT:    vmovntdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsp)
; AVX512BW-NEXT:    vmovaps (%rsp), %zmm0
; AVX512BW-NEXT:    movq %rbp, %rsp
; AVX512BW-NEXT:    popq %rbp
; AVX512BW-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %src, align 32, !nontemporal !1
  ret <64 x i8> %1
}

!1 = !{i32 1}