1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=SSE --check-prefix=SSE4A
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=VLX
9 ; Make sure that we generate non-temporal stores for the test cases below.
10 ; We use xorps for zeroing, so domain information isn't available anymore.
12 ; Scalar versions (zeroing means we can do this even for fp types).
; Non-temporal store of a float zero through an align-1 pointer. Every checked
; configuration is expected to zero a GPR with xorl and store it with movntil.
14 define void @test_zero_f32(float* %dst) {
15 ; SSE-LABEL: test_zero_f32:
17 ; SSE-NEXT: xorl %eax, %eax
18 ; SSE-NEXT: movntil %eax, (%rdi)
21 ; AVX-LABEL: test_zero_f32:
23 ; AVX-NEXT: xorl %eax, %eax
24 ; AVX-NEXT: movntil %eax, (%rdi)
27 ; VLX-LABEL: test_zero_f32:
29 ; VLX-NEXT: xorl %eax, %eax
30 ; VLX-NEXT: movntil %eax, (%rdi)
32 store float zeroinitializer, float* %dst, align 1, !nontemporal !1
; Non-temporal store of an i32 zero through an align-1 pointer. The checks are
; the same xorl + movntil pair for every configuration.
36 define void @test_zero_i32(i32* %dst) {
37 ; SSE-LABEL: test_zero_i32:
39 ; SSE-NEXT: xorl %eax, %eax
40 ; SSE-NEXT: movntil %eax, (%rdi)
43 ; AVX-LABEL: test_zero_i32:
45 ; AVX-NEXT: xorl %eax, %eax
46 ; AVX-NEXT: movntil %eax, (%rdi)
49 ; VLX-LABEL: test_zero_i32:
51 ; VLX-NEXT: xorl %eax, %eax
52 ; VLX-NEXT: movntil %eax, (%rdi)
54 store i32 zeroinitializer, i32* %dst, align 1, !nontemporal !1
; Non-temporal store of a double zero through an align-1 pointer. Every
; configuration is expected to zero %eax with xorl and store %rax with movntiq.
58 define void @test_zero_f64(double* %dst) {
59 ; SSE-LABEL: test_zero_f64:
61 ; SSE-NEXT: xorl %eax, %eax
62 ; SSE-NEXT: movntiq %rax, (%rdi)
65 ; AVX-LABEL: test_zero_f64:
67 ; AVX-NEXT: xorl %eax, %eax
68 ; AVX-NEXT: movntiq %rax, (%rdi)
71 ; VLX-LABEL: test_zero_f64:
73 ; VLX-NEXT: xorl %eax, %eax
74 ; VLX-NEXT: movntiq %rax, (%rdi)
76 store double zeroinitializer, double* %dst, align 1, !nontemporal !1
; Non-temporal store of an i64 zero through an align-1 pointer. The checks are
; the same xorl + movntiq pair for every configuration.
80 define void @test_zero_i64(i64* %dst) {
81 ; SSE-LABEL: test_zero_i64:
83 ; SSE-NEXT: xorl %eax, %eax
84 ; SSE-NEXT: movntiq %rax, (%rdi)
87 ; AVX-LABEL: test_zero_i64:
89 ; AVX-NEXT: xorl %eax, %eax
90 ; AVX-NEXT: movntiq %rax, (%rdi)
93 ; VLX-LABEL: test_zero_i64:
95 ; VLX-NEXT: xorl %eax, %eax
96 ; VLX-NEXT: movntiq %rax, (%rdi)
98 store i64 zeroinitializer, i64* %dst, align 1, !nontemporal !1
102 ; And now XMM versions.
; Non-temporal store of a zero <4 x float> with 16-byte alignment. The checks
; expect xorps + movntps for the SSE runs and vxorps + vmovntps for the
; AVX and AVX512VL runs.
104 define void @test_zero_v4f32(<4 x float>* %dst) {
105 ; SSE-LABEL: test_zero_v4f32:
107 ; SSE-NEXT: xorps %xmm0, %xmm0
108 ; SSE-NEXT: movntps %xmm0, (%rdi)
111 ; AVX-LABEL: test_zero_v4f32:
113 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
114 ; AVX-NEXT: vmovntps %xmm0, (%rdi)
117 ; VLX-LABEL: test_zero_v4f32:
119 ; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
120 ; VLX-NEXT: vmovntps %xmm0, (%rdi)
122 store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
; Non-temporal store of a zero <4 x i32> with 16-byte alignment. Although the
; element type is integer, the checks expect the fp-domain xorps + movntps
; (vxorps + vmovntps for the AVX and AVX512VL runs).
; The body performs exactly one store, matching the single movntps/vmovntps
; each configuration checks for.
126 define void @test_zero_v4i32(<4 x i32>* %dst) {
127 ; SSE-LABEL: test_zero_v4i32:
129 ; SSE-NEXT: xorps %xmm0, %xmm0
130 ; SSE-NEXT: movntps %xmm0, (%rdi)
133 ; AVX-LABEL: test_zero_v4i32:
135 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
136 ; AVX-NEXT: vmovntps %xmm0, (%rdi)
139 ; VLX-LABEL: test_zero_v4i32:
141 ; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
142 ; VLX-NEXT: vmovntps %xmm0, (%rdi)
144 store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
; Non-temporal store of a zero <2 x double> with 16-byte alignment. The checks
; expect the single-precision forms xorps + movntps (vxorps + vmovntps for the
; AVX and AVX512VL runs) rather than the pd variants.
149 define void @test_zero_v2f64(<2 x double>* %dst) {
150 ; SSE-LABEL: test_zero_v2f64:
152 ; SSE-NEXT: xorps %xmm0, %xmm0
153 ; SSE-NEXT: movntps %xmm0, (%rdi)
156 ; AVX-LABEL: test_zero_v2f64:
158 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
159 ; AVX-NEXT: vmovntps %xmm0, (%rdi)
162 ; VLX-LABEL: test_zero_v2f64:
164 ; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
165 ; VLX-NEXT: vmovntps %xmm0, (%rdi)
167 store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
; Non-temporal store of a zero <2 x i64> with 16-byte alignment. The checks
; expect xorps + movntps (vxorps + vmovntps for the AVX and AVX512VL runs).
171 define void @test_zero_v2i64(<2 x i64>* %dst) {
172 ; SSE-LABEL: test_zero_v2i64:
174 ; SSE-NEXT: xorps %xmm0, %xmm0
175 ; SSE-NEXT: movntps %xmm0, (%rdi)
178 ; AVX-LABEL: test_zero_v2i64:
180 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
181 ; AVX-NEXT: vmovntps %xmm0, (%rdi)
184 ; VLX-LABEL: test_zero_v2i64:
186 ; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
187 ; VLX-NEXT: vmovntps %xmm0, (%rdi)
189 store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 16, !nontemporal !1
; Non-temporal store of a zero <8 x i16> with 16-byte alignment. The checks
; expect xorps + movntps (vxorps + vmovntps for the AVX and AVX512VL runs).
193 define void @test_zero_v8i16(<8 x i16>* %dst) {
194 ; SSE-LABEL: test_zero_v8i16:
196 ; SSE-NEXT: xorps %xmm0, %xmm0
197 ; SSE-NEXT: movntps %xmm0, (%rdi)
200 ; AVX-LABEL: test_zero_v8i16:
202 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
203 ; AVX-NEXT: vmovntps %xmm0, (%rdi)
206 ; VLX-LABEL: test_zero_v8i16:
208 ; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
209 ; VLX-NEXT: vmovntps %xmm0, (%rdi)
211 store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 16, !nontemporal !1
; Non-temporal store of a zero <16 x i8> with 16-byte alignment. The checks
; expect xorps + movntps (vxorps + vmovntps for the AVX and AVX512VL runs).
215 define void @test_zero_v16i8(<16 x i8>* %dst) {
216 ; SSE-LABEL: test_zero_v16i8:
218 ; SSE-NEXT: xorps %xmm0, %xmm0
219 ; SSE-NEXT: movntps %xmm0, (%rdi)
222 ; AVX-LABEL: test_zero_v16i8:
224 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
225 ; AVX-NEXT: vmovntps %xmm0, (%rdi)
228 ; VLX-LABEL: test_zero_v16i8:
230 ; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
231 ; VLX-NEXT: vmovntps %xmm0, (%rdi)
233 store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 16, !nontemporal !1
237 ; And now YMM versions.
; Non-temporal store of a zero <8 x float> with 32-byte alignment. The SSE runs
; are checked for one xorps feeding two 16-byte movntps stores (offset 16, then
; offset 0). The AVX and AVX512VL runs are checked for a single 32-byte vmovntps
; from %ymm0 followed by vzeroupper.
239 define void @test_zero_v8f32(<8 x float>* %dst) {
240 ; SSE-LABEL: test_zero_v8f32:
242 ; SSE-NEXT: xorps %xmm0, %xmm0
243 ; SSE-NEXT: movntps %xmm0, 16(%rdi)
244 ; SSE-NEXT: movntps %xmm0, (%rdi)
247 ; AVX-LABEL: test_zero_v8f32:
249 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
250 ; AVX-NEXT: vmovntps %ymm0, (%rdi)
251 ; AVX-NEXT: vzeroupper
254 ; VLX-LABEL: test_zero_v8f32:
256 ; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
257 ; VLX-NEXT: vmovntps %ymm0, (%rdi)
258 ; VLX-NEXT: vzeroupper
260 store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
; Non-temporal store of a zero <8 x i32> with 32-byte alignment. The SSE runs
; are checked for one xorps and two movntps stores; the AVX and AVX512VL runs
; for vxorps, one vmovntps from %ymm0, and vzeroupper.
264 define void @test_zero_v8i32(<8 x i32>* %dst) {
265 ; SSE-LABEL: test_zero_v8i32:
267 ; SSE-NEXT: xorps %xmm0, %xmm0
268 ; SSE-NEXT: movntps %xmm0, 16(%rdi)
269 ; SSE-NEXT: movntps %xmm0, (%rdi)
272 ; AVX-LABEL: test_zero_v8i32:
274 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
275 ; AVX-NEXT: vmovntps %ymm0, (%rdi)
276 ; AVX-NEXT: vzeroupper
279 ; VLX-LABEL: test_zero_v8i32:
281 ; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
282 ; VLX-NEXT: vmovntps %ymm0, (%rdi)
283 ; VLX-NEXT: vzeroupper
285 store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1
; Non-temporal store of a zero <4 x double> with 32-byte alignment. The SSE
; runs are checked for one xorps and two movntps stores; the AVX and AVX512VL
; runs for vxorps, one vmovntps from %ymm0, and vzeroupper.
289 define void @test_zero_v4f64(<4 x double>* %dst) {
290 ; SSE-LABEL: test_zero_v4f64:
292 ; SSE-NEXT: xorps %xmm0, %xmm0
293 ; SSE-NEXT: movntps %xmm0, 16(%rdi)
294 ; SSE-NEXT: movntps %xmm0, (%rdi)
297 ; AVX-LABEL: test_zero_v4f64:
299 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
300 ; AVX-NEXT: vmovntps %ymm0, (%rdi)
301 ; AVX-NEXT: vzeroupper
304 ; VLX-LABEL: test_zero_v4f64:
306 ; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
307 ; VLX-NEXT: vmovntps %ymm0, (%rdi)
308 ; VLX-NEXT: vzeroupper
310 store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1
; Non-temporal store of a zero <4 x i64> with 32-byte alignment. The SSE runs
; are checked for one xorps and two movntps stores; the AVX and AVX512VL runs
; for vxorps, one vmovntps from %ymm0, and vzeroupper.
314 define void @test_zero_v4i64(<4 x i64>* %dst) {
315 ; SSE-LABEL: test_zero_v4i64:
317 ; SSE-NEXT: xorps %xmm0, %xmm0
318 ; SSE-NEXT: movntps %xmm0, 16(%rdi)
319 ; SSE-NEXT: movntps %xmm0, (%rdi)
322 ; AVX-LABEL: test_zero_v4i64:
324 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
325 ; AVX-NEXT: vmovntps %ymm0, (%rdi)
326 ; AVX-NEXT: vzeroupper
329 ; VLX-LABEL: test_zero_v4i64:
331 ; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
332 ; VLX-NEXT: vmovntps %ymm0, (%rdi)
333 ; VLX-NEXT: vzeroupper
335 store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1
; Non-temporal store of a zero <16 x i16> with 32-byte alignment. The SSE runs
; are checked for one xorps and two movntps stores; the AVX and AVX512VL runs
; for vxorps, one vmovntps from %ymm0, and vzeroupper.
339 define void @test_zero_v16i16(<16 x i16>* %dst) {
340 ; SSE-LABEL: test_zero_v16i16:
342 ; SSE-NEXT: xorps %xmm0, %xmm0
343 ; SSE-NEXT: movntps %xmm0, 16(%rdi)
344 ; SSE-NEXT: movntps %xmm0, (%rdi)
347 ; AVX-LABEL: test_zero_v16i16:
349 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
350 ; AVX-NEXT: vmovntps %ymm0, (%rdi)
351 ; AVX-NEXT: vzeroupper
354 ; VLX-LABEL: test_zero_v16i16:
356 ; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
357 ; VLX-NEXT: vmovntps %ymm0, (%rdi)
358 ; VLX-NEXT: vzeroupper
360 store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1
; Non-temporal store of a zero <32 x i8> with 32-byte alignment. The SSE runs
; are checked for one xorps and two movntps stores; the AVX and AVX512VL runs
; for vxorps, one vmovntps from %ymm0, and vzeroupper.
364 define void @test_zero_v32i8(<32 x i8>* %dst) {
365 ; SSE-LABEL: test_zero_v32i8:
367 ; SSE-NEXT: xorps %xmm0, %xmm0
368 ; SSE-NEXT: movntps %xmm0, 16(%rdi)
369 ; SSE-NEXT: movntps %xmm0, (%rdi)
372 ; AVX-LABEL: test_zero_v32i8:
374 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
375 ; AVX-NEXT: vmovntps %ymm0, (%rdi)
376 ; AVX-NEXT: vzeroupper
379 ; VLX-LABEL: test_zero_v32i8:
381 ; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
382 ; VLX-NEXT: vmovntps %ymm0, (%rdi)
383 ; VLX-NEXT: vzeroupper
385 store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1
390 ; Check that we also handle arguments. Here the type survives longer.
; Non-temporal store of a float argument through an align-1 pointer. Only the
; SSE4A run is checked for a non-temporal instruction (movntss); the SSE2,
; SSE4.1, AVX and AVX512VL runs are checked for a plain movss/vmovss.
394 define void @test_arg_f32(float %arg, float* %dst) {
395 ; SSE2-LABEL: test_arg_f32:
397 ; SSE2-NEXT: movss %xmm0, (%rdi)
400 ; SSE4A-LABEL: test_arg_f32:
402 ; SSE4A-NEXT: movntss %xmm0, (%rdi)
405 ; SSE41-LABEL: test_arg_f32:
407 ; SSE41-NEXT: movss %xmm0, (%rdi)
410 ; AVX-LABEL: test_arg_f32:
412 ; AVX-NEXT: vmovss %xmm0, (%rdi)
415 ; VLX-LABEL: test_arg_f32:
417 ; VLX-NEXT: vmovss %xmm0, (%rdi)
419 store float %arg, float* %dst, align 1, !nontemporal !1
; Non-temporal store of an i32 argument through an align-1 pointer. Every
; configuration is checked for a direct movntil from %edi to (%rsi).
423 define void @test_arg_i32(i32 %arg, i32* %dst) {
424 ; SSE-LABEL: test_arg_i32:
426 ; SSE-NEXT: movntil %edi, (%rsi)
429 ; AVX-LABEL: test_arg_i32:
431 ; AVX-NEXT: movntil %edi, (%rsi)
434 ; VLX-LABEL: test_arg_i32:
436 ; VLX-NEXT: movntil %edi, (%rsi)
438 store i32 %arg, i32* %dst, align 1, !nontemporal !1
; Non-temporal store of a double argument through an align-1 pointer. Only the
; SSE4A run is checked for a non-temporal instruction (movntsd); the SSE2,
; SSE4.1, AVX and AVX512VL runs are checked for a plain movsd/vmovsd.
442 define void @test_arg_f64(double %arg, double* %dst) {
443 ; SSE2-LABEL: test_arg_f64:
445 ; SSE2-NEXT: movsd %xmm0, (%rdi)
448 ; SSE4A-LABEL: test_arg_f64:
450 ; SSE4A-NEXT: movntsd %xmm0, (%rdi)
453 ; SSE41-LABEL: test_arg_f64:
455 ; SSE41-NEXT: movsd %xmm0, (%rdi)
458 ; AVX-LABEL: test_arg_f64:
460 ; AVX-NEXT: vmovsd %xmm0, (%rdi)
463 ; VLX-LABEL: test_arg_f64:
465 ; VLX-NEXT: vmovsd %xmm0, (%rdi)
467 store double %arg, double* %dst, align 1, !nontemporal !1
; Non-temporal store of an i64 argument through an align-1 pointer. Every
; configuration is checked for a direct movntiq from %rdi to (%rsi).
471 define void @test_arg_i64(i64 %arg, i64* %dst) {
472 ; SSE-LABEL: test_arg_i64:
474 ; SSE-NEXT: movntiq %rdi, (%rsi)
477 ; AVX-LABEL: test_arg_i64:
479 ; AVX-NEXT: movntiq %rdi, (%rsi)
482 ; VLX-LABEL: test_arg_i64:
484 ; VLX-NEXT: movntiq %rdi, (%rsi)
486 store i64 %arg, i64* %dst, align 1, !nontemporal !1
; Non-temporal store of element 1 extracted from a <4 x float> argument.
; Checked lowering differs per run:
;   - SSE2 shuffles with shufps and uses a plain movss store;
;   - SSE4A shuffles with movshdup and stores with movntss;
;   - SSE4.1, AVX and AVX512VL move the lane to %eax with (v)extractps and
;     store it with movntil.
492 define void @test_extract_f32(<4 x float> %arg, float* %dst) {
493 ; SSE2-LABEL: test_extract_f32:
495 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
496 ; SSE2-NEXT: movss %xmm0, (%rdi)
499 ; SSE4A-LABEL: test_extract_f32:
501 ; SSE4A-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
502 ; SSE4A-NEXT: movntss %xmm0, (%rdi)
505 ; SSE41-LABEL: test_extract_f32:
507 ; SSE41-NEXT: extractps $1, %xmm0, %eax
508 ; SSE41-NEXT: movntil %eax, (%rdi)
511 ; AVX-LABEL: test_extract_f32:
513 ; AVX-NEXT: vextractps $1, %xmm0, %eax
514 ; AVX-NEXT: movntil %eax, (%rdi)
517 ; VLX-LABEL: test_extract_f32:
519 ; VLX-NEXT: vextractps $1, %xmm0, %eax
520 ; VLX-NEXT: movntil %eax, (%rdi)
522 %1 = extractelement <4 x float> %arg, i32 1
523 store float %1, float* %dst, align 1, !nontemporal !1
; Non-temporal store of element 1 extracted from a <4 x i32> argument. All runs
; end in movntil from %eax. SSE2 and SSE4A reach %eax via pshufd + movd, while
; SSE4.1, AVX and AVX512VL use a single (v)extractps.
527 define void @test_extract_i32(<4 x i32> %arg, i32* %dst) {
528 ; SSE2-LABEL: test_extract_i32:
530 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
531 ; SSE2-NEXT: movd %xmm0, %eax
532 ; SSE2-NEXT: movntil %eax, (%rdi)
535 ; SSE4A-LABEL: test_extract_i32:
537 ; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
538 ; SSE4A-NEXT: movd %xmm0, %eax
539 ; SSE4A-NEXT: movntil %eax, (%rdi)
542 ; SSE41-LABEL: test_extract_i32:
544 ; SSE41-NEXT: extractps $1, %xmm0, %eax
545 ; SSE41-NEXT: movntil %eax, (%rdi)
548 ; AVX-LABEL: test_extract_i32:
550 ; AVX-NEXT: vextractps $1, %xmm0, %eax
551 ; AVX-NEXT: movntil %eax, (%rdi)
554 ; VLX-LABEL: test_extract_i32:
556 ; VLX-NEXT: vextractps $1, %xmm0, %eax
557 ; VLX-NEXT: movntil %eax, (%rdi)
559 %1 = extractelement <4 x i32> %arg, i32 1
560 store i32 %1, i32* %dst, align 1, !nontemporal !1
; Non-temporal store of element 1 extracted from a <2 x double> argument. Only
; the SSE4A run is checked for a non-temporal instruction (movhlps + movntsd);
; the other runs are checked for a single movhpd/vmovhpd store to memory.
564 define void @test_extract_f64(<2 x double> %arg, double* %dst) {
565 ; SSE2-LABEL: test_extract_f64:
567 ; SSE2-NEXT: movhpd %xmm0, (%rdi)
570 ; SSE4A-LABEL: test_extract_f64:
572 ; SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
573 ; SSE4A-NEXT: movntsd %xmm0, (%rdi)
576 ; SSE41-LABEL: test_extract_f64:
578 ; SSE41-NEXT: movhpd %xmm0, (%rdi)
581 ; AVX-LABEL: test_extract_f64:
583 ; AVX-NEXT: vmovhpd %xmm0, (%rdi)
586 ; VLX-LABEL: test_extract_f64:
588 ; VLX-NEXT: vmovhpd %xmm0, (%rdi)
590 %1 = extractelement <2 x double> %arg, i32 1
591 store double %1, double* %dst, align 1, !nontemporal !1
; Non-temporal store of element 1 extracted from a <2 x i64> argument. All runs
; end in movntiq from %rax. SSE2 and SSE4A reach %rax via pshufd + movq, while
; SSE4.1, AVX and AVX512VL use a single (v)pextrq.
595 define void @test_extract_i64(<2 x i64> %arg, i64* %dst) {
596 ; SSE2-LABEL: test_extract_i64:
598 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
599 ; SSE2-NEXT: movq %xmm0, %rax
600 ; SSE2-NEXT: movntiq %rax, (%rdi)
603 ; SSE4A-LABEL: test_extract_i64:
605 ; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
606 ; SSE4A-NEXT: movq %xmm0, %rax
607 ; SSE4A-NEXT: movntiq %rax, (%rdi)
610 ; SSE41-LABEL: test_extract_i64:
612 ; SSE41-NEXT: pextrq $1, %xmm0, %rax
613 ; SSE41-NEXT: movntiq %rax, (%rdi)
616 ; AVX-LABEL: test_extract_i64:
618 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
619 ; AVX-NEXT: movntiq %rax, (%rdi)
622 ; VLX-LABEL: test_extract_i64:
624 ; VLX-NEXT: vpextrq $1, %xmm0, %rax
625 ; VLX-NEXT: movntiq %rax, (%rdi)
627 %1 = extractelement <2 x i64> %arg, i32 1
628 store i64 %1, i64* %dst, align 1, !nontemporal !1
632 ; And now XMM versions.
; Non-temporal store of a <4 x float> argument with 16-byte alignment. The
; checks expect a single movntps (vmovntps for the AVX and AVX512VL runs).
634 define void @test_arg_v4f32(<4 x float> %arg, <4 x float>* %dst) {
635 ; SSE-LABEL: test_arg_v4f32:
637 ; SSE-NEXT: movntps %xmm0, (%rdi)
640 ; AVX-LABEL: test_arg_v4f32:
642 ; AVX-NEXT: vmovntps %xmm0, (%rdi)
645 ; VLX-LABEL: test_arg_v4f32:
647 ; VLX-NEXT: vmovntps %xmm0, (%rdi)
649 store <4 x float> %arg, <4 x float>* %dst, align 16, !nontemporal !1
; Non-temporal store of a <4 x i32> argument with 16-byte alignment. Despite
; the integer element type, the checks expect movntps (vmovntps for the AVX
; and AVX512VL runs).
653 define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
654 ; SSE-LABEL: test_arg_v4i32:
656 ; SSE-NEXT: movntps %xmm0, (%rdi)
659 ; AVX-LABEL: test_arg_v4i32:
661 ; AVX-NEXT: vmovntps %xmm0, (%rdi)
664 ; VLX-LABEL: test_arg_v4i32:
666 ; VLX-NEXT: vmovntps %xmm0, (%rdi)
668 store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1
; Non-temporal store of a <2 x double> argument with 16-byte alignment. The
; checks expect movntps (vmovntps for the AVX and AVX512VL runs), not the pd
; variant.
672 define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
673 ; SSE-LABEL: test_arg_v2f64:
675 ; SSE-NEXT: movntps %xmm0, (%rdi)
678 ; AVX-LABEL: test_arg_v2f64:
680 ; AVX-NEXT: vmovntps %xmm0, (%rdi)
683 ; VLX-LABEL: test_arg_v2f64:
685 ; VLX-NEXT: vmovntps %xmm0, (%rdi)
687 store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1
; Non-temporal store of a <2 x i64> argument with 16-byte alignment. The checks
; expect movntps (vmovntps for the AVX and AVX512VL runs).
691 define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
692 ; SSE-LABEL: test_arg_v2i64:
694 ; SSE-NEXT: movntps %xmm0, (%rdi)
697 ; AVX-LABEL: test_arg_v2i64:
699 ; AVX-NEXT: vmovntps %xmm0, (%rdi)
702 ; VLX-LABEL: test_arg_v2i64:
704 ; VLX-NEXT: vmovntps %xmm0, (%rdi)
706 store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1
; Non-temporal store of a <8 x i16> argument with 16-byte alignment. The checks
; expect movntps (vmovntps for the AVX and AVX512VL runs).
710 define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
711 ; SSE-LABEL: test_arg_v8i16:
713 ; SSE-NEXT: movntps %xmm0, (%rdi)
716 ; AVX-LABEL: test_arg_v8i16:
718 ; AVX-NEXT: vmovntps %xmm0, (%rdi)
721 ; VLX-LABEL: test_arg_v8i16:
723 ; VLX-NEXT: vmovntps %xmm0, (%rdi)
725 store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1
; Non-temporal store of a <16 x i8> argument with 16-byte alignment. The checks
; expect movntps (vmovntps for the AVX and AVX512VL runs).
729 define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
730 ; SSE-LABEL: test_arg_v16i8:
732 ; SSE-NEXT: movntps %xmm0, (%rdi)
735 ; AVX-LABEL: test_arg_v16i8:
737 ; AVX-NEXT: vmovntps %xmm0, (%rdi)
740 ; VLX-LABEL: test_arg_v16i8:
742 ; VLX-NEXT: vmovntps %xmm0, (%rdi)
744 store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1
748 ; And now YMM versions.
; Non-temporal store of a <8 x float> argument with 32-byte alignment. The SSE
; runs are checked for two movntps stores (%xmm1 to offset 16, %xmm0 to offset
; 0); the AVX and AVX512VL runs for one vmovntps from %ymm0 plus vzeroupper.
750 define void @test_arg_v8f32(<8 x float> %arg, <8 x float>* %dst) {
751 ; SSE-LABEL: test_arg_v8f32:
753 ; SSE-NEXT: movntps %xmm1, 16(%rdi)
754 ; SSE-NEXT: movntps %xmm0, (%rdi)
757 ; AVX-LABEL: test_arg_v8f32:
759 ; AVX-NEXT: vmovntps %ymm0, (%rdi)
760 ; AVX-NEXT: vzeroupper
763 ; VLX-LABEL: test_arg_v8f32:
765 ; VLX-NEXT: vmovntps %ymm0, (%rdi)
766 ; VLX-NEXT: vzeroupper
768 store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1
; Non-temporal store of a <8 x i32> argument with 32-byte alignment. The SSE
; runs are checked for two movntps stores; the AVX and AVX512VL runs for one
; vmovntps from %ymm0 plus vzeroupper.
772 define void @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %dst) {
773 ; SSE-LABEL: test_arg_v8i32:
775 ; SSE-NEXT: movntps %xmm1, 16(%rdi)
776 ; SSE-NEXT: movntps %xmm0, (%rdi)
779 ; AVX-LABEL: test_arg_v8i32:
781 ; AVX-NEXT: vmovntps %ymm0, (%rdi)
782 ; AVX-NEXT: vzeroupper
785 ; VLX-LABEL: test_arg_v8i32:
787 ; VLX-NEXT: vmovntps %ymm0, (%rdi)
788 ; VLX-NEXT: vzeroupper
790 store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1
; Non-temporal store of a <4 x double> argument with 32-byte alignment. The SSE
; runs are checked for two movntps stores; the AVX and AVX512VL runs for one
; vmovntps from %ymm0 plus vzeroupper.
794 define void @test_arg_v4f64(<4 x double> %arg, <4 x double>* %dst) {
795 ; SSE-LABEL: test_arg_v4f64:
797 ; SSE-NEXT: movntps %xmm1, 16(%rdi)
798 ; SSE-NEXT: movntps %xmm0, (%rdi)
801 ; AVX-LABEL: test_arg_v4f64:
803 ; AVX-NEXT: vmovntps %ymm0, (%rdi)
804 ; AVX-NEXT: vzeroupper
807 ; VLX-LABEL: test_arg_v4f64:
809 ; VLX-NEXT: vmovntps %ymm0, (%rdi)
810 ; VLX-NEXT: vzeroupper
812 store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1
; Non-temporal store of a <4 x i64> argument with 32-byte alignment. The SSE
; runs are checked for two movntps stores; the AVX and AVX512VL runs for one
; vmovntps from %ymm0 plus vzeroupper.
816 define void @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %dst) {
817 ; SSE-LABEL: test_arg_v4i64:
819 ; SSE-NEXT: movntps %xmm1, 16(%rdi)
820 ; SSE-NEXT: movntps %xmm0, (%rdi)
823 ; AVX-LABEL: test_arg_v4i64:
825 ; AVX-NEXT: vmovntps %ymm0, (%rdi)
826 ; AVX-NEXT: vzeroupper
829 ; VLX-LABEL: test_arg_v4i64:
831 ; VLX-NEXT: vmovntps %ymm0, (%rdi)
832 ; VLX-NEXT: vzeroupper
834 store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1
; Non-temporal store of a <16 x i16> argument with 32-byte alignment. The SSE
; runs are checked for two movntps stores; the AVX and AVX512VL runs for one
; vmovntps from %ymm0 plus vzeroupper.
838 define void @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %dst) {
839 ; SSE-LABEL: test_arg_v16i16:
841 ; SSE-NEXT: movntps %xmm1, 16(%rdi)
842 ; SSE-NEXT: movntps %xmm0, (%rdi)
845 ; AVX-LABEL: test_arg_v16i16:
847 ; AVX-NEXT: vmovntps %ymm0, (%rdi)
848 ; AVX-NEXT: vzeroupper
851 ; VLX-LABEL: test_arg_v16i16:
853 ; VLX-NEXT: vmovntps %ymm0, (%rdi)
854 ; VLX-NEXT: vzeroupper
856 store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1
; Non-temporal store of a <32 x i8> argument with 32-byte alignment. The SSE
; runs are checked for two movntps stores; the AVX and AVX512VL runs for one
; vmovntps from %ymm0 plus vzeroupper.
860 define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
861 ; SSE-LABEL: test_arg_v32i8:
863 ; SSE-NEXT: movntps %xmm1, 16(%rdi)
864 ; SSE-NEXT: movntps %xmm0, (%rdi)
867 ; AVX-LABEL: test_arg_v32i8:
869 ; AVX-NEXT: vmovntps %ymm0, (%rdi)
870 ; AVX-NEXT: vzeroupper
873 ; VLX-LABEL: test_arg_v32i8:
875 ; VLX-NEXT: vmovntps %ymm0, (%rdi)
876 ; VLX-NEXT: vzeroupper
878 store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1
883 ; Now check that if the execution domain is trivially visible, we use it.
884 ; We use an add to make the type survive all the way to the MOVNT.
; fadd of two <4 x float> values followed by a 16-byte-aligned non-temporal
; store of the result. The checks expect the single-precision pair
; addps + movntps (vaddps + vmovntps for the AVX and AVX512VL runs).
886 define void @test_op_v4f32(<4 x float> %a, <4 x float> %b, <4 x float>* %dst) {
887 ; SSE-LABEL: test_op_v4f32:
889 ; SSE-NEXT: addps %xmm1, %xmm0
890 ; SSE-NEXT: movntps %xmm0, (%rdi)
893 ; AVX-LABEL: test_op_v4f32:
895 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
896 ; AVX-NEXT: vmovntps %xmm0, (%rdi)
899 ; VLX-LABEL: test_op_v4f32:
901 ; VLX-NEXT: vaddps %xmm1, %xmm0, %xmm0
902 ; VLX-NEXT: vmovntps %xmm0, (%rdi)
904 %r = fadd <4 x float> %a, %b
905 store <4 x float> %r, <4 x float>* %dst, align 16, !nontemporal !1
; Integer add of two <4 x i32> values followed by a 16-byte-aligned
; non-temporal store. With the integer add feeding the store, the checks expect
; the integer-domain movntdq (paddd + movntdq, v-prefixed for AVX and AVX512VL).
909 define void @test_op_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %dst) {
910 ; SSE-LABEL: test_op_v4i32:
912 ; SSE-NEXT: paddd %xmm1, %xmm0
913 ; SSE-NEXT: movntdq %xmm0, (%rdi)
916 ; AVX-LABEL: test_op_v4i32:
918 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
919 ; AVX-NEXT: vmovntdq %xmm0, (%rdi)
922 ; VLX-LABEL: test_op_v4i32:
924 ; VLX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
925 ; VLX-NEXT: vmovntdq %xmm0, (%rdi)
927 %r = add <4 x i32> %a, %b
928 store <4 x i32> %r, <4 x i32>* %dst, align 16, !nontemporal !1
; fadd of two <2 x double> values followed by a 16-byte-aligned non-temporal
; store. The checks expect the double-precision pair addpd + movntpd
; (vaddpd + vmovntpd for the AVX and AVX512VL runs).
932 define void @test_op_v2f64(<2 x double> %a, <2 x double> %b, <2 x double>* %dst) {
933 ; SSE-LABEL: test_op_v2f64:
935 ; SSE-NEXT: addpd %xmm1, %xmm0
936 ; SSE-NEXT: movntpd %xmm0, (%rdi)
939 ; AVX-LABEL: test_op_v2f64:
941 ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
942 ; AVX-NEXT: vmovntpd %xmm0, (%rdi)
945 ; VLX-LABEL: test_op_v2f64:
947 ; VLX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
948 ; VLX-NEXT: vmovntpd %xmm0, (%rdi)
950 %r = fadd <2 x double> %a, %b
951 store <2 x double> %r, <2 x double>* %dst, align 16, !nontemporal !1
; Integer add of two <2 x i64> values followed by a 16-byte-aligned
; non-temporal store. The checks expect paddq + movntdq (v-prefixed for the AVX
; and AVX512VL runs).
955 define void @test_op_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64>* %dst) {
956 ; SSE-LABEL: test_op_v2i64:
958 ; SSE-NEXT: paddq %xmm1, %xmm0
959 ; SSE-NEXT: movntdq %xmm0, (%rdi)
962 ; AVX-LABEL: test_op_v2i64:
964 ; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
965 ; AVX-NEXT: vmovntdq %xmm0, (%rdi)
968 ; VLX-LABEL: test_op_v2i64:
970 ; VLX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
971 ; VLX-NEXT: vmovntdq %xmm0, (%rdi)
973 %r = add <2 x i64> %a, %b
974 store <2 x i64> %r, <2 x i64>* %dst, align 16, !nontemporal !1
; Integer add of two <8 x i16> values followed by a 16-byte-aligned
; non-temporal store. The checks expect paddw + movntdq (v-prefixed for the AVX
; and AVX512VL runs).
978 define void @test_op_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16>* %dst) {
979 ; SSE-LABEL: test_op_v8i16:
981 ; SSE-NEXT: paddw %xmm1, %xmm0
982 ; SSE-NEXT: movntdq %xmm0, (%rdi)
985 ; AVX-LABEL: test_op_v8i16:
987 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
988 ; AVX-NEXT: vmovntdq %xmm0, (%rdi)
991 ; VLX-LABEL: test_op_v8i16:
993 ; VLX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
994 ; VLX-NEXT: vmovntdq %xmm0, (%rdi)
996 %r = add <8 x i16> %a, %b
997 store <8 x i16> %r, <8 x i16>* %dst, align 16, !nontemporal !1
; Integer add of two <16 x i8> values followed by a 16-byte-aligned
; non-temporal store. The checks expect paddb + movntdq (v-prefixed for the AVX
; and AVX512VL runs).
1001 define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
1002 ; SSE-LABEL: test_op_v16i8:
1004 ; SSE-NEXT: paddb %xmm1, %xmm0
1005 ; SSE-NEXT: movntdq %xmm0, (%rdi)
1008 ; AVX-LABEL: test_op_v16i8:
1010 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1011 ; AVX-NEXT: vmovntdq %xmm0, (%rdi)
1014 ; VLX-LABEL: test_op_v16i8:
1016 ; VLX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1017 ; VLX-NEXT: vmovntdq %xmm0, (%rdi)
1019 %r = add <16 x i8> %a, %b
1020 store <16 x i8> %r, <16 x i8>* %dst, align 16, !nontemporal !1
1024 ; And now YMM versions.
; fadd of two <8 x float> values followed by a 32-byte-aligned non-temporal
; store. The SSE runs are checked for two addps and two movntps (offset 16,
; then offset 0); the AVX and AVX512VL runs for a 256-bit vaddps + vmovntps and
; a trailing vzeroupper.
1026 define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
1027 ; SSE-LABEL: test_op_v8f32:
1029 ; SSE-NEXT: addps %xmm2, %xmm0
1030 ; SSE-NEXT: addps %xmm3, %xmm1
1031 ; SSE-NEXT: movntps %xmm1, 16(%rdi)
1032 ; SSE-NEXT: movntps %xmm0, (%rdi)
1035 ; AVX-LABEL: test_op_v8f32:
1037 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
1038 ; AVX-NEXT: vmovntps %ymm0, (%rdi)
1039 ; AVX-NEXT: vzeroupper
1042 ; VLX-LABEL: test_op_v8f32:
1044 ; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
1045 ; VLX-NEXT: vmovntps %ymm0, (%rdi)
1046 ; VLX-NEXT: vzeroupper
1048 %r = fadd <8 x float> %a, %b
1049 store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1
; Integer add of two <8 x i32> values followed by a 32-byte-aligned
; non-temporal store. Checked lowering per run:
;   - SSE runs use two paddd and two movntdq stores;
;   - the AVX1 run adds the 128-bit halves separately, recombines them with
;     vinsertf128, and stores with the fp-domain vmovntps;
;   - the AVX2 and AVX512VL runs use a 256-bit vpaddd and vmovntdq.
1053 define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
1054 ; SSE-LABEL: test_op_v8i32:
1056 ; SSE-NEXT: paddd %xmm2, %xmm0
1057 ; SSE-NEXT: paddd %xmm3, %xmm1
1058 ; SSE-NEXT: movntdq %xmm1, 16(%rdi)
1059 ; SSE-NEXT: movntdq %xmm0, (%rdi)
1062 ; AVX1-LABEL: test_op_v8i32:
1064 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1065 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1066 ; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
1067 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1068 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1069 ; AVX1-NEXT: vmovntps %ymm0, (%rdi)
1070 ; AVX1-NEXT: vzeroupper
1073 ; AVX2-LABEL: test_op_v8i32:
1075 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
1076 ; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
1077 ; AVX2-NEXT: vzeroupper
1080 ; VLX-LABEL: test_op_v8i32:
1082 ; VLX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
1083 ; VLX-NEXT: vmovntdq %ymm0, (%rdi)
1084 ; VLX-NEXT: vzeroupper
1086 %r = add <8 x i32> %a, %b
1087 store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1
; fadd of two <4 x double> values followed by a 32-byte-aligned non-temporal
; store. The SSE runs are checked for two addpd and two movntpd stores; the AVX
; and AVX512VL runs for a 256-bit vaddpd + vmovntpd and a trailing vzeroupper.
1091 define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst) {
1092 ; SSE-LABEL: test_op_v4f64:
1094 ; SSE-NEXT: addpd %xmm2, %xmm0
1095 ; SSE-NEXT: addpd %xmm3, %xmm1
1096 ; SSE-NEXT: movntpd %xmm1, 16(%rdi)
1097 ; SSE-NEXT: movntpd %xmm0, (%rdi)
1100 ; AVX-LABEL: test_op_v4f64:
1102 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
1103 ; AVX-NEXT: vmovntpd %ymm0, (%rdi)
1104 ; AVX-NEXT: vzeroupper
1107 ; VLX-LABEL: test_op_v4f64:
1109 ; VLX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
1110 ; VLX-NEXT: vmovntpd %ymm0, (%rdi)
1111 ; VLX-NEXT: vzeroupper
1113 %r = fadd <4 x double> %a, %b
1114 store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1
; Integer add of two <4 x i64> values followed by a 32-byte-aligned
; non-temporal store. Checked lowering per run:
;   - SSE runs use two paddq and two movntdq stores;
;   - the AVX1 run adds the 128-bit halves separately, recombines them with
;     vinsertf128, and stores with the fp-domain vmovntps;
;   - the AVX2 and AVX512VL runs use a 256-bit vpaddq and vmovntdq.
1118 define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
1119 ; SSE-LABEL: test_op_v4i64:
1121 ; SSE-NEXT: paddq %xmm2, %xmm0
1122 ; SSE-NEXT: paddq %xmm3, %xmm1
1123 ; SSE-NEXT: movntdq %xmm1, 16(%rdi)
1124 ; SSE-NEXT: movntdq %xmm0, (%rdi)
1127 ; AVX1-LABEL: test_op_v4i64:
1129 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1130 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1131 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
1132 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
1133 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1134 ; AVX1-NEXT: vmovntps %ymm0, (%rdi)
1135 ; AVX1-NEXT: vzeroupper
1138 ; AVX2-LABEL: test_op_v4i64:
1140 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
1141 ; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
1142 ; AVX2-NEXT: vzeroupper
1145 ; VLX-LABEL: test_op_v4i64:
1147 ; VLX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
1148 ; VLX-NEXT: vmovntdq %ymm0, (%rdi)
1149 ; VLX-NEXT: vzeroupper
1151 %r = add <4 x i64> %a, %b
1152 store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1
; Integer add of two <16 x i16> values followed by a 32-byte-aligned
; non-temporal store. Checked lowering per run:
;   - SSE runs use two paddw and two movntdq stores;
;   - the AVX1 run adds the 128-bit halves separately, recombines them with
;     vinsertf128, and stores with the fp-domain vmovntps;
;   - the AVX2 and AVX512VL runs use a 256-bit vpaddw and vmovntdq.
1156 define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
1157 ; SSE-LABEL: test_op_v16i16:
1159 ; SSE-NEXT: paddw %xmm2, %xmm0
1160 ; SSE-NEXT: paddw %xmm3, %xmm1
1161 ; SSE-NEXT: movntdq %xmm1, 16(%rdi)
1162 ; SSE-NEXT: movntdq %xmm0, (%rdi)
1165 ; AVX1-LABEL: test_op_v16i16:
1167 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1168 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1169 ; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
1170 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
1171 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1172 ; AVX1-NEXT: vmovntps %ymm0, (%rdi)
1173 ; AVX1-NEXT: vzeroupper
1176 ; AVX2-LABEL: test_op_v16i16:
1178 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
1179 ; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
1180 ; AVX2-NEXT: vzeroupper
1183 ; VLX-LABEL: test_op_v16i16:
1185 ; VLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
1186 ; VLX-NEXT: vmovntdq %ymm0, (%rdi)
1187 ; VLX-NEXT: vzeroupper
1189 %r = add <16 x i16> %a, %b
1190 store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1
; Integer add of two <32 x i8> values followed by a 32-byte-aligned
; non-temporal store. Checked lowering per run:
;   - SSE runs use two paddb and two movntdq stores;
;   - the AVX1 run adds the 128-bit halves separately, recombines them with
;     vinsertf128, and stores with the fp-domain vmovntps;
;   - the AVX2 and AVX512VL runs use a 256-bit vpaddb and vmovntdq.
1194 define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
1195 ; SSE-LABEL: test_op_v32i8:
1197 ; SSE-NEXT: paddb %xmm2, %xmm0
1198 ; SSE-NEXT: paddb %xmm3, %xmm1
1199 ; SSE-NEXT: movntdq %xmm1, 16(%rdi)
1200 ; SSE-NEXT: movntdq %xmm0, (%rdi)
1203 ; AVX1-LABEL: test_op_v32i8:
1205 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1206 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1207 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
1208 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1209 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1210 ; AVX1-NEXT: vmovntps %ymm0, (%rdi)
1211 ; AVX1-NEXT: vzeroupper
1214 ; AVX2-LABEL: test_op_v32i8:
1216 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1217 ; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
1218 ; AVX2-NEXT: vzeroupper
1221 ; VLX-LABEL: test_op_v32i8:
1223 ; VLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1224 ; VLX-NEXT: vmovntdq %ymm0, (%rdi)
1225 ; VLX-NEXT: vzeroupper
1227 %r = add <32 x i8> %a, %b
1228 store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1
1232 ; 256-bit NT stores require 256-bit alignment.
1233 ; FIXME: For AVX, we could lower this to 2x movntps %xmm. Taken further, we
1234 ; could even scalarize to movnti when the store is only 1-byte aligned: a
1235 ; nontemporal store is probably worth even a ~20-instruction scalarization.
; Same fadd + non-temporal <8 x float> store as test_op_v8f32, but the store is
; only 16-byte aligned. The SSE runs still split into two 16-byte movntps
; stores, while the AVX and AVX512VL runs are checked for a regular vmovups
; instead of a non-temporal store.
1236 define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
1237 ; SSE-LABEL: test_unaligned_v8f32:
1239 ; SSE-NEXT: addps %xmm2, %xmm0
1240 ; SSE-NEXT: addps %xmm3, %xmm1
1241 ; SSE-NEXT: movntps %xmm1, 16(%rdi)
1242 ; SSE-NEXT: movntps %xmm0, (%rdi)
1245 ; AVX-LABEL: test_unaligned_v8f32:
1247 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
1248 ; AVX-NEXT: vmovups %ymm0, (%rdi)
1249 ; AVX-NEXT: vzeroupper
1252 ; VLX-LABEL: test_unaligned_v8f32:
1254 ; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
1255 ; VLX-NEXT: vmovups %ymm0, (%rdi)
1256 ; VLX-NEXT: vzeroupper
1258 %r = fadd <8 x float> %a, %b
1259 store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1