; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=SSE32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=SSE64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVXONLY32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVXONLY64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=KNL32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=KNL64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=SKX32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=SKX64

define i32 @test_store_32(i32* nocapture %addr, i32 %value) {
; ALL32-LABEL: test_store_32:
; ALL32: # %bb.0: # %entry
; ALL32-NEXT: movl %esi, %eax
; ALL32-NEXT: movl %esi, (%rdi)
; ALL32-NEXT: retq
;
; ALL64-LABEL: test_store_32:
; ALL64: # %bb.0: # %entry
; ALL64-NEXT: movl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT: movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT: movl %eax, (%ecx)
; ALL64-NEXT: retl
entry:
  store i32 %value, i32* %addr, align 1
  ret i32 %value
}

define i16 @test_store_16(i16* nocapture %addr, i16 %value) {
; ALL32-LABEL: test_store_16:
; ALL32: # %bb.0: # %entry
; ALL32-NEXT: movl %esi, %eax
; ALL32-NEXT: movw %ax, (%rdi)
; ALL32-NEXT: # kill: def $ax killed $ax killed $eax
; ALL32-NEXT: retq
;
; ALL64-LABEL: test_store_16:
; ALL64: # %bb.0: # %entry
; ALL64-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT: movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT: movw %ax, (%ecx)
; ALL64-NEXT: retl
entry:
  store i16 %value, i16* %addr, align 1
  ret i16 %value
}

define <4 x i32> @test_store_4xi32(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32:
; SSE32: # %bb.0:
; SSE32-NEXT: paddd %xmm1, %xmm0
; SSE32-NEXT: movdqu %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xi32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: paddd %xmm1, %xmm0
; SSE64-NEXT: movdqu %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xi32:
; AVX32: # %bb.0:
; AVX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovdqu %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xi32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovdqu %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 1
  ret <4 x i32> %foo
}

define <4 x i32> @test_store_4xi32_aligned(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: paddd %xmm1, %xmm0
; SSE32-NEXT: movdqa %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xi32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: paddd %xmm1, %xmm0
; SSE64-NEXT: movdqa %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xi32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovdqa %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xi32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovdqa %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 16
  ret <4 x i32> %foo
}

define <4 x float> @test_store_4xf32(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf32:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovups %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovups %xmm0, (%eax)
; AVX64-NEXT: retl
  store <4 x float> %value, <4 x float>* %addr, align 1
  ret <4 x float> %value
}

define <4 x float> @test_store_4xf32_aligned(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovaps %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovaps %xmm0, (%eax)
; AVX64-NEXT: retl
  store <4 x float> %value, <4 x float>* %addr, align 16
  ret <4 x float> %value
}

define <2 x double> @test_store_2xf64(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm1, %xmm0
; SSE32-NEXT: movupd %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_2xf64:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd %xmm1, %xmm0
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_2xf64:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovupd %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_2xf64:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovupd %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 1
  ret <2 x double> %foo
}

define <2 x double> @test_store_2xf64_aligned(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm1, %xmm0
; SSE32-NEXT: movapd %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_2xf64_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd %xmm1, %xmm0
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_2xf64_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovapd %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_2xf64_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovapd %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 16
  ret <2 x double> %foo
}

define <8 x i32> @test_store_8xi32(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xi32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xi32:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovups %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xi32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovups %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x i32> %value, <8 x i32>* %addr, align 1
  ret <8 x i32> %value
}

define <8 x i32> @test_store_8xi32_aligned(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xi32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xi32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovaps %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xi32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovaps %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x i32> %value, <8 x i32>* %addr, align 32
  ret <8 x i32> %value
}

define <8 x float> @test_store_8xf32(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xf32:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovups %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xf32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovups %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x float> %value, <8 x float>* %addr, align 1
  ret <8 x float> %value
}

define <8 x float> @test_store_8xf32_aligned(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xf32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovaps %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xf32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovaps %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x float> %value, <8 x float>* %addr, align 32
  ret <8 x float> %value
}

define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm3, %xmm1
; SSE32-NEXT: addpd %xmm2, %xmm0
; SSE32-NEXT: movupd %xmm0, (%rdi)
; SSE32-NEXT: movupd %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf64:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT: addpd %xmm2, %xmm0
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: movupd %xmm1, 16(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT: vmovupd %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf64:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT: vmovupd %ymm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 1
  ret <4 x double> %foo
}

define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm3, %xmm1
; SSE32-NEXT: addpd %xmm2, %xmm0
; SSE32-NEXT: movapd %xmm0, (%rdi)
; SSE32-NEXT: movapd %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf64_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT: addpd %xmm2, %xmm0
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: movapd %xmm1, 16(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT: vmovapd %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf64_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT: vmovapd %ymm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 32
  ret <4 x double> %foo
}

define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: movups %xmm2, 32(%rdi)
; SSE32-NEXT: movups %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xi32:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: movups %xmm2, 32(%eax)
; SSE64-NEXT: movups %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xi32:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovups %ymm0, (%eax)
; AVXONLY64-NEXT: vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xi32:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovups %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xi32:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovups %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x i32> %value, <16 x i32>* %addr, align 1
  ret <16 x i32> %value
}

define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: movaps %xmm2, 32(%rdi)
; SSE32-NEXT: movaps %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xi32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: movaps %xmm2, 32(%eax)
; SSE64-NEXT: movaps %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32_aligned:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xi32_aligned:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT: vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xi32_aligned:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovaps %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xi32_aligned:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovaps %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x i32> %value, <16 x i32>* %addr, align 64
  ret <16 x i32> %value
}

define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: movups %xmm2, 32(%rdi)
; SSE32-NEXT: movups %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xf32:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: movups %xmm2, 32(%eax)
; SSE64-NEXT: movups %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xf32:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovups %ymm0, (%eax)
; AVXONLY64-NEXT: vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xf32:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovups %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xf32:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovups %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x float> %value, <16 x float>* %addr, align 1
  ret <16 x float> %value
}

define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: movaps %xmm2, 32(%rdi)
; SSE32-NEXT: movaps %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xf32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: movaps %xmm2, 32(%eax)
; SSE64-NEXT: movaps %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32_aligned:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xf32_aligned:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT: vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xf32_aligned:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovaps %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xf32_aligned:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovaps %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x float> %value, <16 x float>* %addr, align 64
  ret <16 x float> %value
}

define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm7, %xmm3
; SSE32-NEXT: addpd %xmm6, %xmm2
; SSE32-NEXT: addpd %xmm5, %xmm1
; SSE32-NEXT: addpd %xmm4, %xmm0
; SSE32-NEXT: movupd %xmm0, (%rdi)
; SSE32-NEXT: movupd %xmm1, 16(%rdi)
; SSE32-NEXT: movupd %xmm2, 32(%rdi)
; SSE32-NEXT: movupd %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf64:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm2
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: movupd %xmm1, 16(%eax)
; SSE64-NEXT: movupd %xmm2, 32(%eax)
; SSE64-NEXT: movupd %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT: vmovupd %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovupd %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_8xf64:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: pushl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_offset 8
; AVXONLY64-NEXT: .cfi_offset %ebp, -8
; AVXONLY64-NEXT: movl %esp, %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT: andl $-32, %esp
; AVXONLY64-NEXT: subl $32, %esp
; AVXONLY64-NEXT: movl 8(%ebp), %eax
; AVXONLY64-NEXT: vaddpd 40(%ebp), %ymm1, %ymm1
; AVXONLY64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT: vmovupd %ymm0, (%eax)
; AVXONLY64-NEXT: vmovupd %ymm1, 32(%eax)
; AVXONLY64-NEXT: movl %ebp, %esp
; AVXONLY64-NEXT: popl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT: vmovupd %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_8xf64:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT: vmovupd %zmm0, (%eax)
; AVX51264-NEXT: retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 1
  ret <8 x double> %foo
}

define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm7, %xmm3
; SSE32-NEXT: addpd %xmm6, %xmm2
; SSE32-NEXT: addpd %xmm5, %xmm1
; SSE32-NEXT: addpd %xmm4, %xmm0
; SSE32-NEXT: movapd %xmm0, (%rdi)
; SSE32-NEXT: movapd %xmm1, 16(%rdi)
; SSE32-NEXT: movapd %xmm2, 32(%rdi)
; SSE32-NEXT: movapd %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf64_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm2
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: movapd %xmm1, 16(%eax)
; SSE64-NEXT: movapd %xmm2, 32(%eax)
; SSE64-NEXT: movapd %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64_aligned:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT: vmovapd %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovapd %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_8xf64_aligned:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: pushl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_offset 8
; AVXONLY64-NEXT: .cfi_offset %ebp, -8
; AVXONLY64-NEXT: movl %esp, %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT: andl $-32, %esp
; AVXONLY64-NEXT: subl $32, %esp
; AVXONLY64-NEXT: movl 8(%ebp), %eax
; AVXONLY64-NEXT: vaddpd 40(%ebp), %ymm1, %ymm1
; AVXONLY64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT: vmovapd %ymm0, (%eax)
; AVXONLY64-NEXT: vmovapd %ymm1, 32(%eax)
; AVXONLY64-NEXT: movl %ebp, %esp
; AVXONLY64-NEXT: popl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64_aligned:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT: vmovapd %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_8xf64_aligned:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT: vmovapd %zmm0, (%eax)
; AVX51264-NEXT: retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 64
  ret <8 x double> %foo
}