; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=SSE32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=SSE64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVXONLY32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVXONLY64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=KNL32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=KNL64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=SKX32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=SKX64

define i32 @test_store_32(i32* nocapture %addr, i32 %value) {
; ALL32-LABEL: test_store_32:
; ALL32: # %bb.0: # %entry
; ALL32-NEXT: movl %esi, %eax
; ALL32-NEXT: movl %esi, (%rdi)
; ALL32-NEXT: retq
;
; ALL64-LABEL: test_store_32:
; ALL64: # %bb.0: # %entry
; ALL64-NEXT: movl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT: movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT: movl %eax, (%ecx)
; ALL64-NEXT: retl
entry:
  store i32 %value, i32* %addr, align 1
  ret i32 %value
}

define i16 @test_store_16(i16* nocapture %addr, i16 %value) {
; ALL32-LABEL: test_store_16:
; ALL32: # %bb.0: # %entry
; ALL32-NEXT: movl %esi, %eax
; ALL32-NEXT: movw %ax, (%rdi)
; ALL32-NEXT: # kill: def $ax killed $ax killed $eax
; ALL32-NEXT: retq
;
; ALL64-LABEL: test_store_16:
; ALL64: # %bb.0: # %entry
; ALL64-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT: movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT: movw %ax, (%ecx)
; ALL64-NEXT: retl
entry:
  store i16 %value, i16* %addr, align 1
  ret i16 %value
}

define <4 x i32> @test_store_4xi32(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32:
; SSE32: # %bb.0:
; SSE32-NEXT: paddd %xmm1, %xmm0
; SSE32-NEXT: movdqu %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xi32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: paddd %xmm1, %xmm0
; SSE64-NEXT: movdqu %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xi32:
; AVX32: # %bb.0:
; AVX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovdqu %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xi32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovdqu %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 1
  ret <4 x i32> %foo
}

define <4 x i32> @test_store_4xi32_aligned(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: paddd %xmm1, %xmm0
; SSE32-NEXT: movdqa %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xi32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: paddd %xmm1, %xmm0
; SSE64-NEXT: movdqa %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xi32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovdqa %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xi32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovdqa %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 16
  ret <4 x i32> %foo
}

define <4 x float> @test_store_4xf32(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf32:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovups %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovups %xmm0, (%eax)
; AVX64-NEXT: retl
  store <4 x float> %value, <4 x float>* %addr, align 1
  ret <4 x float> %value
}

define <4 x float> @test_store_4xf32_aligned(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovaps %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovaps %xmm0, (%eax)
; AVX64-NEXT: retl
  store <4 x float> %value, <4 x float>* %addr, align 16
  ret <4 x float> %value
}

define <2 x double> @test_store_2xf64(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm1, %xmm0
; SSE32-NEXT: movupd %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_2xf64:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd %xmm1, %xmm0
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_2xf64:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovupd %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_2xf64:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovupd %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 1
  ret <2 x double> %foo
}

define <2 x double> @test_store_2xf64_aligned(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm1, %xmm0
; SSE32-NEXT: movapd %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_2xf64_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd %xmm1, %xmm0
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_2xf64_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovapd %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_2xf64_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovapd %xmm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 16
  ret <2 x double> %foo
}

define <8 x i32> @test_store_8xi32(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xi32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xi32:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovups %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xi32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovups %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x i32> %value, <8 x i32>* %addr, align 1
  ret <8 x i32> %value
}

define <8 x i32> @test_store_8xi32_aligned(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xi32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xi32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovaps %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xi32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovaps %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x i32> %value, <8 x i32>* %addr, align 32
  ret <8 x i32> %value
}

define <8 x float> @test_store_8xf32(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf32:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xf32:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovups %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xf32:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovups %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x float> %value, <8 x float>* %addr, align 1
  ret <8 x float> %value
}

define <8 x float> @test_store_8xf32_aligned(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xf32_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vmovaps %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xf32_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovaps %ymm0, (%eax)
; AVX64-NEXT: retl
  store <8 x float> %value, <8 x float>* %addr, align 32
  ret <8 x float> %value
}

define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm2, %xmm0
; SSE32-NEXT: movupd %xmm0, (%rdi)
; SSE32-NEXT: addpd %xmm3, %xmm1
; SSE32-NEXT: movupd %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf64:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd %xmm2, %xmm0
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: addpd %xmm3, %xmm1
; SSE64-NEXT: movupd %xmm1, 16(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT: vmovupd %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf64:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT: vmovupd %ymm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 1
  ret <4 x double> %foo
}

define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm2, %xmm0
; SSE32-NEXT: movapd %xmm0, (%rdi)
; SSE32-NEXT: addpd %xmm3, %xmm1
; SSE32-NEXT: movapd %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf64_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd %xmm2, %xmm0
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: addpd %xmm3, %xmm1
; SSE64-NEXT: movapd %xmm1, 16(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64_aligned:
; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT: vmovapd %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf64_aligned:
; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT: vmovapd %ymm0, (%eax)
; AVX64-NEXT: retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 32
  ret <4 x double> %foo
}

define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: movups %xmm2, 32(%rdi)
; SSE32-NEXT: movups %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xi32:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: movups %xmm2, 32(%eax)
; SSE64-NEXT: movups %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xi32:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovups %ymm0, (%eax)
; AVXONLY64-NEXT: vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xi32:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovups %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xi32:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovups %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x i32> %value, <16 x i32>* %addr, align 1
  ret <16 x i32> %value
}

define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: movaps %xmm2, 32(%rdi)
; SSE32-NEXT: movaps %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xi32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: movaps %xmm2, 32(%eax)
; SSE64-NEXT: movaps %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32_aligned:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xi32_aligned:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT: vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xi32_aligned:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovaps %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xi32_aligned:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovaps %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x i32> %value, <16 x i32>* %addr, align 64
  ret <16 x i32> %value
}

define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32:
; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: movups %xmm2, 32(%rdi)
; SSE32-NEXT: movups %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xf32:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: movups %xmm2, 32(%eax)
; SSE64-NEXT: movups %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xf32:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovups %ymm0, (%eax)
; AVXONLY64-NEXT: vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xf32:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovups %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xf32:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovups %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x float> %value, <16 x float>* %addr, align 1
  ret <16 x float> %value
}

define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: movaps %xmm2, 32(%rdi)
; SSE32-NEXT: movaps %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xf32_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: movaps %xmm2, 32(%eax)
; SSE64-NEXT: movaps %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32_aligned:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xf32_aligned:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT: vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xf32_aligned:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovaps %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xf32_aligned:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovaps %zmm0, (%eax)
; AVX51264-NEXT: retl
  store <16 x float> %value, <16 x float>* %addr, align 64
  ret <16 x float> %value
}

define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm4, %xmm0
; SSE32-NEXT: movupd %xmm0, (%rdi)
; SSE32-NEXT: addpd %xmm5, %xmm1
; SSE32-NEXT: movupd %xmm1, 16(%rdi)
; SSE32-NEXT: addpd %xmm6, %xmm2
; SSE32-NEXT: movupd %xmm2, 32(%rdi)
; SSE32-NEXT: addpd %xmm7, %xmm3
; SSE32-NEXT: movupd %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf64:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm4
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm5
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm6
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: addpd %xmm4, %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: addpd %xmm6, %xmm1
; SSE64-NEXT: movupd %xmm1, 16(%eax)
; SSE64-NEXT: addpd %xmm5, %xmm2
; SSE64-NEXT: movupd %xmm2, 32(%eax)
; SSE64-NEXT: movupd %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT: vmovupd %ymm0, (%rdi)
; AVXONLY32-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT: vmovupd %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_8xf64:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: pushl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_offset 8
; AVXONLY64-NEXT: .cfi_offset %ebp, -8
; AVXONLY64-NEXT: movl %esp, %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT: andl $-32, %esp
; AVXONLY64-NEXT: subl $32, %esp
; AVXONLY64-NEXT: vmovapd 40(%ebp), %ymm3
; AVXONLY64-NEXT: movl 8(%ebp), %eax
; AVXONLY64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT: vmovupd %ymm0, (%eax)
; AVXONLY64-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY64-NEXT: vmovupd %ymm1, 32(%eax)
; AVXONLY64-NEXT: movl %ebp, %esp
; AVXONLY64-NEXT: popl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT: vmovupd %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_8xf64:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT: vmovupd %zmm0, (%eax)
; AVX51264-NEXT: retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 1
  ret <8 x double> %foo
}

define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64_aligned:
; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm4, %xmm0
; SSE32-NEXT: movapd %xmm0, (%rdi)
; SSE32-NEXT: addpd %xmm5, %xmm1
; SSE32-NEXT: movapd %xmm1, 16(%rdi)
; SSE32-NEXT: addpd %xmm6, %xmm2
; SSE32-NEXT: movapd %xmm2, 32(%rdi)
; SSE32-NEXT: addpd %xmm7, %xmm3
; SSE32-NEXT: movapd %xmm3, 48(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf64_aligned:
; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm4
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm5
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm6
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: addpd %xmm4, %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: addpd %xmm6, %xmm1
; SSE64-NEXT: movapd %xmm1, 16(%eax)
; SSE64-NEXT: addpd %xmm5, %xmm2
; SSE64-NEXT: movapd %xmm2, 32(%eax)
; SSE64-NEXT: movapd %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64_aligned:
; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT: vmovapd %ymm0, (%rdi)
; AVXONLY32-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT: vmovapd %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_8xf64_aligned:
; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: pushl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_offset 8
; AVXONLY64-NEXT: .cfi_offset %ebp, -8
; AVXONLY64-NEXT: movl %esp, %ebp
; AVXONLY64-NEXT: .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT: andl $-32, %esp
; AVXONLY64-NEXT: subl $32, %esp
; AVXONLY64-NEXT: vmovapd 40(%ebp), %ymm3
; AVXONLY64-NEXT: movl 8(%ebp), %eax
; AVXONLY64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT: vmovapd %ymm0, (%eax)
; AVXONLY64-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY64-NEXT: vmovapd %ymm1, 32(%eax)
; AVXONLY64-NEXT: movl %ebp, %esp
; AVXONLY64-NEXT: popl %ebp
; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64_aligned:
; AVX51232: # %bb.0:
; AVX51232-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT: vmovapd %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_8xf64_aligned:
; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT: vmovapd %zmm0, (%eax)
; AVX51264-NEXT: retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 64
  ret <8 x double> %foo
}