; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=X86,X86-SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
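
; Two adjacent 32-byte aligned nontemporal v4f32 loads/stores: the accesses
; stay nontemporal, and with AVX2 the pair is merged into a single 32-byte
; ymm nontemporal load/store (vmovntdqa/vmovntdq ymm), as the checks below show.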
define void @merge_2_v4f32_align32(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: movaps 16(%ecx), %xmm1
; X86-NEXT: movntps %xmm0, (%eax)
; X86-NEXT: movntps %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align32:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movaps (%rdi), %xmm0
; X64-SSE2-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT: movntps %xmm0, (%rsi)
; X64-SSE2-NEXT: movntps %xmm1, 16(%rsi)
; X64-SSE2-NEXT: retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align32:
; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movaps (%rdi), %xmm0
; X64-SSE4A-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT: movntps %xmm0, (%rsi)
; X64-SSE4A-NEXT: movntps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align32:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; X64-SSE41-NEXT: movntdq %xmm0, (%rsi)
; X64-SSE41-NEXT: movntdq %xmm1, 16(%rsi)
; X64-SSE41-NEXT: retq
;
; X64-AVX1-LABEL: merge_2_v4f32_align32:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; X64-AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; X64-AVX1-NEXT: vmovntdq %xmm1, 16(%rsi)
; X64-AVX1-NEXT: vmovntdq %xmm0, (%rsi)
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: merge_2_v4f32_align32:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; X64-AVX2-NEXT: vmovntdq %ymm0, (%rsi)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 32, !nontemporal !0
  %4 = load <4 x float>, <4 x float>* %2, align 16, !nontemporal !0
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 32, !nontemporal !0
  store <4 x float> %4, <4 x float>* %6, align 16, !nontemporal !0
  ret void
}

; Don't merge nt and non-nt loads even if aligned.
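; (The !nontemporal load is lowered to a streaming load where one is available
; (movntdqa/vmovntdqa), while the plain load stays a regular movaps/vmovaps,
; so the two accesses cannot be combined.)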
define void @merge_2_v4f32_align32_mix_ntload(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32_mix_ntload:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: movaps 16(%ecx), %xmm1
; X86-NEXT: movaps %xmm0, (%eax)
; X86-NEXT: movaps %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movaps (%rdi), %xmm0
; X64-SSE2-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT: movaps %xmm0, (%rsi)
; X64-SSE2-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE2-NEXT: retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movaps (%rdi), %xmm0
; X64-SSE4A-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT: movaps %xmm0, (%rsi)
; X64-SSE4A-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE41-NEXT: movdqa %xmm0, (%rsi)
; X64-SSE41-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 32, !nontemporal !0
  %4 = load <4 x float>, <4 x float>* %2, align 16
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 32
  store <4 x float> %4, <4 x float>* %6, align 16
  ret void
}

; Don't merge nt and non-nt stores even if aligned.
define void @merge_2_v4f32_align32_mix_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: movaps 16(%ecx), %xmm1
; X86-NEXT: movntps %xmm0, (%eax)
; X86-NEXT: movaps %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movaps (%rdi), %xmm0
; X64-SSE-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE-NEXT: movntps %xmm0, (%rsi)
; X64-SSE-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT: vmovntps %xmm0, (%rsi)
; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 32
  %4 = load <4 x float>, <4 x float>* %2, align 16
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 32, !nontemporal !0
  store <4 x float> %4, <4 x float>* %6, align 16
  ret void
}

; AVX2 can't perform NT-load-ymm on 16-byte aligned memory.
; Must be kept separate as VMOVNTDQA xmm.
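; (VMOVNTDQA requires its memory operand to be aligned to the full vector
; width, so a 32-byte ymm streaming load would need 32-byte alignment.)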
define void @merge_2_v4f32_align16_ntload(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align16_ntload:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: movaps 16(%ecx), %xmm1
; X86-NEXT: movaps %xmm0, (%eax)
; X86-NEXT: movaps %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movaps (%rdi), %xmm0
; X64-SSE2-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT: movaps %xmm0, (%rsi)
; X64-SSE2-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE2-NEXT: retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movaps (%rdi), %xmm0
; X64-SSE4A-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT: movaps %xmm0, (%rsi)
; X64-SSE4A-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; X64-SSE41-NEXT: movdqa %xmm0, (%rsi)
; X64-SSE41-NEXT: movdqa %xmm1, 16(%rsi)
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align16_ntload:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0
; X64-AVX-NEXT: vmovntdqa 16(%rdi), %xmm1
; X64-AVX-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX-NEXT: vmovdqa %xmm1, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 16, !nontemporal !0
  %4 = load <4 x float>, <4 x float>* %2, align 16, !nontemporal !0
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 16
  store <4 x float> %4, <4 x float>* %6, align 16
  ret void
}

; AVX can't perform NT-store-ymm on 16-byte aligned memory.
; Must be kept separate as VMOVNTPS xmm.
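; (Like the streaming loads above, VMOVNTPS ymm needs a 32-byte aligned
; destination, so only the xmm form can be used here.)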
define void @merge_2_v4f32_align16_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align16_ntstore:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: movaps 16(%ecx), %xmm1
; X86-NEXT: movntps %xmm0, (%eax)
; X86-NEXT: movntps %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE-LABEL: merge_2_v4f32_align16_ntstore:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movaps (%rdi), %xmm0
; X64-SSE-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE-NEXT: movntps %xmm0, (%rsi)
; X64-SSE-NEXT: movntps %xmm1, 16(%rsi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align16_ntstore:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT: vmovntps %xmm0, (%rsi)
; X64-AVX-NEXT: vmovntps %xmm1, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 16
  %4 = load <4 x float>, <4 x float>* %2, align 16
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 16, !nontemporal !0
  store <4 x float> %4, <4 x float>* %6, align 16, !nontemporal !0
  ret void
}

; Nothing can perform NT-load-vector on 1-byte aligned memory.
; Just perform regular loads.
define void @merge_2_v4f32_align1_ntload(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align1_ntload:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movups (%ecx), %xmm0
; X86-NEXT: movups 16(%ecx), %xmm1
; X86-NEXT: movups %xmm0, (%eax)
; X86-NEXT: movups %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE-LABEL: merge_2_v4f32_align1_ntload:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movups (%rdi), %xmm0
; X64-SSE-NEXT: movups 16(%rdi), %xmm1
; X64-SSE-NEXT: movups %xmm0, (%rsi)
; X64-SSE-NEXT: movups %xmm1, 16(%rsi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1_ntload:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-AVX-NEXT: vmovups %ymm0, (%rsi)
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 1, !nontemporal !0
  %4 = load <4 x float>, <4 x float>* %2, align 1, !nontemporal !0
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 1
  store <4 x float> %4, <4 x float>* %6, align 1
  ret void
}

; Nothing can perform NT-store-vector on 1-byte aligned memory.
; Must be scalarized to use MOVNTI/MOVNTSD.
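; (MOVNTI streams a 32/64-bit GPR to memory and MOVNTSD (SSE4A) streams the
; low double of an xmm register, which is why the vectors below are split up
; with movd/movq/pextrq/pshufd or movhlps before each nontemporal store.)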
define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, (%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 4(%eax)
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 16(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 20(%eax)
; X86-SSE2-NEXT: retl
;
; X86-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
; X86-SSE4A: # %bb.0:
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4A-NEXT: movups (%ecx), %xmm0
; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1
; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X86-SSE4A-NEXT: movntsd %xmm0, 8(%eax)
; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax)
; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; X86-SSE4A-NEXT: movntsd %xmm1, 24(%eax)
; X86-SSE4A-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, (%rsi)
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 8(%rsi)
; X64-SSE2-NEXT: movq %xmm1, %rax
; X64-SSE2-NEXT: movntiq %rax, 16(%rsi)
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 24(%rsi)
; X64-SSE2-NEXT: retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movups (%rdi), %xmm0
; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1
; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X64-SSE4A-NEXT: movntsd %xmm0, 8(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi)
; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; X64-SSE4A-NEXT: movntsd %xmm1, 24(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE41-NEXT: pextrq $1, %xmm0, %rax
; X64-SSE41-NEXT: movntiq %rax, 8(%rsi)
; X64-SSE41-NEXT: movq %xmm0, %rax
; X64-SSE41-NEXT: movntiq %rax, (%rsi)
; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax
; X64-SSE41-NEXT: movntiq %rax, 24(%rsi)
; X64-SSE41-NEXT: movq %xmm1, %rax
; X64-SSE41-NEXT: movntiq %rax, 16(%rsi)
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1_ntstore:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1
; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax
; X64-AVX-NEXT: movntiq %rax, 8(%rsi)
; X64-AVX-NEXT: vmovq %xmm0, %rax
; X64-AVX-NEXT: movntiq %rax, (%rsi)
; X64-AVX-NEXT: vpextrq $1, %xmm1, %rax
; X64-AVX-NEXT: movntiq %rax, 24(%rsi)
; X64-AVX-NEXT: vmovq %xmm1, %rax
; X64-AVX-NEXT: movntiq %rax, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 1
  %4 = load <4 x float>, <4 x float>* %2, align 1
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 1, !nontemporal !0
  store <4 x float> %4, <4 x float>* %6, align 1, !nontemporal !0
  ret void
}

; Nothing can perform NT-load-vector on 1-byte aligned memory.
; Just perform regular loads and scalarize NT-stores.
define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-SSE2-LABEL: merge_2_v4f32_align1:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, (%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 4(%eax)
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 16(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 20(%eax)
; X86-SSE2-NEXT: retl
;
; X86-SSE4A-LABEL: merge_2_v4f32_align1:
; X86-SSE4A: # %bb.0:
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4A-NEXT: movups (%ecx), %xmm0
; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1
; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X86-SSE4A-NEXT: movntsd %xmm0, 8(%eax)
; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax)
; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; X86-SSE4A-NEXT: movntsd %xmm1, 24(%eax)
; X86-SSE4A-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align1:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, (%rsi)
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 8(%rsi)
; X64-SSE2-NEXT: movq %xmm1, %rax
; X64-SSE2-NEXT: movntiq %rax, 16(%rsi)
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 24(%rsi)
; X64-SSE2-NEXT: retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align1:
; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movups (%rdi), %xmm0
; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1
; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X64-SSE4A-NEXT: movntsd %xmm0, 8(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi)
; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; X64-SSE4A-NEXT: movntsd %xmm1, 24(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align1:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE41-NEXT: pextrq $1, %xmm0, %rax
; X64-SSE41-NEXT: movntiq %rax, 8(%rsi)
; X64-SSE41-NEXT: movq %xmm0, %rax
; X64-SSE41-NEXT: movntiq %rax, (%rsi)
; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax
; X64-SSE41-NEXT: movntiq %rax, 24(%rsi)
; X64-SSE41-NEXT: movq %xmm1, %rax
; X64-SSE41-NEXT: movntiq %rax, 16(%rsi)
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1
; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax
; X64-AVX-NEXT: movntiq %rax, 8(%rsi)
; X64-AVX-NEXT: vmovq %xmm0, %rax
; X64-AVX-NEXT: movntiq %rax, (%rsi)
; X64-AVX-NEXT: vpextrq $1, %xmm1, %rax
; X64-AVX-NEXT: movntiq %rax, 24(%rsi)
; X64-AVX-NEXT: vmovq %xmm1, %rax
; X64-AVX-NEXT: movntiq %rax, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
  %2 = bitcast float* %1 to <4 x float>*
  %3 = load <4 x float>, <4 x float>* %a0, align 1, !nontemporal !0
  %4 = load <4 x float>, <4 x float>* %2, align 1, !nontemporal !0
  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
  %6 = bitcast float* %5 to <4 x float>*
  store <4 x float> %3, <4 x float>* %a1, align 1, !nontemporal !0
  store <4 x float> %4, <4 x float>* %6, align 1, !nontemporal !0