; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=X86,X86-SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64-SSE,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=X64-SSE,X64-SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X64-SSE,X64-SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2
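
; These tests check how consecutive nontemporal (NT) vector loads and stores
; are merged. The NT hint is attached via metadata, e.g. (with generic
; placeholder names %v/%p):
;   store <4 x float> %v, ptr %p, align 16, !nontemporal !0
;   !0 = !{i32 1}
; NT instructions bypass the cache, and the aligned vector forms fault on
; unaligned addresses, so merging is only legal when the wider access still
; satisfies the merged instruction's alignment requirement.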

define void @merge_2_v4f32_align32(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: movaps 16(%ecx), %xmm1
; X86-NEXT: movntps %xmm0, (%eax)
; X86-NEXT: movntps %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align32:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movaps (%rdi), %xmm0
; X64-SSE2-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT: movntps %xmm0, (%rsi)
; X64-SSE2-NEXT: movntps %xmm1, 16(%rsi)
; X64-SSE2-NEXT: retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align32:
; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movaps (%rdi), %xmm0
; X64-SSE4A-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT: movntps %xmm0, (%rsi)
; X64-SSE4A-NEXT: movntps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align32:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; X64-SSE41-NEXT: movntdq %xmm0, (%rsi)
; X64-SSE41-NEXT: movntdq %xmm1, 16(%rsi)
; X64-SSE41-NEXT: retq
;
; X64-AVX1-LABEL: merge_2_v4f32_align32:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; X64-AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; X64-AVX1-NEXT: vmovntdq %xmm0, (%rsi)
; X64-AVX1-NEXT: vmovntdq %xmm1, 16(%rsi)
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: merge_2_v4f32_align32:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; X64-AVX2-NEXT: vmovntdq %ymm0, (%rsi)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 32, !nontemporal !0
  %3 = load <4 x float>, ptr %1, align 16, !nontemporal !0
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 32, !nontemporal !0
  store <4 x float> %3, ptr %4, align 16, !nontemporal !0
  ret void
}
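
; Note: the combined 32 bytes start at a 32-byte boundary, so AVX2 merges the
; two NT accesses into a single VMOVNTDQA/VMOVNTDQ ymm pair; SSE4.1/AVX1 keep
; two 16-byte NT ops, and SSE2/SSE4A (no NT vector load) use plain MOVAPS
; loads while keeping the stores nontemporal.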

; Don't merge nt and non-nt loads even if aligned.
define void @merge_2_v4f32_align32_mix_ntload(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32_mix_ntload:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: movaps 16(%ecx), %xmm1
; X86-NEXT: movaps %xmm0, (%eax)
; X86-NEXT: movaps %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movaps (%rdi), %xmm0
; X64-SSE2-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT: movaps %xmm0, (%rsi)
; X64-SSE2-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE2-NEXT: retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movaps (%rdi), %xmm0
; X64-SSE4A-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT: movaps %xmm0, (%rsi)
; X64-SSE4A-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE41-NEXT: movdqa %xmm0, (%rsi)
; X64-SSE41-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 32, !nontemporal !0
  %3 = load <4 x float>, ptr %1, align 16
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 32
  store <4 x float> %3, ptr %4, align 16
  ret void
}

; Don't merge nt and non-nt stores even if aligned.
define void @merge_2_v4f32_align32_mix_ntstore(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: movaps 16(%ecx), %xmm1
; X86-NEXT: movntps %xmm0, (%eax)
; X86-NEXT: movaps %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movaps (%rdi), %xmm0
; X64-SSE-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE-NEXT: movntps %xmm0, (%rsi)
; X64-SSE-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT: vmovntps %xmm0, (%rsi)
; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 32
  %3 = load <4 x float>, ptr %1, align 16
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 32, !nontemporal !0
  store <4 x float> %3, ptr %4, align 16
  ret void
}
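
; Note: in the two mix tests above the NT half and the temporal half must not
; be combined; merging would either lose the NT hint or impose it on the
; temporal access, so each 16-byte access is emitted separately.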

; AVX2 can't perform NT-load-ymm on 16-byte aligned memory.
; Must be kept separate as VMOVNTDQA xmm.
define void @merge_2_v4f32_align16_ntload(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align16_ntload:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: movaps 16(%ecx), %xmm1
; X86-NEXT: movaps %xmm0, (%eax)
; X86-NEXT: movaps %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movaps (%rdi), %xmm0
; X64-SSE2-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT: movaps %xmm0, (%rsi)
; X64-SSE2-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE2-NEXT: retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movaps (%rdi), %xmm0
; X64-SSE4A-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT: movaps %xmm0, (%rsi)
; X64-SSE4A-NEXT: movaps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; X64-SSE41-NEXT: movdqa %xmm0, (%rsi)
; X64-SSE41-NEXT: movdqa %xmm1, 16(%rsi)
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align16_ntload:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0
; X64-AVX-NEXT: vmovntdqa 16(%rdi), %xmm1
; X64-AVX-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX-NEXT: vmovdqa %xmm1, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 16, !nontemporal !0
  %3 = load <4 x float>, ptr %1, align 16, !nontemporal !0
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 16
  store <4 x float> %3, ptr %4, align 16
  ret void
}
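
; Note: VMOVNTDQA ymm requires a 32-byte aligned operand, so with only
; 16-byte alignment the loads stay as two xmm VMOVNTDQA/MOVNTDQA ops.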

; AVX can't perform NT-store-ymm on 16-byte aligned memory.
; Must be kept separate as VMOVNTPS xmm.
define void @merge_2_v4f32_align16_ntstore(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align16_ntstore:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: movaps 16(%ecx), %xmm1
; X86-NEXT: movntps %xmm0, (%eax)
; X86-NEXT: movntps %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE-LABEL: merge_2_v4f32_align16_ntstore:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movaps (%rdi), %xmm0
; X64-SSE-NEXT: movaps 16(%rdi), %xmm1
; X64-SSE-NEXT: movntps %xmm0, (%rsi)
; X64-SSE-NEXT: movntps %xmm1, 16(%rsi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align16_ntstore:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT: vmovntps %xmm0, (%rsi)
; X64-AVX-NEXT: vmovntps %xmm1, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 16
  %3 = load <4 x float>, ptr %1, align 16
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 16, !nontemporal !0
  store <4 x float> %3, ptr %4, align 16, !nontemporal !0
  ret void
}
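
; Note: likewise VMOVNTPS ymm needs 32-byte alignment, so the NT stores stay
; as two xmm MOVNTPS/VMOVNTPS ops rather than one ymm store.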

; Nothing can perform NT-load-vector on 1-byte aligned memory.
; Just perform regular loads.
define void @merge_2_v4f32_align1_ntload(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align1_ntload:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movups (%ecx), %xmm0
; X86-NEXT: movups 16(%ecx), %xmm1
; X86-NEXT: movups %xmm0, (%eax)
; X86-NEXT: movups %xmm1, 16(%eax)
; X86-NEXT: retl
;
; X64-SSE-LABEL: merge_2_v4f32_align1_ntload:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movups (%rdi), %xmm0
; X64-SSE-NEXT: movups 16(%rdi), %xmm1
; X64-SSE-NEXT: movups %xmm0, (%rsi)
; X64-SSE-NEXT: movups %xmm1, 16(%rsi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1_ntload:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-AVX-NEXT: vmovups %ymm0, (%rsi)
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 1, !nontemporal !0
  %3 = load <4 x float>, ptr %1, align 1, !nontemporal !0
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 1
  store <4 x float> %3, ptr %4, align 1
  ret void
}
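
; Note: an NT load hint may be dropped when it can't be honoured, so these
; become ordinary unaligned loads, which AVX is then free to merge into a
; single ymm vmovups load/store.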

; Nothing can perform NT-store-vector on 1-byte aligned memory.
; Must be scalarized to use MOVNTI/MOVNTSD.
define void @merge_2_v4f32_align1_ntstore(ptr %a0, ptr %a1) nounwind {
; X86-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm1
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, (%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 4(%eax)
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 16(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 20(%eax)
; X86-SSE2-NEXT: retl
;
; X86-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
; X86-SSE4A: # %bb.0:
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4A-NEXT: movsd (%ecx), %xmm0 # xmm0 = mem[0],zero
; X86-SSE4A-NEXT: movsd 8(%ecx), %xmm1 # xmm1 = mem[0],zero
; X86-SSE4A-NEXT: movsd 16(%ecx), %xmm2 # xmm2 = mem[0],zero
; X86-SSE4A-NEXT: movsd 24(%ecx), %xmm3 # xmm3 = mem[0],zero
; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
; X86-SSE4A-NEXT: movntsd %xmm1, 8(%eax)
; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax)
; X86-SSE4A-NEXT: movntsd %xmm2, 16(%eax)
; X86-SSE4A-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, (%rsi)
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 8(%rsi)
; X64-SSE2-NEXT: movq %xmm1, %rax
; X64-SSE2-NEXT: movntiq %rax, 16(%rsi)
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 24(%rsi)
; X64-SSE2-NEXT: retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movsd (%rdi), %xmm0 # xmm0 = mem[0],zero
; X64-SSE4A-NEXT: movsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero
; X64-SSE4A-NEXT: movsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero
; X64-SSE4A-NEXT: movsd 24(%rdi), %xmm3 # xmm3 = mem[0],zero
; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
; X64-SSE4A-NEXT: movntsd %xmm1, 8(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm2, 16(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE41-NEXT: pextrq $1, %xmm0, %rax
; X64-SSE41-NEXT: movntiq %rax, 8(%rsi)
; X64-SSE41-NEXT: movq %xmm0, %rax
; X64-SSE41-NEXT: movntiq %rax, (%rsi)
; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax
; X64-SSE41-NEXT: movntiq %rax, 24(%rsi)
; X64-SSE41-NEXT: movq %xmm1, %rax
; X64-SSE41-NEXT: movntiq %rax, 16(%rsi)
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1_ntstore:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1
; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax
; X64-AVX-NEXT: movntiq %rax, 8(%rsi)
; X64-AVX-NEXT: vmovq %xmm0, %rax
; X64-AVX-NEXT: movntiq %rax, (%rsi)
; X64-AVX-NEXT: vpextrq $1, %xmm1, %rax
; X64-AVX-NEXT: movntiq %rax, 24(%rsi)
; X64-AVX-NEXT: vmovq %xmm1, %rax
; X64-AVX-NEXT: movntiq %rax, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 1
  %3 = load <4 x float>, ptr %1, align 1
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 1, !nontemporal !0
  store <4 x float> %3, ptr %4, align 1, !nontemporal !0
  ret void
}
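
; Note: unlike NT loads, NT store semantics must be preserved, so the
; unaligned vectors are scalarized to units that have unaligned NT
; instructions: 4-byte MOVNTI on i686, 8-byte MOVNTI on x86-64, or 8-byte
; MOVNTSD on SSE4A targets.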

; Nothing can perform NT-load-vector on 1-byte aligned memory.
; Just perform regular loads and scalarize NT-stores.
define void @merge_2_v4f32_align1(ptr %a0, ptr %a1) nounwind {
; X86-SSE2-LABEL: merge_2_v4f32_align1:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm1
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, (%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 4(%eax)
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 16(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 20(%eax)
; X86-SSE2-NEXT: retl
;
; X86-SSE4A-LABEL: merge_2_v4f32_align1:
; X86-SSE4A: # %bb.0:
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4A-NEXT: movsd (%ecx), %xmm0 # xmm0 = mem[0],zero
; X86-SSE4A-NEXT: movsd 8(%ecx), %xmm1 # xmm1 = mem[0],zero
; X86-SSE4A-NEXT: movsd 16(%ecx), %xmm2 # xmm2 = mem[0],zero
; X86-SSE4A-NEXT: movsd 24(%ecx), %xmm3 # xmm3 = mem[0],zero
; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
; X86-SSE4A-NEXT: movntsd %xmm1, 8(%eax)
; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax)
; X86-SSE4A-NEXT: movntsd %xmm2, 16(%eax)
; X86-SSE4A-NEXT: retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align1:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, (%rsi)
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 8(%rsi)
; X64-SSE2-NEXT: movq %xmm1, %rax
; X64-SSE2-NEXT: movntiq %rax, 16(%rsi)
; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT: movq %xmm0, %rax
; X64-SSE2-NEXT: movntiq %rax, 24(%rsi)
; X64-SSE2-NEXT: retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align1:
; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movsd (%rdi), %xmm0 # xmm0 = mem[0],zero
; X64-SSE4A-NEXT: movsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero
; X64-SSE4A-NEXT: movsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero
; X64-SSE4A-NEXT: movsd 24(%rdi), %xmm3 # xmm3 = mem[0],zero
; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
; X64-SSE4A-NEXT: movntsd %xmm1, 8(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm2, 16(%rsi)
; X64-SSE4A-NEXT: retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align1:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE41-NEXT: pextrq $1, %xmm0, %rax
; X64-SSE41-NEXT: movntiq %rax, 8(%rsi)
; X64-SSE41-NEXT: movq %xmm0, %rax
; X64-SSE41-NEXT: movntiq %rax, (%rsi)
; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax
; X64-SSE41-NEXT: movntiq %rax, 24(%rsi)
; X64-SSE41-NEXT: movq %xmm1, %rax
; X64-SSE41-NEXT: movntiq %rax, 16(%rsi)
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1
; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax
; X64-AVX-NEXT: movntiq %rax, 8(%rsi)
; X64-AVX-NEXT: vmovq %xmm0, %rax
; X64-AVX-NEXT: movntiq %rax, (%rsi)
; X64-AVX-NEXT: vpextrq $1, %xmm1, %rax
; X64-AVX-NEXT: movntiq %rax, 24(%rsi)
; X64-AVX-NEXT: vmovq %xmm1, %rax
; X64-AVX-NEXT: movntiq %rax, 16(%rsi)
; X64-AVX-NEXT: retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 1, !nontemporal !0
  %3 = load <4 x float>, ptr %1, align 1, !nontemporal !0
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 1, !nontemporal !0
  store <4 x float> %3, ptr %4, align 1, !nontemporal !0
  ret void
}

!0 = !{i32 1}