1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-linux -mattr=+sse2 < %s | FileCheck %s --check-prefixes=LIN,LIN-SSE2
3 ; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s --check-prefixes=LIN,LIN-SSE4
4 ; RUN: llc -mtriple=x86_64-win32 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=WIN,WIN-SSE2
5 ; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefixes=WIN,WIN-SSE4
6 ; RUN: llc -mtriple=i686-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN32
9 ; When doing vector gather-scatter index calculation with 32-bit indices,
10 ; minimize shuffling of each individual element out of the index vector.
; foo: takes a double pointer %p and two pointers to <4 x i32> (%i, %h). The two
; vectors are loaded and ANDed, each lane of the result is used as an i32 index
; into %p, and the four loaded doubles are inserted into a <4 x double>.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
12 define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
13 ; LIN-SSE2-LABEL: foo:
15 ; LIN-SSE2-NEXT: movdqa (%rsi), %xmm0
16 ; LIN-SSE2-NEXT: pand (%rdx), %xmm0
17 ; LIN-SSE2-NEXT: movd %xmm0, %eax
18 ; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
19 ; LIN-SSE2-NEXT: movd %xmm1, %ecx
20 ; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
21 ; LIN-SSE2-NEXT: movd %xmm1, %edx
22 ; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
23 ; LIN-SSE2-NEXT: movd %xmm0, %esi
25 ; LIN-SSE2-NEXT: movslq %ecx, %rcx
26 ; LIN-SSE2-NEXT: movslq %edx, %rdx
27 ; LIN-SSE2-NEXT: movslq %esi, %rsi
28 ; LIN-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
29 ; LIN-SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
30 ; LIN-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
31 ; LIN-SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
34 ; LIN-SSE4-LABEL: foo:
36 ; LIN-SSE4-NEXT: movdqa (%rsi), %xmm0
37 ; LIN-SSE4-NEXT: pand (%rdx), %xmm0
38 ; LIN-SSE4-NEXT: movd %xmm0, %eax
39 ; LIN-SSE4-NEXT: pextrd $1, %xmm0, %ecx
40 ; LIN-SSE4-NEXT: pextrd $2, %xmm0, %edx
41 ; LIN-SSE4-NEXT: pextrd $3, %xmm0, %esi
43 ; LIN-SSE4-NEXT: movslq %ecx, %rcx
44 ; LIN-SSE4-NEXT: movslq %edx, %rdx
45 ; LIN-SSE4-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
46 ; LIN-SSE4-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
47 ; LIN-SSE4-NEXT: movslq %esi, %rax
48 ; LIN-SSE4-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
49 ; LIN-SSE4-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
52 ; WIN-SSE2-LABEL: foo:
54 ; WIN-SSE2-NEXT: movdqa (%rdx), %xmm0
55 ; WIN-SSE2-NEXT: pand (%r8), %xmm0
56 ; WIN-SSE2-NEXT: movd %xmm0, %r8d
57 ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
58 ; WIN-SSE2-NEXT: movd %xmm1, %r9d
59 ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
60 ; WIN-SSE2-NEXT: movd %xmm1, %r10d
61 ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
62 ; WIN-SSE2-NEXT: movd %xmm0, %edx
63 ; WIN-SSE2-NEXT: movslq %r8d, %rax
64 ; WIN-SSE2-NEXT: movslq %r9d, %r8
65 ; WIN-SSE2-NEXT: movslq %r10d, %r9
66 ; WIN-SSE2-NEXT: movslq %edx, %rdx
67 ; WIN-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
68 ; WIN-SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
69 ; WIN-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
70 ; WIN-SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
73 ; WIN-SSE4-LABEL: foo:
75 ; WIN-SSE4-NEXT: movdqa (%rdx), %xmm0
76 ; WIN-SSE4-NEXT: pand (%r8), %xmm0
77 ; WIN-SSE4-NEXT: movd %xmm0, %eax
78 ; WIN-SSE4-NEXT: pextrd $1, %xmm0, %edx
79 ; WIN-SSE4-NEXT: pextrd $2, %xmm0, %r8d
80 ; WIN-SSE4-NEXT: pextrd $3, %xmm0, %r9d
82 ; WIN-SSE4-NEXT: movslq %edx, %rdx
83 ; WIN-SSE4-NEXT: movslq %r8d, %r8
84 ; WIN-SSE4-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
85 ; WIN-SSE4-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
86 ; WIN-SSE4-NEXT: movslq %r9d, %rax
87 ; WIN-SSE4-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
88 ; WIN-SSE4-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
93 ; LIN32-NEXT: pushl %edi
94 ; LIN32-NEXT: pushl %esi
95 ; LIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
96 ; LIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
97 ; LIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
98 ; LIN32-NEXT: movdqa (%edx), %xmm0
99 ; LIN32-NEXT: pand (%ecx), %xmm0
100 ; LIN32-NEXT: pextrd $1, %xmm0, %ecx
101 ; LIN32-NEXT: pextrd $2, %xmm0, %edx
102 ; LIN32-NEXT: pextrd $3, %xmm0, %esi
103 ; LIN32-NEXT: movd %xmm0, %edi
104 ; LIN32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
105 ; LIN32-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
106 ; LIN32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
107 ; LIN32-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
108 ; LIN32-NEXT: popl %esi
109 ; LIN32-NEXT: popl %edi
; Build the index vector: element-wise AND of the two loaded <4 x i32> values.
111 %a = load <4 x i32>, <4 x i32>* %i
112 %b = load <4 x i32>, <4 x i32>* %h
113 %j = and <4 x i32> %a, %b
; Extract each 32-bit lane of the index vector as a scalar.
114 %d0 = extractelement <4 x i32> %j, i32 0
115 %d1 = extractelement <4 x i32> %j, i32 1
116 %d2 = extractelement <4 x i32> %j, i32 2
117 %d3 = extractelement <4 x i32> %j, i32 3
; Form the four element addresses off %p. The GEP index type is i32; the 64-bit
; expectations above show movslq on the extracted indices.
118 %q0 = getelementptr double, double* %p, i32 %d0
119 %q1 = getelementptr double, double* %p, i32 %d1
120 %q2 = getelementptr double, double* %p, i32 %d2
121 %q3 = getelementptr double, double* %p, i32 %d3
; Load the four addressed doubles.
122 %r0 = load double, double* %q0
123 %r1 = load double, double* %q1
124 %r2 = load double, double* %q2
125 %r3 = load double, double* %q3
; Insert the loaded values into lanes 0-3 of a <4 x double>, starting from undef.
126 %v0 = insertelement <4 x double> undef, double %r0, i32 0
127 %v1 = insertelement <4 x double> %v0, double %r1, i32 1
128 %v2 = insertelement <4 x double> %v1, double %r2, i32 2
129 %v3 = insertelement <4 x double> %v2, double %r3, i32 3
133 ; Check that the sequence previously used above, which bounces the vector off the
134 ; cache, works for x86-32. Note that in this case it will not be used for index
135 ; calculation, since indexes are 32-bit, not 64.
136 define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind {
137 ; LIN-SSE2-LABEL: old:
139 ; LIN-SSE2-NEXT: movdqa (%rsi), %xmm0
140 ; LIN-SSE2-NEXT: pand (%rdx), %xmm0
141 ; LIN-SSE2-NEXT: movd %xmm0, %eax
142 ; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
143 ; LIN-SSE2-NEXT: movd %xmm1, %edx
144 ; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
145 ; LIN-SSE2-NEXT: movd %xmm1, %esi
146 ; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
147 ; LIN-SSE2-NEXT: movd %xmm0, %edi
148 ; LIN-SSE2-NEXT: andl %ecx, %eax
149 ; LIN-SSE2-NEXT: andl %ecx, %edx
150 ; LIN-SSE2-NEXT: andl %ecx, %esi
151 ; LIN-SSE2-NEXT: andl %ecx, %edi
152 ; LIN-SSE2-NEXT: movq %rax, %xmm0
153 ; LIN-SSE2-NEXT: movq %rdx, %xmm1
154 ; LIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
155 ; LIN-SSE2-NEXT: movq %rdi, %xmm2
156 ; LIN-SSE2-NEXT: movq %rsi, %xmm1
157 ; LIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
158 ; LIN-SSE2-NEXT: retq
160 ; LIN-SSE4-LABEL: old:
162 ; LIN-SSE4-NEXT: movdqa (%rsi), %xmm0
163 ; LIN-SSE4-NEXT: pand (%rdx), %xmm0
164 ; LIN-SSE4-NEXT: movd %xmm0, %eax
165 ; LIN-SSE4-NEXT: pextrd $1, %xmm0, %edx
166 ; LIN-SSE4-NEXT: pextrd $2, %xmm0, %esi
167 ; LIN-SSE4-NEXT: pextrd $3, %xmm0, %edi
168 ; LIN-SSE4-NEXT: andl %ecx, %eax
169 ; LIN-SSE4-NEXT: andl %ecx, %edx
170 ; LIN-SSE4-NEXT: andl %ecx, %esi
171 ; LIN-SSE4-NEXT: andl %ecx, %edi
172 ; LIN-SSE4-NEXT: movq %rdx, %xmm1
173 ; LIN-SSE4-NEXT: movq %rax, %xmm0
174 ; LIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
175 ; LIN-SSE4-NEXT: movq %rdi, %xmm2
176 ; LIN-SSE4-NEXT: movq %rsi, %xmm1
177 ; LIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
178 ; LIN-SSE4-NEXT: retq
180 ; WIN-SSE2-LABEL: old:
182 ; WIN-SSE2-NEXT: movdqa (%rdx), %xmm0
183 ; WIN-SSE2-NEXT: pand (%r8), %xmm0
184 ; WIN-SSE2-NEXT: movd %xmm0, %eax
185 ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
186 ; WIN-SSE2-NEXT: movd %xmm1, %ecx
187 ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
188 ; WIN-SSE2-NEXT: movd %xmm1, %r8d
189 ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
190 ; WIN-SSE2-NEXT: movd %xmm0, %edx
191 ; WIN-SSE2-NEXT: andl %r9d, %eax
192 ; WIN-SSE2-NEXT: andl %r9d, %ecx
193 ; WIN-SSE2-NEXT: andl %r9d, %r8d
194 ; WIN-SSE2-NEXT: andl %r9d, %edx
195 ; WIN-SSE2-NEXT: movq %rax, %xmm0
196 ; WIN-SSE2-NEXT: movq %rcx, %xmm1
197 ; WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
198 ; WIN-SSE2-NEXT: movq %rdx, %xmm2
199 ; WIN-SSE2-NEXT: movq %r8, %xmm1
200 ; WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
201 ; WIN-SSE2-NEXT: retq
203 ; WIN-SSE4-LABEL: old:
205 ; WIN-SSE4-NEXT: movdqa (%rdx), %xmm0
206 ; WIN-SSE4-NEXT: pand (%r8), %xmm0
207 ; WIN-SSE4-NEXT: movd %xmm0, %eax
208 ; WIN-SSE4-NEXT: pextrd $1, %xmm0, %ecx
209 ; WIN-SSE4-NEXT: pextrd $2, %xmm0, %r8d
210 ; WIN-SSE4-NEXT: pextrd $3, %xmm0, %edx
211 ; WIN-SSE4-NEXT: andl %r9d, %eax
212 ; WIN-SSE4-NEXT: andl %r9d, %ecx
213 ; WIN-SSE4-NEXT: andl %r9d, %r8d
214 ; WIN-SSE4-NEXT: andl %r9d, %edx
215 ; WIN-SSE4-NEXT: movq %rcx, %xmm1
216 ; WIN-SSE4-NEXT: movq %rax, %xmm0
217 ; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
218 ; WIN-SSE4-NEXT: movq %rdx, %xmm2
219 ; WIN-SSE4-NEXT: movq %r8, %xmm1
220 ; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
221 ; WIN-SSE4-NEXT: retq
225 ; LIN32-NEXT: pushl %edi
226 ; LIN32-NEXT: pushl %esi
227 ; LIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
228 ; LIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
229 ; LIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
230 ; LIN32-NEXT: movdqa (%edx), %xmm0
231 ; LIN32-NEXT: pand (%ecx), %xmm0
232 ; LIN32-NEXT: movd %xmm0, %ecx
233 ; LIN32-NEXT: pextrd $1, %xmm0, %edx
234 ; LIN32-NEXT: pextrd $2, %xmm0, %esi
235 ; LIN32-NEXT: pextrd $3, %xmm0, %edi
236 ; LIN32-NEXT: andl %eax, %ecx
237 ; LIN32-NEXT: andl %eax, %edx
238 ; LIN32-NEXT: andl %eax, %esi
239 ; LIN32-NEXT: andl %eax, %edi
240 ; LIN32-NEXT: movd %edx, %xmm1
241 ; LIN32-NEXT: movd %ecx, %xmm0
242 ; LIN32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
243 ; LIN32-NEXT: movd %edi, %xmm2
244 ; LIN32-NEXT: movd %esi, %xmm1
245 ; LIN32-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
246 ; LIN32-NEXT: popl %esi
247 ; LIN32-NEXT: popl %edi
249 %a = load <4 x i32>, <4 x i32>* %i
250 %b = load <4 x i32>, <4 x i32>* %h
251 %j = and <4 x i32> %a, %b
252 %d0 = extractelement <4 x i32> %j, i32 0
253 %d1 = extractelement <4 x i32> %j, i32 1
254 %d2 = extractelement <4 x i32> %j, i32 2
255 %d3 = extractelement <4 x i32> %j, i32 3
256 %q0 = zext i32 %d0 to i64
257 %q1 = zext i32 %d1 to i64
258 %q2 = zext i32 %d2 to i64
259 %q3 = zext i32 %d3 to i64
260 %r0 = and i64 %q0, %f
261 %r1 = and i64 %q1, %f
262 %r2 = and i64 %q2, %f
263 %r3 = and i64 %q3, %f
264 %v0 = insertelement <4 x i64> undef, i64 %r0, i32 0
265 %v1 = insertelement <4 x i64> %v0, i64 %r1, i32 1
266 %v2 = insertelement <4 x i64> %v1, i64 %r2, i32 2
267 %v3 = insertelement <4 x i64> %v2, i64 %r3, i32 3