; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=X64,X64-SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

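; Extract element 2 of a <2 x i64> load bitcast to <4 x i32> and return it as an i32.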
define i32 @t(<2 x i64>* %val) nounwind {
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,2,3]
; X32-SSE2-NEXT: movd %xmm0, %eax
; X64-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,2,3]
; X64-SSSE3-NEXT: movd %xmm0, %eax
; X64-SSSE3-NEXT: retq
; X64-AVX-NEXT: movl 8(%rdi), %eax
  %tmp2 = load <2 x i64>, <2 x i64>* %val, align 16 ; <<2 x i64>> [#uses=1]
  %tmp3 = bitcast <2 x i64> %tmp2 to <4 x i32> ; <<4 x i32>> [#uses=1]
  %tmp4 = extractelement <4 x i32> %tmp3, i32 2 ; <i32> [#uses=1]
  ret i32 %tmp4
}

; Case where extractelement of load ends up as undef.
; (Making sure this doesn't crash.)
define i32 @t2(<8 x i32>* %xp) {
  %x = load <8 x i32>, <8 x i32>* %xp
  %Shuff68 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3>
  %y = extractelement <8 x i32> %Shuff68, i32 0
  ret i32 %y
}

; This case could easily end up inf-looping in the DAG combiner due to a
; low-alignment load of the vector, which prevents us from reliably forming a
; narrow load.
define void @t3(<2 x double>* %a0) {
; X32-SSE2: # %bb.0: # %bb
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movups (%eax), %xmm0
; X32-SSE2-NEXT: movhps %xmm0, (%eax)
; X64-SSSE3-LABEL: t3:
; X64-SSSE3: # %bb.0: # %bb
; X64-SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-SSSE3-NEXT: movsd %xmm0, (%rax)
; X64-SSSE3-NEXT: retq
; X64-AVX: # %bb.0: # %bb
; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-AVX-NEXT: vmovsd %xmm0, (%rax)
bb:
  %tmp13 = load <2 x double>, <2 x double>* %a0, align 1
  %.sroa.3.24.vec.extract = extractelement <2 x double> %tmp13, i32 1
  store double %.sroa.3.24.vec.extract, double* undef, align 8
  ret void
}

; Case where a load is unary shuffled, then bitcast (to a type with the same
; number of elements) before extractelement.
; This is testing for an assertion - the extraction was assuming that the undef
; second shuffle operand was a post-bitcast type instead of a pre-bitcast type.
define i64 @t4(<2 x double>* %a) {
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movdqa (%eax), %xmm0
; X32-SSE2-NEXT: movd %xmm0, %eax
; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X32-SSE2-NEXT: movd %xmm0, %edx
; X64-NEXT: movq (%rdi), %rax
  %b = load <2 x double>, <2 x double>* %a, align 16
  %c = shufflevector <2 x double> %b, <2 x double> %b, <2 x i32> <i32 1, i32 0>
  %d = bitcast <2 x double> %c to <2 x i64>
  %e = extractelement <2 x i64> %d, i32 1
  ret i64 %e
}

; Don't extract from a volatile.
define void @t5(<2 x double> *%a0, double *%a1) {
; X32-SSE2-LABEL: t5:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movaps (%ecx), %xmm0
; X32-SSE2-NEXT: movhps %xmm0, (%eax)
; X32-SSE2-NEXT: retl
; X64-SSSE3-LABEL: t5:
; X64-SSSE3: # %bb.0:
; X64-SSSE3-NEXT: movaps (%rdi), %xmm0
; X64-SSSE3-NEXT: movhps %xmm0, (%rsi)
; X64-SSSE3-NEXT: retq
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vmovhps %xmm0, (%rsi)
  %vecload = load volatile <2 x double>, <2 x double>* %a0, align 16
  %vecext = extractelement <2 x double> %vecload, i32 1
  store volatile double %vecext, double* %a1, align 8
  ret void
}

; Check for multiuse.
define float @t6(<8 x float> *%a0) {
; X32-SSE2-LABEL: t6:
; X32-SSE2-NEXT: pushl %eax
; X32-SSE2-NEXT: .cfi_def_cfa_offset 8
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movaps (%eax), %xmm0
; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X32-SSE2-NEXT: xorps %xmm1, %xmm1
; X32-SSE2-NEXT: cmpeqss %xmm0, %xmm1
; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE2-NEXT: andps %xmm1, %xmm2
; X32-SSE2-NEXT: andnps %xmm0, %xmm1
; X32-SSE2-NEXT: orps %xmm2, %xmm1
; X32-SSE2-NEXT: movss %xmm1, (%esp)
; X32-SSE2-NEXT: flds (%esp)
; X32-SSE2-NEXT: popl %eax
; X32-SSE2-NEXT: .cfi_def_cfa_offset 4
; X32-SSE2-NEXT: retl
; X64-SSSE3-LABEL: t6:
; X64-SSSE3: # %bb.0:
; X64-SSSE3-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
; X64-SSSE3-NEXT: xorps %xmm0, %xmm0
; X64-SSSE3-NEXT: cmpeqss %xmm1, %xmm0
; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT: andps %xmm0, %xmm2
; X64-SSSE3-NEXT: andnps %xmm1, %xmm0
; X64-SSSE3-NEXT: orps %xmm2, %xmm0
; X64-SSSE3-NEXT: retq
; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1
; X64-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
  %vecload = load <8 x float>, <8 x float>* %a0, align 32
  %vecext = extractelement <8 x float> %vecload, i32 1
  %cmp = fcmp oeq float %vecext, 0.000000e+00
  %cond = select i1 %cmp, float 1.000000e+00, float %vecext
  ret float %cond
}

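; Test for PR43971: extract element 6 of a loaded <8 x float>, compare it
; against zero, and select between it and a float loaded from %a1.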
define void @PR43971(<8 x float> *%a0, float *%a1) {
; X32-SSE2-LABEL: PR43971:
; X32-SSE2: # %bb.0: # %entry
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movaps 16(%ecx), %xmm0
; X32-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X32-SSE2-NEXT: xorps %xmm1, %xmm1
; X32-SSE2-NEXT: cmpltss %xmm0, %xmm1
; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE2-NEXT: andps %xmm1, %xmm2
; X32-SSE2-NEXT: andnps %xmm0, %xmm1
; X32-SSE2-NEXT: orps %xmm2, %xmm1
; X32-SSE2-NEXT: movss %xmm1, (%eax)
; X32-SSE2-NEXT: retl
; X64-SSSE3-LABEL: PR43971:
; X64-SSSE3: # %bb.0: # %entry
; X64-SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT: xorps %xmm1, %xmm1
; X64-SSSE3-NEXT: cmpltss %xmm0, %xmm1
; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT: andps %xmm1, %xmm2
; X64-SSSE3-NEXT: andnps %xmm0, %xmm1
; X64-SSSE3-NEXT: orps %xmm2, %xmm1
; X64-SSSE3-NEXT: movss %xmm1, (%rsi)
; X64-SSSE3-NEXT: retq
; X64-AVX-LABEL: PR43971:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vcmpltss %xmm0, %xmm1, %xmm1
; X64-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; X64-AVX-NEXT: vmovss %xmm0, (%rsi)
entry:
  %0 = load <8 x float>, <8 x float>* %a0, align 32
  %vecext = extractelement <8 x float> %0, i32 6
  %cmp = fcmp ogt float %vecext, 0.000000e+00
  %1 = load float, float* %a1, align 4
  %cond = select i1 %cmp, float %1, float %vecext
  store float %cond, float* %a1, align 4
  ret void
}

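; A second case for PR43971: extract element 1 of a loaded <8 x float>, compare
; it against zero, and select between it and 1.0.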
define float @PR43971_1(<8 x float> *%a0) nounwind {
; X32-SSE2-LABEL: PR43971_1:
; X32-SSE2: # %bb.0: # %entry
; X32-SSE2-NEXT: pushl %eax
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movaps (%eax), %xmm0
; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X32-SSE2-NEXT: xorps %xmm1, %xmm1
; X32-SSE2-NEXT: cmpeqss %xmm0, %xmm1
; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE2-NEXT: andps %xmm1, %xmm2
; X32-SSE2-NEXT: andnps %xmm0, %xmm1
; X32-SSE2-NEXT: orps %xmm2, %xmm1
; X32-SSE2-NEXT: movss %xmm1, (%esp)
; X32-SSE2-NEXT: flds (%esp)
; X32-SSE2-NEXT: popl %eax
; X32-SSE2-NEXT: retl
; X64-SSSE3-LABEL: PR43971_1:
; X64-SSSE3: # %bb.0: # %entry
; X64-SSSE3-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
; X64-SSSE3-NEXT: xorps %xmm0, %xmm0
; X64-SSSE3-NEXT: cmpeqss %xmm1, %xmm0
; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT: andps %xmm0, %xmm2
; X64-SSSE3-NEXT: andnps %xmm1, %xmm0
; X64-SSSE3-NEXT: orps %xmm2, %xmm0
; X64-SSSE3-NEXT: retq
; X64-AVX-LABEL: PR43971_1:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1
; X64-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
entry:
  %0 = load <8 x float>, <8 x float>* %a0, align 32
  %vecext = extractelement <8 x float> %0, i32 1
  %cmp = fcmp oeq float %vecext, 0.000000e+00
  %cond = select i1 %cmp, float 1.000000e+00, float %vecext
  ret float %cond
}

; Test for bad extractions from a VBROADCAST_LOAD of the <2 x i16> non-uniform constant bitcast as <4 x i32>.
define void @subextract_broadcast_load_constant(<2 x i16>* nocapture %0, i16* nocapture %1, i16* nocapture %2) {
; X32-SSE2-LABEL: subextract_broadcast_load_constant:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-SSE2-NEXT: movl $-1583308898, (%edx) # imm = 0xA1A09F9E
; X32-SSE2-NEXT: movw $-24674, (%ecx) # imm = 0x9F9E
; X32-SSE2-NEXT: movw $-24160, (%eax) # imm = 0xA1A0
; X32-SSE2-NEXT: retl
; X64-LABEL: subextract_broadcast_load_constant:
; X64-NEXT: movl $-1583308898, (%rdi) # imm = 0xA1A09F9E
; X64-NEXT: movw $-24674, (%rsi) # imm = 0x9F9E
; X64-NEXT: movw $-24160, (%rdx) # imm = 0xA1A0
  %4 = bitcast <2 x i16>* %0 to i8*
  store i8 -98, i8* %4, align 1
  %5 = getelementptr inbounds i8, i8* %4, i64 1
  store i8 -97, i8* %5, align 1
  %6 = getelementptr inbounds i8, i8* %4, i64 2
  store i8 -96, i8* %6, align 1
  %7 = getelementptr inbounds i8, i8* %4, i64 3
  store i8 -95, i8* %7, align 1
  %8 = load <2 x i16>, <2 x i16>* %0, align 4
  %9 = extractelement <2 x i16> %8, i32 0
  store i16 %9, i16* %1, align 2
  %10 = extractelement <2 x i16> %8, i32 1
  store i16 %10, i16* %2, align 2
  ret void
}