; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2,X64,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2,X64,X64-AVX2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2,X86,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2,X86,X86-AVX2

;------------------------------ 32-bit shuffles -------------------------------;
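; In each test below, both inputs are shifted by the same immediate and the
; results are then shuffled together. Presumably the point is to check whether
; the shuffle can be commuted past the shifts, i.e. (sketch, not a check):
;   shuffle(shift(x, C), shift(y, C)) -> shift(shuffle(x, y), C)
; which would leave a single shift of the shuffled vector.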
define <4 x i32> @shuffle_i32_of_shl_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_shl_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllw $15, %xmm0
; SSE2-NEXT:    psllw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_shl_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsllw $15, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <4 x i32>
  %i4 = bitcast <8 x i16> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}
define <4 x i32> @shuffle_i32_of_lshr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_lshr_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlw $15, %xmm0
; SSE2-NEXT:    psrlw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_lshr_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $15, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <4 x i32>
  %i4 = bitcast <8 x i16> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}
define <4 x i32> @shuffle_i32_of_ashr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_ashr_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psraw $15, %xmm0
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_ashr_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsraw $15, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <4 x i32>
  %i4 = bitcast <8 x i16> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}

define <4 x i32> @shuffle_i32_of_shl_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_shl_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pslld $31, %xmm0
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_shl_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %y, i32 31)
  %i3 = shufflevector <4 x i32> %i1, <4 x i32> %i2, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i3
}
define <4 x i32> @shuffle_i32_of_lshr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_lshr_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrld $31, %xmm0
; SSE2-NEXT:    psrld $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_lshr_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $31, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %y, i32 31)
  %i3 = shufflevector <4 x i32> %i1, <4 x i32> %i2, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i3
}
define <4 x i32> @shuffle_i32_of_ashr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_ashr_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrad $31, %xmm0
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_ashr_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %y, i32 31)
  %i3 = shufflevector <4 x i32> %i1, <4 x i32> %i2, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i3
}

define <4 x i32> @shuffle_i32_of_shl_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_shl_i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllq $63, %xmm0
; SSE2-NEXT:    psllq $63, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_shl_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <4 x i32>
  %i4 = bitcast <2 x i64> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}
define <4 x i32> @shuffle_i32_of_lshr_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_lshr_i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlq $63, %xmm0
; SSE2-NEXT:    psrlq $63, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_lshr_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $63, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlq $63, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <4 x i32>
  %i4 = bitcast <2 x i64> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}
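; Note: @llvm.x86.sse2.psrai.q is not a real intrinsic (x86 has no immediate
; arithmetic shift of 64-bit elements before AVX-512; see the "does not exist"
; declaration at the end of the file), so the calls below are lowered as plain
; external calls and nothing can be folded.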
define <4 x i32> @shuffle_i32_of_ashr_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; X64-SSE2-LABEL: shuffle_i32_of_ashr_i64:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    subq $40, %rsp
; X64-SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-SSE2-NEXT:    movl $63, %edi
; X64-SSE2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; X64-SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-SSE2-NEXT:    movl $63, %edi
; X64-SSE2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-SSE2-NEXT:    shufps $27, (%rsp), %xmm0 # 16-byte Folded Reload
; X64-SSE2-NEXT:    # xmm0 = xmm0[3,2],mem[1,0]
; X64-SSE2-NEXT:    addq $40, %rsp
; X64-SSE2-NEXT:    retq
;
; X64-AVX2-LABEL: shuffle_i32_of_ashr_i64:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    subq $40, %rsp
; X64-AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-AVX2-NEXT:    movl $63, %edi
; X64-AVX2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; X64-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-AVX2-NEXT:    movl $63, %edi
; X64-AVX2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-AVX2-NEXT:    vshufps $27, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; X64-AVX2-NEXT:    # xmm0 = xmm0[3,2],mem[1,0]
; X64-AVX2-NEXT:    addq $40, %rsp
; X64-AVX2-NEXT:    retq
;
; X86-SSE2-LABEL: shuffle_i32_of_ashr_i64:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    subl $32, %esp
; X86-SSE2-NEXT:    movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-SSE2-NEXT:    pushl $63
; X86-SSE2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-SSE2-NEXT:    addl $4, %esp
; X86-SSE2-NEXT:    movups %xmm0, (%esp) # 16-byte Spill
; X86-SSE2-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-SSE2-NEXT:    pushl $63
; X86-SSE2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-SSE2-NEXT:    addl $4, %esp
; X86-SSE2-NEXT:    movups (%esp), %xmm1 # 16-byte Reload
; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[1,0]
; X86-SSE2-NEXT:    addl $32, %esp
; X86-SSE2-NEXT:    retl
;
; X86-AVX2-LABEL: shuffle_i32_of_ashr_i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    subl $32, %esp
; X86-AVX2-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-AVX2-NEXT:    pushl $63
; X86-AVX2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-AVX2-NEXT:    addl $4, %esp
; X86-AVX2-NEXT:    vmovups %xmm0, (%esp) # 16-byte Spill
; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-AVX2-NEXT:    pushl $63
; X86-AVX2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-AVX2-NEXT:    addl $4, %esp
; X86-AVX2-NEXT:    vshufps $27, (%esp), %xmm0, %xmm0 # 16-byte Folded Reload
; X86-AVX2-NEXT:    # xmm0 = xmm0[3,2],mem[1,0]
; X86-AVX2-NEXT:    addl $32, %esp
; X86-AVX2-NEXT:    retl
  %i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <4 x i32>
  %i4 = bitcast <2 x i64> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}

;------------------------------ 64-bit shuffles -------------------------------;
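; Same pattern as above, but shuffling <2 x i64> elements; note that for the
; <3, 0> mask the AVX2 checks match a vpalignr byte rotate rather than a
; vshufps.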

define <2 x i64> @shuffle_i64_of_shl_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_shl_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllw $15, %xmm0
; SSE2-NEXT:    psllw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_shl_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsllw $15, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <2 x i64>
  %i4 = bitcast <8 x i16> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_lshr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_lshr_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlw $15, %xmm0
; SSE2-NEXT:    psrlw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_lshr_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $15, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <2 x i64>
  %i4 = bitcast <8 x i16> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_ashr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_ashr_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psraw $15, %xmm0
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_ashr_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsraw $15, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <2 x i64>
  %i4 = bitcast <8 x i16> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}

define <2 x i64> @shuffle_i64_of_shl_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_shl_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pslld $31, %xmm0
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_shl_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %y, i32 31)
  %i3 = bitcast <4 x i32> %i1 to <2 x i64>
  %i4 = bitcast <4 x i32> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_lshr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_lshr_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrld $31, %xmm0
; SSE2-NEXT:    psrld $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_lshr_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $31, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %y, i32 31)
  %i3 = bitcast <4 x i32> %i1 to <2 x i64>
  %i4 = bitcast <4 x i32> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_ashr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_ashr_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrad $31, %xmm0
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_ashr_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %y, i32 31)
  %i3 = bitcast <4 x i32> %i1 to <2 x i64>
  %i4 = bitcast <4 x i32> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}

define <2 x i64> @shuffle_i64_of_shl_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_shl_i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllq $63, %xmm0
; SSE2-NEXT:    psllq $63, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_shl_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <2 x i64>
  %i4 = bitcast <2 x i64> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_lshr_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_lshr_i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlq $63, %xmm0
; SSE2-NEXT:    psrlq $63, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_lshr_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $63, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlq $63, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <2 x i64>
  %i4 = bitcast <2 x i64> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
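; As with @shuffle_i32_of_ashr_i64 above, the @llvm.x86.sse2.psrai.q calls
; below cannot be folded, so both shifts remain external calls.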
define <2 x i64> @shuffle_i64_of_ashr_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; X64-SSE2-LABEL: shuffle_i64_of_ashr_i64:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    subq $40, %rsp
; X64-SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-SSE2-NEXT:    movl $63, %edi
; X64-SSE2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; X64-SSE2-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-SSE2-NEXT:    movl $63, %edi
; X64-SSE2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-SSE2-NEXT:    shufpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; X64-SSE2-NEXT:    # xmm0 = xmm0[1],mem[0]
; X64-SSE2-NEXT:    addq $40, %rsp
; X64-SSE2-NEXT:    retq
;
; X64-AVX2-LABEL: shuffle_i64_of_ashr_i64:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    subq $40, %rsp
; X64-AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-AVX2-NEXT:    movl $63, %edi
; X64-AVX2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-AVX2-NEXT:    movl $63, %edi
; X64-AVX2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; X64-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; X64-AVX2-NEXT:    addq $40, %rsp
; X64-AVX2-NEXT:    retq
;
; X86-SSE2-LABEL: shuffle_i64_of_ashr_i64:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    subl $32, %esp
; X86-SSE2-NEXT:    movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-SSE2-NEXT:    pushl $63
; X86-SSE2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-SSE2-NEXT:    addl $4, %esp
; X86-SSE2-NEXT:    movups %xmm0, (%esp) # 16-byte Spill
; X86-SSE2-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-SSE2-NEXT:    pushl $63
; X86-SSE2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-SSE2-NEXT:    addl $4, %esp
; X86-SSE2-NEXT:    movups (%esp), %xmm1 # 16-byte Reload
; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
; X86-SSE2-NEXT:    addl $32, %esp
; X86-SSE2-NEXT:    retl
;
; X86-AVX2-LABEL: shuffle_i64_of_ashr_i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    subl $32, %esp
; X86-AVX2-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-AVX2-NEXT:    pushl $63
; X86-AVX2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-AVX2-NEXT:    addl $4, %esp
; X86-AVX2-NEXT:    vmovups %xmm0, (%esp) # 16-byte Spill
; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-AVX2-NEXT:    pushl $63
; X86-AVX2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-AVX2-NEXT:    addl $4, %esp
; X86-AVX2-NEXT:    vmovdqu (%esp), %xmm1 # 16-byte Reload
; X86-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; X86-AVX2-NEXT:    addl $32, %esp
; X86-AVX2-NEXT:    retl
  %i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <2 x i64>
  %i4 = bitcast <2 x i64> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}

declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32)
declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32)
declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32)
declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)
declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32)
declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32)
declare <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64>, i32) ; does not exist
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: