; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X86-SSE,X86-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X64-SSE,X64-SSE2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4,X86-SSE,X86-SSE4
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX,X86-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X86-AVX,X86-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X64-AVX,X64-AVX2

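; Shifting each i64 lane right by 63 leaves a 0/1 value, so the truncation to
; i32 can be lowered with a shift plus a single unsigned pack per 128-bit half.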
define <4 x i32> @trunc_lshr_v4i64(<4 x i64> %a) nounwind {
; SSE2-LABEL: trunc_lshr_v4i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlq $63, %xmm1
; SSE2-NEXT:    psrlq $63, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: trunc_lshr_v4i64:
; SSE4:       # %bb.0:
; SSE4-NEXT:    psrlq $63, %xmm1
; SSE4-NEXT:    psrlq $63, %xmm0
; SSE4-NEXT:    packusdw %xmm1, %xmm0
; SSE4-NEXT:    ret{{[l|q]}}
;
; AVX1-LABEL: trunc_lshr_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlq $63, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlq $63, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: trunc_lshr_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $63, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    ret{{[l|q]}}
  %1 = lshr <4 x i64> %a, <i64 63, i64 63, i64 63, i64 63>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

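; The bitcast between the lshr and the trunc should not block the same
; shift+pack lowering.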
define <8 x i16> @trunc_lshr_v4i64_bitcast(<4 x i64> %a0) {
; SSE2-LABEL: trunc_lshr_v4i64_bitcast:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlq $49, %xmm1
; SSE2-NEXT:    psrlq $49, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: trunc_lshr_v4i64_bitcast:
; SSE4:       # %bb.0:
; SSE4-NEXT:    psrlq $49, %xmm1
; SSE4-NEXT:    psrlq $49, %xmm0
; SSE4-NEXT:    packusdw %xmm1, %xmm0
; SSE4-NEXT:    ret{{[l|q]}}
;
; AVX1-LABEL: trunc_lshr_v4i64_bitcast:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlq $49, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlq $49, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: trunc_lshr_v4i64_bitcast:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $49, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    ret{{[l|q]}}
  %1 = lshr <4 x i64> %a0, <i64 49, i64 49, i64 49, i64 49>
  %2 = bitcast <4 x i64> %1 to <8 x i32>
  %3 = trunc <8 x i32> %2 to <8 x i16>
  ret <8 x i16> %3
}

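; Same pattern on i32 lanes: lshr by 31 then trunc to i16 lowers to psrld
; plus a pack.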
define <8 x i16> @trunc_lshr_v8i32(<8 x i32> %a) nounwind {
; SSE2-LABEL: trunc_lshr_v8i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrld $31, %xmm1
; SSE2-NEXT:    psrld $31, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: trunc_lshr_v8i32:
; SSE4:       # %bb.0:
; SSE4-NEXT:    psrld $31, %xmm1
; SSE4-NEXT:    psrld $31, %xmm0
; SSE4-NEXT:    packusdw %xmm1, %xmm0
; SSE4-NEXT:    ret{{[l|q]}}
;
; AVX1-LABEL: trunc_lshr_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: trunc_lshr_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $31, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    ret{{[l|q]}}
  %1 = lshr <8 x i32> %a, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

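; Only lanes 0 and 2 are actually shifted (the shl/lshr pair isolates bit 0),
; and the splat shuffle only demands those lanes, so the shift pair folds to
; an AND with 1 ahead of the pack.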
define <8 x i16> @trunc_lshr_v4i64_demandedelts(<4 x i64> %a0) {
; SSE2-LABEL: trunc_lshr_v4i64_demandedelts:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: trunc_lshr_v4i64_demandedelts:
; SSE4:       # %bb.0:
; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE4-NEXT:    pmovsxbd {{.*#+}} xmm2 = [1,1,1,1]
; SSE4-NEXT:    pand %xmm2, %xmm1
; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE4-NEXT:    pand %xmm2, %xmm0
; SSE4-NEXT:    packusdw %xmm1, %xmm0
; SSE4-NEXT:    ret{{[l|q]}}
;
; X86-AVX1-LABEL: trunc_lshr_v4i64_demandedelts:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; X86-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X64-AVX1-LABEL: trunc_lshr_v4i64_demandedelts:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; X64-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_lshr_v4i64_demandedelts:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    ret{{[l|q]}}
  %1 = shl <4 x i64> %a0, <i64 63, i64 0, i64 63, i64 0>
  %2 = lshr <4 x i64> %1, <i64 63, i64 0, i64 63, i64 0>
  %3 = bitcast <4 x i64> %2 to <8 x i32>
  %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %5 = trunc <8 x i32> %4 to <8 x i16>
  ret <8 x i16> %5
}

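; Selecting the even bytes of two bitcast lshr results is equivalent to
; packing the shifted inputs directly.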
define <16 x i8> @shuffle_lshr_2v8i16(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: shuffle_lshr_2v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $15, %xmm0
; SSE-NEXT:    psrlw $15, %xmm1
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: shuffle_lshr_2v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $15, %xmm1, %xmm1
; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %lshr0 = lshr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %lshr1 = lshr <8 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %bc0 = bitcast <8 x i16> %lshr0 to <16 x i8>
  %bc1 = bitcast <8 x i16> %lshr1 to <16 x i8>
  %res = shufflevector <16 x i8> %bc0, <16 x i8> %bc1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  ret <16 x i8> %res
}

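; Same interleave on i32 lanes: the even-word shuffle becomes a
; dword-to-word pack.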
define <8 x i16> @shuffle_lshr_2v4i32(<4 x i32> %a0, <4 x i32> %a1) {
; SSE2-LABEL: shuffle_lshr_2v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrld $31, %xmm0
; SSE2-NEXT:    psrld $31, %xmm1
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: shuffle_lshr_2v4i32:
; SSE4:       # %bb.0:
; SSE4-NEXT:    psrld $31, %xmm0
; SSE4-NEXT:    psrld $31, %xmm1
; SSE4-NEXT:    packusdw %xmm1, %xmm0
; SSE4-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: shuffle_lshr_2v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $31, %xmm1, %xmm1
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %lshr0 = lshr <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
  %lshr1 = lshr <4 x i32> %a1, <i32 31, i32 31, i32 31, i32 31>
  %bc0 = bitcast <4 x i32> %lshr0 to <8 x i16>
  %bc1 = bitcast <4 x i32> %lshr1 to <8 x i16>
  %res = shufflevector <8 x i16> %bc0, <8 x i16> %bc1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  ret <8 x i16> %res
}

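; And on i64 lanes: the even-dword shuffle becomes a pack of the quadword
; shifts.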
define <4 x i32> @shuffle_lshr_2v2i64(<2 x i64> %a0, <2 x i64> %a1) {
; SSE2-LABEL: shuffle_lshr_2v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlq $63, %xmm0
; SSE2-NEXT:    psrlq $63, %xmm1
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: shuffle_lshr_2v2i64:
; SSE4:       # %bb.0:
; SSE4-NEXT:    psrlq $63, %xmm0
; SSE4-NEXT:    psrlq $63, %xmm1
; SSE4-NEXT:    packusdw %xmm1, %xmm0
; SSE4-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: shuffle_lshr_2v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlq $63, %xmm0, %xmm0
; AVX-NEXT:    vpsrlq $63, %xmm1, %xmm1
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %lshr0 = lshr <2 x i64> %a0, <i64 63, i64 63>
  %lshr1 = lshr <2 x i64> %a1, <i64 63, i64 63>
  %bc0 = bitcast <2 x i64> %lshr0 to <4 x i32>
  %bc1 = bitcast <2 x i64> %lshr1 to <4 x i32>
  %res = shufflevector <4 x i32> %bc0, <4 x i32> %bc1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x i32> %res
}

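; The float bitcast on the shuffle operands should not prevent the integer
; pack.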
define <4 x float> @shuffle_lshr_2v2i64_bitcast(<2 x i64> %a0, <2 x i64> %a1) {
; SSE2-LABEL: shuffle_lshr_2v2i64_bitcast:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlq $63, %xmm0
; SSE2-NEXT:    psrlq $63, %xmm1
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE4-LABEL: shuffle_lshr_2v2i64_bitcast:
; SSE4:       # %bb.0:
; SSE4-NEXT:    psrlq $63, %xmm0
; SSE4-NEXT:    psrlq $63, %xmm1
; SSE4-NEXT:    packusdw %xmm1, %xmm0
; SSE4-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: shuffle_lshr_2v2i64_bitcast:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlq $63, %xmm0, %xmm0
; AVX-NEXT:    vpsrlq $63, %xmm1, %xmm1
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %lshr0 = lshr <2 x i64> %a0, <i64 63, i64 63>
  %lshr1 = lshr <2 x i64> %a1, <i64 63, i64 63>
  %bc0 = bitcast <2 x i64> %lshr0 to <4 x float>
  %bc1 = bitcast <2 x i64> %lshr1 to <4 x float>
  %res = shufflevector <4 x float> %bc0, <4 x float> %bc1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x float> %res
}

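; icmp-eq-zero results zero-extended to i8 and padded out with zeros;
; lowered via packsswb, a constant mask and a movq.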
define <16 x i8> @packuswb_icmp_zero_128(<8 x i16> %a0) {
; X86-SSE-LABEL: packuswb_icmp_zero_128:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pxor %xmm1, %xmm1
; X86-SSE-NEXT:    pcmpeqw %xmm0, %xmm1
; X86-SSE-NEXT:    packsswb %xmm1, %xmm1
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
; X86-SSE-NEXT:    retl
;
; X64-SSE-LABEL: packuswb_icmp_zero_128:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pxor %xmm1, %xmm1
; X64-SSE-NEXT:    pcmpeqw %xmm0, %xmm1
; X64-SSE-NEXT:    packsswb %xmm1, %xmm1
; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-SSE-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
; X64-SSE-NEXT:    retq
;
; X86-AVX-LABEL: packuswb_icmp_zero_128:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
; X86-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; X86-AVX-NEXT:    retl
;
; X64-AVX-LABEL: packuswb_icmp_zero_128:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
; X64-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; X64-AVX-NEXT:    retq
  %1 = icmp eq <8 x i16> %a0, zeroinitializer
  %2 = zext <8 x i1> %1 to <8 x i8>
  %3 = shufflevector <8 x i8> %2, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %3
}

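; With an explicit trunc the compare results pack directly against the zero
; vector.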
define <16 x i8> @packuswb_icmp_zero_trunc_128(<8 x i16> %a0) {
; SSE-LABEL: packuswb_icmp_zero_trunc_128:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqw %xmm1, %xmm0
; SSE-NEXT:    psrlw $15, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: packuswb_icmp_zero_trunc_128:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = icmp eq <8 x i16> %a0, zeroinitializer
  %2 = zext <8 x i1> %1 to <8 x i16>
  %3 = shufflevector <8 x i16> %2, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %4 = trunc <16 x i16> %3 to <16 x i8>
  ret <16 x i8> %4
}

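; 256-bit version: zeros and compare results interleave per 128-bit half
; through the packs.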
define <32 x i8> @packuswb_icmp_zero_256(<16 x i16> %a0) {
; SSE-LABEL: packuswb_icmp_zero_256:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpeqw %xmm2, %xmm1
; SSE-NEXT:    psrlw $15, %xmm1
; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
; SSE-NEXT:    psrlw $15, %xmm0
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    packuswb %xmm0, %xmm3
; SSE-NEXT:    packuswb %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; X86-AVX1-LABEL: packuswb_icmp_zero_256:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; X86-AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X64-AVX1-LABEL: packuswb_icmp_zero_256:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
; X64-AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; X64-AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-AVX1-NEXT:    retq
;
; AVX2-LABEL: packuswb_icmp_zero_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    ret{{[l|q]}}
  %1 = icmp eq <16 x i16> %a0, zeroinitializer
  %2 = zext <16 x i1> %1 to <16 x i16>
  %3 = bitcast <16 x i16> %2 to <32 x i8>
  %4 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  ret <32 x i8> %4
}

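; 256-bit trunc version: compare, shift down to 0/1, then pack against zero
; (a single ymm pack on AVX2).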
define <32 x i8> @packuswb_icmp_zero_trunc_256(<16 x i16> %a0) {
; SSE-LABEL: packuswb_icmp_zero_trunc_256:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpeqw %xmm2, %xmm1
; SSE-NEXT:    psrlw $15, %xmm1
; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
; SSE-NEXT:    psrlw $15, %xmm0
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    packuswb %xmm0, %xmm3
; SSE-NEXT:    packuswb %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX1-LABEL: packuswb_icmp_zero_trunc_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $15, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: packuswb_icmp_zero_trunc_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    ret{{[l|q]}}
  %1 = icmp eq <16 x i16> %a0, zeroinitializer
  %2 = zext <16 x i1> %1 to <16 x i16>
  %3 = shufflevector <16 x i16> zeroinitializer, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %4 = trunc <32 x i16> %3 to <32 x i8>
  ret <32 x i8> %4
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: