1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
; test1 - <8 x i16>: select(x <s 0, x ^ 0x8000, 0).
; A lane is signed-negative exactly when it is >=u 32768, and flipping the sign
; bit of such a lane equals x - 32768, so the IR computes an unsigned
; saturating subtract of 32768. The assertions expect a single
; psubusw/vpsubusw with a RIP-relative constant operand for both the shared
; SSE prefix and the shared AVX prefix.
10 define <8 x i16> @test1(<8 x i16> %x) nounwind {
12 ; SSE: # %bb.0: # %vector.ph
13 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
17 ; AVX: # %bb.0: # %vector.ph
18 ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; IR under test (sign test + sign-bit flip + select-to-zero).
21 %0 = icmp slt <8 x i16> %x, zeroinitializer
22 %1 = xor <8 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
23 %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
; test2 - <8 x i16>: select(x >u 32766, x + (-32767), 0).
; The compare threshold is one below the subtracted amount, so the IR is an
; unsigned saturating subtract of 32767 written as compare/add/select. The
; assertions expect a single psubusw/vpsubusw with a RIP-relative constant
; operand for both the shared SSE prefix and the shared AVX prefix.
27 define <8 x i16> @test2(<8 x i16> %x) nounwind {
29 ; SSE: # %bb.0: # %vector.ph
30 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
34 ; AVX: # %bb.0: # %vector.ph
35 ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; IR under test (unsigned compare + add of negated constant + select-to-zero).
38 %0 = icmp ugt <8 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
39 %1 = add <8 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
40 %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
; test3 - <8 x i16>: select(x <u splat(w), 0, x - splat(w)), where w is a
; scalar i16 argument broadcast to every lane. This is an unsigned saturating
; subtract with a variable (non-constant) right-hand side.
; The assertions expect one psubusw/vpsubusw after the splat is built; only the
; splat sequence differs per run: movd + pshuflw + pshufd for the SSE runs and
; AVX1, vmovd + vpbroadcastw for AVX2, and vpbroadcastw straight from %edi for
; the AVX512 run.
44 define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
46 ; SSE: # %bb.0: # %vector.ph
47 ; SSE-NEXT: movd %edi, %xmm1
48 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
49 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
50 ; SSE-NEXT: psubusw %xmm1, %xmm0
54 ; AVX1: # %bb.0: # %vector.ph
55 ; AVX1-NEXT: vmovd %edi, %xmm1
56 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
57 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
58 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
62 ; AVX2: # %bb.0: # %vector.ph
63 ; AVX2-NEXT: vmovd %edi, %xmm1
64 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
65 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
68 ; AVX512-LABEL: test3:
69 ; AVX512: # %bb.0: # %vector.ph
70 ; AVX512-NEXT: vpbroadcastw %edi, %xmm1
71 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; IR under test: insertelement + zero-mask shufflevector form the splat of %w.
74 %0 = insertelement <8 x i16> undef, i16 %w, i32 0
75 %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
76 %1 = icmp ult <8 x i16> %x, %broadcast15
77 %2 = sub <8 x i16> %x, %broadcast15
78 %res = select <8 x i1> %1, <8 x i16> zeroinitializer, <8 x i16> %2
; test4 - <16 x i8>: select(x <s 0, x ^ 0x80, 0).
; Byte-element counterpart of the sign-bit pattern: a lane is negative exactly
; when it is >=u 128, and flipping its sign bit then equals x - 128, so the IR
; is an unsigned saturating subtract of 128. The assertions expect a single
; psubusb/vpsubusb with a RIP-relative constant operand for both the shared
; SSE prefix and the shared AVX prefix.
82 define <16 x i8> @test4(<16 x i8> %x) nounwind {
84 ; SSE: # %bb.0: # %vector.ph
85 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
89 ; AVX: # %bb.0: # %vector.ph
90 ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; IR under test (sign test + sign-bit flip + select-to-zero).
93 %0 = icmp slt <16 x i8> %x, zeroinitializer
94 %1 = xor <16 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
95 %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
; test5 - <16 x i8>: select(x >u 126, x + (-127), 0).
; The compare threshold is one below the subtracted amount, so the IR is an
; unsigned saturating subtract of 127 written as compare/add/select. The
; assertions expect a single psubusb/vpsubusb with a RIP-relative constant
; operand for both the shared SSE prefix and the shared AVX prefix.
99 define <16 x i8> @test5(<16 x i8> %x) nounwind {
101 ; SSE: # %bb.0: # %vector.ph
102 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
106 ; AVX: # %bb.0: # %vector.ph
107 ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; IR under test (unsigned compare + add of negated constant + select-to-zero).
110 %0 = icmp ugt <16 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
111 %1 = add <16 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
112 %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
; test6 - <16 x i8>: select(x <u splat(w), 0, x - splat(w)), where w is a
; scalar i8 argument broadcast to every lane (unsigned saturating subtract
; with a variable right-hand side).
; Every run is expected to end in one psubusb/vpsubusb; the byte splat differs
; per run: punpcklbw + pshuflw + pshufd for SSE2, pshufb with an all-zero
; shuffle mask for SSSE3, SSE4.1 and AVX1, vmovd + vpbroadcastb for AVX2, and
; vpbroadcastb straight from %edi for the AVX512 run.
116 define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
118 ; SSE2: # %bb.0: # %vector.ph
119 ; SSE2-NEXT: movd %edi, %xmm1
120 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
121 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
122 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
123 ; SSE2-NEXT: psubusb %xmm1, %xmm0
126 ; SSSE3-LABEL: test6:
127 ; SSSE3: # %bb.0: # %vector.ph
128 ; SSSE3-NEXT: movd %edi, %xmm1
129 ; SSSE3-NEXT: pxor %xmm2, %xmm2
130 ; SSSE3-NEXT: pshufb %xmm2, %xmm1
131 ; SSSE3-NEXT: psubusb %xmm1, %xmm0
134 ; SSE41-LABEL: test6:
135 ; SSE41: # %bb.0: # %vector.ph
136 ; SSE41-NEXT: movd %edi, %xmm1
137 ; SSE41-NEXT: pxor %xmm2, %xmm2
138 ; SSE41-NEXT: pshufb %xmm2, %xmm1
139 ; SSE41-NEXT: psubusb %xmm1, %xmm0
143 ; AVX1: # %bb.0: # %vector.ph
144 ; AVX1-NEXT: vmovd %edi, %xmm1
145 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
146 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
147 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
151 ; AVX2: # %bb.0: # %vector.ph
152 ; AVX2-NEXT: vmovd %edi, %xmm1
153 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
154 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
157 ; AVX512-LABEL: test6:
158 ; AVX512: # %bb.0: # %vector.ph
159 ; AVX512-NEXT: vpbroadcastb %edi, %xmm1
160 ; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; IR under test: insertelement + zero-mask shufflevector form the splat of %w.
163 %0 = insertelement <16 x i8> undef, i8 %w, i32 0
164 %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
165 %1 = icmp ult <16 x i8> %x, %broadcast15
166 %2 = sub <16 x i8> %x, %broadcast15
167 %res = select <16 x i1> %1, <16 x i8> zeroinitializer, <16 x i8> %2
; test7 - <16 x i16>: select(x <s 0, x ^ 0x8000, 0), the 256-bit form of the
; sign-bit pattern (unsigned saturating subtract of 32768).
; The SSE runs are expected to load the 32768 splat once and apply psubusw to
; each 128-bit half (xmm0 and xmm1). AVX1 is expected to extract the high
; half, run vpsubusw on both halves and reinsert. AVX2 and the AVX512 run are
; expected to use a single ymm vpsubusw with a RIP-relative operand.
171 define <16 x i16> @test7(<16 x i16> %x) nounwind {
173 ; SSE: # %bb.0: # %vector.ph
174 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
175 ; SSE-NEXT: psubusw %xmm2, %xmm0
176 ; SSE-NEXT: psubusw %xmm2, %xmm1
180 ; AVX1: # %bb.0: # %vector.ph
181 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
182 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
183 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
184 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
185 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
189 ; AVX2: # %bb.0: # %vector.ph
190 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
193 ; AVX512-LABEL: test7:
194 ; AVX512: # %bb.0: # %vector.ph
195 ; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; IR under test (sign test + sign-bit flip + select-to-zero).
198 %0 = icmp slt <16 x i16> %x, zeroinitializer
199 %1 = xor <16 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
200 %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
; test8 - <16 x i16>: select(x >u 32766, x + (-32767), 0), the 256-bit form of
; the compare/add/select pattern (unsigned saturating subtract of 32767).
; The SSE runs are expected to load the 32767 splat once and apply psubusw to
; each 128-bit half. AVX1 is expected to extract the high half, run vpsubusw
; on both halves and reinsert. AVX2 and the AVX512 run are expected to use a
; single ymm vpsubusw with a RIP-relative operand.
204 define <16 x i16> @test8(<16 x i16> %x) nounwind {
206 ; SSE: # %bb.0: # %vector.ph
207 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
208 ; SSE-NEXT: psubusw %xmm2, %xmm0
209 ; SSE-NEXT: psubusw %xmm2, %xmm1
213 ; AVX1: # %bb.0: # %vector.ph
214 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
215 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
216 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
217 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
218 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
222 ; AVX2: # %bb.0: # %vector.ph
223 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
226 ; AVX512-LABEL: test8:
227 ; AVX512: # %bb.0: # %vector.ph
228 ; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; IR under test (unsigned compare + add of negated constant + select-to-zero).
231 %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
232 %1 = add <16 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
233 %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
; test8a - <16 x i16>: same compare/add/select shape as a uniform-constant
; saturating subtract, but with a different constant in every lane. For lane i
; (0-based) the threshold is 32766 - i and the addend is -(32767 - i), so each
; lane is an unsigned saturating subtract of 32767 - i.
; Because the two 128-bit halves need different constants, the SSE runs and
; AVX1 are expected to use a separate RIP-relative operand per half. AVX2 and
; the AVX512 run are expected to use one ymm vpsubusw.
237 define <16 x i16> @test8a(<16 x i16> %x) nounwind {
239 ; SSE: # %bb.0: # %vector.ph
240 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
241 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm1
244 ; AVX1-LABEL: test8a:
245 ; AVX1: # %bb.0: # %vector.ph
246 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm1
247 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
248 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
249 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
252 ; AVX2-LABEL: test8a:
253 ; AVX2: # %bb.0: # %vector.ph
254 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
257 ; AVX512-LABEL: test8a:
258 ; AVX512: # %bb.0: # %vector.ph
259 ; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; IR under test (per-lane thresholds paired with per-lane negated addends).
262 %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32765, i16 32764, i16 32763, i16 32762, i16 32761, i16 32760, i16 32759, i16 32758, i16 32757, i16 32756, i16 32755, i16 32754, i16 32753, i16 32752, i16 32751>
263 %1 = add <16 x i16> %x, <i16 -32767, i16 -32766, i16 -32765, i16 -32764, i16 -32763, i16 -32762, i16 -32761, i16 -32760, i16 -32759, i16 -32758, i16 -32757, i16 -32756, i16 -32755, i16 -32754, i16 -32753, i16 -32752>
264 %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
; test9 - <16 x i16>: select(x <u splat(w), 0, x - splat(w)), the 256-bit form
; of the variable right-hand-side unsigned saturating subtract.
; The SSE runs are expected to build the word splat once in xmm2 and apply
; psubusw to both 128-bit halves. AVX1 is expected to build an xmm splat and
; process the extracted high half and the low half separately. AVX2 is
; expected to use vmovd + vpbroadcastw into ymm, and the AVX512 run is
; expected to broadcast straight from %edi into ymm.
268 define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
270 ; SSE: # %bb.0: # %vector.ph
271 ; SSE-NEXT: movd %edi, %xmm2
272 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
273 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
274 ; SSE-NEXT: psubusw %xmm2, %xmm0
275 ; SSE-NEXT: psubusw %xmm2, %xmm1
279 ; AVX1: # %bb.0: # %vector.ph
280 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
281 ; AVX1-NEXT: vmovd %edi, %xmm2
282 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
283 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
284 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
285 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
286 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
290 ; AVX2: # %bb.0: # %vector.ph
291 ; AVX2-NEXT: vmovd %edi, %xmm1
292 ; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
293 ; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
296 ; AVX512-LABEL: test9:
297 ; AVX512: # %bb.0: # %vector.ph
298 ; AVX512-NEXT: vpbroadcastw %edi, %ymm1
299 ; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
; IR under test: insertelement + zero-mask shufflevector form the splat of %w.
302 %0 = insertelement <16 x i16> undef, i16 %w, i32 0
303 %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
304 %1 = icmp ult <16 x i16> %x, %broadcast15
305 %2 = sub <16 x i16> %x, %broadcast15
306 %res = select <16 x i1> %1, <16 x i16> zeroinitializer, <16 x i16> %2
; test10 - <32 x i8>: select(x <s 0, x ^ 0x80, 0), the 256-bit byte form of the
; sign-bit pattern (unsigned saturating subtract of 128).
; The SSE runs are expected to load the 128 splat once and apply psubusb to
; each 128-bit half. AVX1 is expected to extract the high half, run vpsubusb
; on both halves and reinsert. AVX2 and the AVX512 run are expected to use a
; single ymm vpsubusb with a RIP-relative operand.
310 define <32 x i8> @test10(<32 x i8> %x) nounwind {
312 ; SSE: # %bb.0: # %vector.ph
313 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
314 ; SSE-NEXT: psubusb %xmm2, %xmm0
315 ; SSE-NEXT: psubusb %xmm2, %xmm1
318 ; AVX1-LABEL: test10:
319 ; AVX1: # %bb.0: # %vector.ph
320 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
321 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
322 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
323 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
324 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
327 ; AVX2-LABEL: test10:
328 ; AVX2: # %bb.0: # %vector.ph
329 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
332 ; AVX512-LABEL: test10:
333 ; AVX512: # %bb.0: # %vector.ph
334 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; IR under test (sign test + sign-bit flip + select-to-zero).
337 %0 = icmp slt <32 x i8> %x, zeroinitializer
338 %1 = xor <32 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
339 %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
; test11 - <32 x i8>: select(x >u 126, x + (-127), 0), the 256-bit byte form of
; the compare/add/select pattern (unsigned saturating subtract of 127).
; The SSE runs are expected to load the 127 splat once and apply psubusb to
; each 128-bit half. AVX1 is expected to extract the high half, run vpsubusb
; on both halves and reinsert. AVX2 and the AVX512 run are expected to use a
; single ymm vpsubusb with a RIP-relative operand.
343 define <32 x i8> @test11(<32 x i8> %x) nounwind {
345 ; SSE: # %bb.0: # %vector.ph
346 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
347 ; SSE-NEXT: psubusb %xmm2, %xmm0
348 ; SSE-NEXT: psubusb %xmm2, %xmm1
351 ; AVX1-LABEL: test11:
352 ; AVX1: # %bb.0: # %vector.ph
353 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
354 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
355 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
356 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
357 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
360 ; AVX2-LABEL: test11:
361 ; AVX2: # %bb.0: # %vector.ph
362 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
365 ; AVX512-LABEL: test11:
366 ; AVX512: # %bb.0: # %vector.ph
367 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; IR under test (unsigned compare + add of negated constant + select-to-zero).
370 %0 = icmp ugt <32 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
371 %1 = add <32 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
372 %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
; test11a - <32 x i8>: compare/add/select with a different constant in every
; lane. For lane i (0-based) the threshold is 126 - i and the addend is
; -(127 - i), so each lane is an unsigned saturating subtract of 127 - i.
; Because the two 128-bit halves need different constants, the SSE runs and
; AVX1 are expected to use a separate RIP-relative operand per half. AVX2 and
; the AVX512 run are expected to use one ymm vpsubusb.
376 define <32 x i8> @test11a(<32 x i8> %x) nounwind {
377 ; SSE-LABEL: test11a:
378 ; SSE: # %bb.0: # %vector.ph
379 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
380 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm1
383 ; AVX1-LABEL: test11a:
384 ; AVX1: # %bb.0: # %vector.ph
385 ; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm1
386 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
387 ; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
388 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
391 ; AVX2-LABEL: test11a:
392 ; AVX2: # %bb.0: # %vector.ph
393 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
396 ; AVX512-LABEL: test11a:
397 ; AVX512: # %bb.0: # %vector.ph
398 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; IR under test (per-lane thresholds paired with per-lane negated addends).
401 %0 = icmp ugt <32 x i8> %x, <i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 117, i8 116, i8 115, i8 114, i8 113, i8 112, i8 111, i8 110, i8 109, i8 108, i8 107, i8 106, i8 105, i8 104, i8 103, i8 102, i8 101, i8 100, i8 99, i8 98, i8 97, i8 96, i8 95>
402 %1 = add <32 x i8> %x, <i8 -127, i8 -126, i8 -125, i8 -124, i8 -123, i8 -122, i8 -121, i8 -120, i8 -119, i8 -118, i8 -117, i8 -116, i8 -115, i8 -114, i8 -113, i8 -112, i8 -111, i8 -110, i8 -109, i8 -108, i8 -107, i8 -106, i8 -105, i8 -104, i8 -103, i8 -102, i8 -101, i8 -100, i8 -99, i8 -98, i8 -97, i8 -96>
403 %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
; test12 - <32 x i8>: select(x <u splat(w), 0, x - splat(w)), the 256-bit byte
; form of the variable right-hand-side unsigned saturating subtract.
; Every run is expected to end in psubusb/vpsubusb (two 128-bit ops before
; AVX2, one ymm op from AVX2 on). The byte splat differs per run:
; punpcklbw + pshuflw + pshufd for SSE2, pshufb with an all-zero mask for
; SSSE3, SSE4.1 and AVX1, vmovd + vpbroadcastb for AVX2, and vpbroadcastb
; straight from %edi for the AVX512 run.
407 define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
408 ; SSE2-LABEL: test12:
409 ; SSE2: # %bb.0: # %vector.ph
410 ; SSE2-NEXT: movd %edi, %xmm2
411 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
412 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
413 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
414 ; SSE2-NEXT: psubusb %xmm2, %xmm0
415 ; SSE2-NEXT: psubusb %xmm2, %xmm1
418 ; SSSE3-LABEL: test12:
419 ; SSSE3: # %bb.0: # %vector.ph
420 ; SSSE3-NEXT: movd %edi, %xmm2
421 ; SSSE3-NEXT: pxor %xmm3, %xmm3
422 ; SSSE3-NEXT: pshufb %xmm3, %xmm2
423 ; SSSE3-NEXT: psubusb %xmm2, %xmm0
424 ; SSSE3-NEXT: psubusb %xmm2, %xmm1
427 ; SSE41-LABEL: test12:
428 ; SSE41: # %bb.0: # %vector.ph
429 ; SSE41-NEXT: movd %edi, %xmm2
430 ; SSE41-NEXT: pxor %xmm3, %xmm3
431 ; SSE41-NEXT: pshufb %xmm3, %xmm2
432 ; SSE41-NEXT: psubusb %xmm2, %xmm0
433 ; SSE41-NEXT: psubusb %xmm2, %xmm1
436 ; AVX1-LABEL: test12:
437 ; AVX1: # %bb.0: # %vector.ph
438 ; AVX1-NEXT: vmovd %edi, %xmm1
439 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
440 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
441 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
442 ; AVX1-NEXT: vpsubusb %xmm1, %xmm2, %xmm2
443 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
444 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
447 ; AVX2-LABEL: test12:
448 ; AVX2: # %bb.0: # %vector.ph
449 ; AVX2-NEXT: vmovd %edi, %xmm1
450 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
451 ; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
454 ; AVX512-LABEL: test12:
455 ; AVX512: # %bb.0: # %vector.ph
456 ; AVX512-NEXT: vpbroadcastb %edi, %ymm1
457 ; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
; IR under test: insertelement + zero-mask shufflevector form the splat of %w.
460 %0 = insertelement <32 x i8> undef, i8 %w, i32 0
461 %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
462 %1 = icmp ult <32 x i8> %x, %broadcast15
463 %2 = sub <32 x i8> %x, %broadcast15
464 %res = select <32 x i1> %1, <32 x i8> zeroinitializer, <32 x i8> %2
; test13 - mixed-width pattern: %x is <8 x i16> but %y is <8 x i32>.
; The IR zero-extends %x to i32, computes select(zext(x) <u y, 0,
; trunc(zext(x) - y)) and returns <8 x i16>.
; The assertions recorded here do not contain a psubus instruction; each run
; is expected to emit the expanded sequence instead:
;   - SSE2 and SSSE3 bias both operands with the 0x80000000 splat (pxor / por)
;     and use signed pcmpgtd for the unsigned compare;
;   - SSE4.1, AVX1 and AVX2 derive the compare from pmaxud + pcmpeqd and
;     invert it with an all-ones register;
;   - the AVX512 run compares into mask register k1 (vpcmpnltud) and applies
;     it as a zeroing mask on the truncating vpmovdw.
; In the non-AVX512 runs the i32 difference (psubd) is narrowed back to i16
; lanes and combined with the packed compare mask through pandn.
468 define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
469 ; SSE2-LABEL: test13:
470 ; SSE2: # %bb.0: # %vector.ph
471 ; SSE2-NEXT: pxor %xmm4, %xmm4
472 ; SSE2-NEXT: movdqa %xmm0, %xmm3
473 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
474 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
475 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
476 ; SSE2-NEXT: movdqa %xmm0, %xmm5
477 ; SSE2-NEXT: psubd %xmm2, %xmm0
478 ; SSE2-NEXT: movdqa %xmm2, %xmm6
479 ; SSE2-NEXT: pxor %xmm4, %xmm6
480 ; SSE2-NEXT: por %xmm4, %xmm5
481 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
482 ; SSE2-NEXT: movdqa %xmm1, %xmm2
483 ; SSE2-NEXT: pxor %xmm4, %xmm2
484 ; SSE2-NEXT: por %xmm3, %xmm4
485 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
486 ; SSE2-NEXT: packssdw %xmm6, %xmm2
487 ; SSE2-NEXT: psubd %xmm1, %xmm3
488 ; SSE2-NEXT: pslld $16, %xmm0
489 ; SSE2-NEXT: psrad $16, %xmm0
490 ; SSE2-NEXT: pslld $16, %xmm3
491 ; SSE2-NEXT: psrad $16, %xmm3
492 ; SSE2-NEXT: packssdw %xmm0, %xmm3
493 ; SSE2-NEXT: pandn %xmm3, %xmm2
494 ; SSE2-NEXT: movdqa %xmm2, %xmm0
497 ; SSSE3-LABEL: test13:
498 ; SSSE3: # %bb.0: # %vector.ph
499 ; SSSE3-NEXT: pxor %xmm3, %xmm3
500 ; SSSE3-NEXT: movdqa %xmm0, %xmm4
501 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
502 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
503 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
504 ; SSSE3-NEXT: movdqa %xmm0, %xmm5
505 ; SSSE3-NEXT: psubd %xmm2, %xmm0
506 ; SSSE3-NEXT: movdqa %xmm2, %xmm6
507 ; SSSE3-NEXT: pxor %xmm3, %xmm6
508 ; SSSE3-NEXT: por %xmm3, %xmm5
509 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
510 ; SSSE3-NEXT: movdqa %xmm1, %xmm2
511 ; SSSE3-NEXT: pxor %xmm3, %xmm2
512 ; SSSE3-NEXT: por %xmm4, %xmm3
513 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
514 ; SSSE3-NEXT: packssdw %xmm6, %xmm2
515 ; SSSE3-NEXT: psubd %xmm1, %xmm4
516 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
517 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
518 ; SSSE3-NEXT: pshufb %xmm1, %xmm4
519 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
520 ; SSSE3-NEXT: pandn %xmm4, %xmm2
521 ; SSSE3-NEXT: movdqa %xmm2, %xmm0
524 ; SSE41-LABEL: test13:
525 ; SSE41: # %bb.0: # %vector.ph
526 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
527 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
528 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
529 ; SSE41-NEXT: movdqa %xmm4, %xmm0
530 ; SSE41-NEXT: pmaxud %xmm1, %xmm0
531 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
532 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
533 ; SSE41-NEXT: pxor %xmm5, %xmm0
534 ; SSE41-NEXT: movdqa %xmm3, %xmm6
535 ; SSE41-NEXT: pmaxud %xmm2, %xmm6
536 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
537 ; SSE41-NEXT: pxor %xmm5, %xmm6
538 ; SSE41-NEXT: packssdw %xmm6, %xmm0
539 ; SSE41-NEXT: psubd %xmm2, %xmm3
540 ; SSE41-NEXT: psubd %xmm1, %xmm4
541 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
542 ; SSE41-NEXT: pshufb %xmm1, %xmm4
543 ; SSE41-NEXT: pshufb %xmm1, %xmm3
544 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
545 ; SSE41-NEXT: pandn %xmm4, %xmm0
548 ; AVX1-LABEL: test13:
549 ; AVX1: # %bb.0: # %vector.ph
550 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
551 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
552 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
553 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
554 ; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm4
555 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4
556 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
557 ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
558 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm6
559 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm0, %xmm6
560 ; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm5
561 ; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4
562 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
563 ; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm1
564 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
565 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
566 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
567 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
568 ; AVX1-NEXT: vpandn %xmm0, %xmm4, %xmm0
569 ; AVX1-NEXT: vzeroupper
572 ; AVX2-LABEL: test13:
573 ; AVX2: # %bb.0: # %vector.ph
574 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
575 ; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm2
576 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2
577 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
578 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
579 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
580 ; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
581 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
582 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
583 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
584 ; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0
585 ; AVX2-NEXT: vzeroupper
588 ; AVX512-LABEL: test13:
589 ; AVX512: # %bb.0: # %vector.ph
590 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
591 ; AVX512-NEXT: vpcmpnltud %ymm1, %ymm0, %k1
592 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
593 ; AVX512-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
594 ; AVX512-NEXT: vzeroupper
; IR under test: widen %x, compare and subtract in i32, then truncate the
; difference and zero the lanes where zext(x) <u y.
597 %lhs = zext <8 x i16> %x to <8 x i32>
598 %cond = icmp ult <8 x i32> %lhs, %y
599 %sub = sub <8 x i32> %lhs, %y
600 %trunc = trunc <8 x i32> %sub to <8 x i16>
601 %res = select <8 x i1> %cond, <8 x i16> zeroinitializer, <8 x i16> %trunc
605 define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
606 ; SSE2-LABEL: test14:
607 ; SSE2: # %bb.0: # %vector.ph
608 ; SSE2-NEXT: movdqa %xmm0, %xmm5
609 ; SSE2-NEXT: pxor %xmm0, %xmm0
610 ; SSE2-NEXT: movdqa %xmm5, %xmm6
611 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
612 ; SSE2-NEXT: movdqa %xmm6, %xmm8
613 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
614 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
615 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
616 ; SSE2-NEXT: movdqa %xmm5, %xmm10
617 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
618 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
619 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
620 ; SSE2-NEXT: movdqa %xmm4, %xmm9
621 ; SSE2-NEXT: pxor %xmm0, %xmm9
622 ; SSE2-NEXT: psubd %xmm5, %xmm4
623 ; SSE2-NEXT: por %xmm0, %xmm5
624 ; SSE2-NEXT: pcmpgtd %xmm9, %xmm5
625 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255]
626 ; SSE2-NEXT: pand %xmm9, %xmm5
627 ; SSE2-NEXT: movdqa %xmm3, %xmm7
628 ; SSE2-NEXT: pxor %xmm0, %xmm7
629 ; SSE2-NEXT: psubd %xmm10, %xmm3
630 ; SSE2-NEXT: por %xmm0, %xmm10
631 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm10
632 ; SSE2-NEXT: pand %xmm9, %xmm10
633 ; SSE2-NEXT: packuswb %xmm5, %xmm10
634 ; SSE2-NEXT: movdqa %xmm2, %xmm5
635 ; SSE2-NEXT: pxor %xmm0, %xmm5
636 ; SSE2-NEXT: psubd %xmm6, %xmm2
637 ; SSE2-NEXT: por %xmm0, %xmm6
638 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
639 ; SSE2-NEXT: pand %xmm9, %xmm6
640 ; SSE2-NEXT: movdqa %xmm1, %xmm5
641 ; SSE2-NEXT: pxor %xmm0, %xmm5
642 ; SSE2-NEXT: por %xmm8, %xmm0
643 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
644 ; SSE2-NEXT: pand %xmm9, %xmm0
645 ; SSE2-NEXT: packuswb %xmm6, %xmm0
646 ; SSE2-NEXT: packuswb %xmm10, %xmm0
647 ; SSE2-NEXT: psubd %xmm8, %xmm1
648 ; SSE2-NEXT: pand %xmm9, %xmm4
649 ; SSE2-NEXT: pand %xmm9, %xmm3
650 ; SSE2-NEXT: packuswb %xmm4, %xmm3
651 ; SSE2-NEXT: pand %xmm9, %xmm2
652 ; SSE2-NEXT: pand %xmm9, %xmm1
653 ; SSE2-NEXT: packuswb %xmm2, %xmm1
654 ; SSE2-NEXT: packuswb %xmm3, %xmm1
655 ; SSE2-NEXT: pandn %xmm1, %xmm0
658 ; SSSE3-LABEL: test14:
659 ; SSSE3: # %bb.0: # %vector.ph
660 ; SSSE3-NEXT: pxor %xmm7, %xmm7
661 ; SSSE3-NEXT: movdqa %xmm0, %xmm11
662 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15]
663 ; SSSE3-NEXT: movdqa %xmm11, %xmm8
664 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
665 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
666 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
667 ; SSSE3-NEXT: movdqa %xmm0, %xmm10
668 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
669 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
670 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
671 ; SSSE3-NEXT: movdqa %xmm2, %xmm9
672 ; SSSE3-NEXT: pxor %xmm7, %xmm9
673 ; SSSE3-NEXT: psubd %xmm0, %xmm2
674 ; SSSE3-NEXT: movdqa %xmm0, %xmm5
675 ; SSSE3-NEXT: por %xmm7, %xmm5
676 ; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5
677 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
678 ; SSSE3-NEXT: pshufb %xmm9, %xmm5
679 ; SSSE3-NEXT: movdqa %xmm1, %xmm6
680 ; SSSE3-NEXT: pxor %xmm7, %xmm6
681 ; SSSE3-NEXT: psubd %xmm10, %xmm1
682 ; SSSE3-NEXT: movdqa %xmm10, %xmm0
683 ; SSSE3-NEXT: por %xmm7, %xmm0
684 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
685 ; SSSE3-NEXT: pshufb %xmm9, %xmm0
686 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
687 ; SSSE3-NEXT: movdqa %xmm4, %xmm5
688 ; SSSE3-NEXT: pxor %xmm7, %xmm5
689 ; SSSE3-NEXT: psubd %xmm11, %xmm4
690 ; SSSE3-NEXT: por %xmm7, %xmm11
691 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm11
692 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
693 ; SSSE3-NEXT: pshufb %xmm5, %xmm11
694 ; SSSE3-NEXT: movdqa %xmm3, %xmm6
695 ; SSSE3-NEXT: pxor %xmm7, %xmm6
696 ; SSSE3-NEXT: por %xmm8, %xmm7
697 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
698 ; SSSE3-NEXT: pshufb %xmm5, %xmm7
699 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
700 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
701 ; SSSE3-NEXT: psubd %xmm8, %xmm3
702 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
703 ; SSSE3-NEXT: pand %xmm5, %xmm4
704 ; SSSE3-NEXT: pand %xmm5, %xmm3
705 ; SSSE3-NEXT: packuswb %xmm4, %xmm3
706 ; SSSE3-NEXT: pand %xmm5, %xmm2
707 ; SSSE3-NEXT: pand %xmm5, %xmm1
708 ; SSSE3-NEXT: packuswb %xmm2, %xmm1
709 ; SSSE3-NEXT: packuswb %xmm3, %xmm1
710 ; SSSE3-NEXT: pandn %xmm1, %xmm0
713 ; SSE41-LABEL: test14:
714 ; SSE41: # %bb.0: # %vector.ph
715 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
716 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
717 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
718 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
719 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
720 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
721 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
722 ; SSE41-NEXT: movdqa %xmm4, %xmm0
723 ; SSE41-NEXT: pmaxud %xmm10, %xmm0
724 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
725 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
726 ; SSE41-NEXT: pxor %xmm6, %xmm0
727 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
728 ; SSE41-NEXT: pshufb %xmm7, %xmm0
729 ; SSE41-NEXT: movdqa %xmm3, %xmm5
730 ; SSE41-NEXT: pmaxud %xmm9, %xmm5
731 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
732 ; SSE41-NEXT: pxor %xmm6, %xmm5
733 ; SSE41-NEXT: pshufb %xmm7, %xmm5
734 ; SSE41-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
735 ; SSE41-NEXT: movdqa %xmm1, %xmm0
736 ; SSE41-NEXT: pmaxud %xmm8, %xmm0
737 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
738 ; SSE41-NEXT: pxor %xmm6, %xmm0
739 ; SSE41-NEXT: movdqa {{.*#+}} xmm12 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
740 ; SSE41-NEXT: pshufb %xmm12, %xmm0
741 ; SSE41-NEXT: movdqa %xmm2, %xmm7
742 ; SSE41-NEXT: pmaxud %xmm11, %xmm7
743 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
744 ; SSE41-NEXT: pxor %xmm6, %xmm7
745 ; SSE41-NEXT: pshufb %xmm12, %xmm7
746 ; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
747 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
748 ; SSE41-NEXT: psubd %xmm11, %xmm2
749 ; SSE41-NEXT: psubd %xmm8, %xmm1
750 ; SSE41-NEXT: psubd %xmm9, %xmm3
751 ; SSE41-NEXT: psubd %xmm10, %xmm4
752 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
753 ; SSE41-NEXT: pand %xmm5, %xmm4
754 ; SSE41-NEXT: pand %xmm5, %xmm3
755 ; SSE41-NEXT: packusdw %xmm4, %xmm3
756 ; SSE41-NEXT: pand %xmm5, %xmm1
757 ; SSE41-NEXT: pand %xmm5, %xmm2
758 ; SSE41-NEXT: packusdw %xmm2, %xmm1
759 ; SSE41-NEXT: packuswb %xmm3, %xmm1
760 ; SSE41-NEXT: pandn %xmm1, %xmm0
763 ; AVX1-LABEL: test14:
764 ; AVX1: # %bb.0: # %vector.ph
765 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
766 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
767 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
768 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
769 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
770 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
771 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
772 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
773 ; AVX1-NEXT: vpmaxud %xmm0, %xmm6, %xmm7
774 ; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm7
775 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
776 ; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7
777 ; AVX1-NEXT: vpmaxud %xmm11, %xmm2, %xmm4
778 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4
779 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
780 ; AVX1-NEXT: vpackssdw %xmm7, %xmm4, %xmm10
781 ; AVX1-NEXT: vpmaxud %xmm9, %xmm1, %xmm7
782 ; AVX1-NEXT: vpcmpeqd %xmm7, %xmm1, %xmm7
783 ; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7
784 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
785 ; AVX1-NEXT: vpmaxud %xmm8, %xmm4, %xmm5
786 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm5
787 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm3
788 ; AVX1-NEXT: vpackssdw %xmm3, %xmm7, %xmm3
789 ; AVX1-NEXT: vpacksswb %xmm10, %xmm3, %xmm3
790 ; AVX1-NEXT: vpsubd %xmm8, %xmm4, %xmm4
791 ; AVX1-NEXT: vpsubd %xmm9, %xmm1, %xmm1
792 ; AVX1-NEXT: vpsubd %xmm11, %xmm2, %xmm2
793 ; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0
794 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255]
795 ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
796 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
797 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
798 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
799 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm2
800 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
801 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
802 ; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
803 ; AVX1-NEXT: vzeroupper
806 ; AVX2-LABEL: test14:
807 ; AVX2: # %bb.0: # %vector.ph
808 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
809 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
810 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
811 ; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm4
812 ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm4
813 ; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
814 ; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4
815 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
816 ; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
817 ; AVX2-NEXT: vpmaxud %ymm3, %ymm2, %ymm6
818 ; AVX2-NEXT: vpcmpeqd %ymm6, %ymm2, %ymm6
819 ; AVX2-NEXT: vpxor %ymm5, %ymm6, %ymm5
820 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
821 ; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
822 ; AVX2-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
823 ; AVX2-NEXT: vpsubd %ymm3, %ymm2, %ymm2
824 ; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
825 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
826 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
827 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
828 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
829 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
830 ; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1
831 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
832 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
833 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
834 ; AVX2-NEXT: vpandn %xmm0, %xmm4, %xmm0
835 ; AVX2-NEXT: vzeroupper
838 ; AVX512-LABEL: test14:
839 ; AVX512: # %bb.0: # %vector.ph
840 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
841 ; AVX512-NEXT: vpcmpnltud %zmm0, %zmm1, %k1
842 ; AVX512-NEXT: vpsubd %zmm0, %zmm1, %zmm0
843 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
844 ; AVX512-NEXT: vzeroupper
847 %rhs = zext <16 x i8> %x to <16 x i32>
848 %cond = icmp ult <16 x i32> %y, %rhs
849 %sub = sub <16 x i32> %y, %rhs
850 %truncsub = trunc <16 x i32> %sub to <16 x i8>
851 %res = select <16 x i1> %cond, <16 x i8> zeroinitializer, <16 x i8> %truncsub
; test15 - unsigned saturating subtract with mixed element widths.
; %x (<8 x i16>) is zero-extended to i32, %y (<8 x i32>) is subtracted from it,
; and the difference is truncated back to i16. Lanes where zext(%x) is not
; unsigned-greater-than %y are replaced with zero by the final select.
; The autogenerated checks below expect a compare + subtract + mask/truncate
; sequence on every run configuration rather than a single psubusw.
855 define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
856 ; SSE2-LABEL: test15:
857 ; SSE2: # %bb.0: # %vector.ph
858 ; SSE2-NEXT: movdqa %xmm0, %xmm3
859 ; SSE2-NEXT: pxor %xmm4, %xmm4
860 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
861 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
862 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
863 ; SSE2-NEXT: movdqa %xmm3, %xmm5
864 ; SSE2-NEXT: psubd %xmm2, %xmm3
865 ; SSE2-NEXT: pxor %xmm4, %xmm2
866 ; SSE2-NEXT: por %xmm4, %xmm5
867 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
868 ; SSE2-NEXT: movdqa %xmm1, %xmm2
869 ; SSE2-NEXT: pxor %xmm4, %xmm2
870 ; SSE2-NEXT: por %xmm0, %xmm4
871 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
872 ; SSE2-NEXT: packssdw %xmm5, %xmm4
873 ; SSE2-NEXT: psubd %xmm1, %xmm0
874 ; SSE2-NEXT: pslld $16, %xmm3
875 ; SSE2-NEXT: psrad $16, %xmm3
876 ; SSE2-NEXT: pslld $16, %xmm0
877 ; SSE2-NEXT: psrad $16, %xmm0
878 ; SSE2-NEXT: packssdw %xmm3, %xmm0
879 ; SSE2-NEXT: pand %xmm4, %xmm0
882 ; SSSE3-LABEL: test15:
883 ; SSSE3: # %bb.0: # %vector.ph
884 ; SSSE3-NEXT: pxor %xmm4, %xmm4
885 ; SSSE3-NEXT: movdqa %xmm0, %xmm3
886 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
887 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
888 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
889 ; SSSE3-NEXT: movdqa %xmm0, %xmm5
890 ; SSSE3-NEXT: psubd %xmm2, %xmm0
891 ; SSSE3-NEXT: pxor %xmm4, %xmm2
892 ; SSSE3-NEXT: por %xmm4, %xmm5
893 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
894 ; SSSE3-NEXT: movdqa %xmm1, %xmm2
895 ; SSSE3-NEXT: pxor %xmm4, %xmm2
896 ; SSSE3-NEXT: por %xmm3, %xmm4
897 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
898 ; SSSE3-NEXT: packssdw %xmm5, %xmm4
899 ; SSSE3-NEXT: psubd %xmm1, %xmm3
900 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
901 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
902 ; SSSE3-NEXT: pshufb %xmm1, %xmm3
903 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
904 ; SSSE3-NEXT: pand %xmm4, %xmm3
905 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
908 ; SSE41-LABEL: test15:
909 ; SSE41: # %bb.0: # %vector.ph
910 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
911 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
912 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
913 ; SSE41-NEXT: movdqa %xmm0, %xmm4
914 ; SSE41-NEXT: pminud %xmm1, %xmm4
915 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
916 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
917 ; SSE41-NEXT: pxor %xmm5, %xmm4
918 ; SSE41-NEXT: movdqa %xmm3, %xmm6
919 ; SSE41-NEXT: pminud %xmm2, %xmm6
920 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
921 ; SSE41-NEXT: pxor %xmm5, %xmm6
922 ; SSE41-NEXT: packssdw %xmm6, %xmm4
923 ; SSE41-NEXT: psubd %xmm2, %xmm3
924 ; SSE41-NEXT: psubd %xmm1, %xmm0
925 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
926 ; SSE41-NEXT: pshufb %xmm1, %xmm0
927 ; SSE41-NEXT: pshufb %xmm1, %xmm3
928 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
929 ; SSE41-NEXT: pand %xmm4, %xmm0
932 ; AVX1-LABEL: test15:
933 ; AVX1: # %bb.0: # %vector.ph
934 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
935 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
936 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
937 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
938 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm4
939 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4
940 ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm5
941 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm0, %xmm5
942 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
943 ; AVX1-NEXT: vpandn %xmm0, %xmm5, %xmm0
944 ; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm1
945 ; AVX1-NEXT: vpandn %xmm1, %xmm4, %xmm1
946 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
947 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
948 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
949 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
950 ; AVX1-NEXT: vzeroupper
953 ; AVX2-LABEL: test15:
954 ; AVX2: # %bb.0: # %vector.ph
955 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
956 ; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm2
957 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2
958 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
959 ; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0
960 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
961 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
962 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
963 ; AVX2-NEXT: vzeroupper
966 ; AVX512-LABEL: test15:
967 ; AVX512: # %bb.0: # %vector.ph
968 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
969 ; AVX512-NEXT: vpcmpnleud %ymm1, %ymm0, %k1
970 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
971 ; AVX512-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
972 ; AVX512-NEXT: vzeroupper
; IR under test: result lane = trunc(zext(x) - y) when zext(x) >u y, else 0.
975 %lhs = zext <8 x i16> %x to <8 x i32>
976 %cond = icmp ugt <8 x i32> %lhs, %y
977 %sub = sub <8 x i32> %lhs, %y
978 %truncsub = trunc <8 x i32> %sub to <8 x i16>
979 %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
; test16 - same computation as test15, but the guard is written with swapped
; operands (icmp ult %y, %lhs instead of icmp ugt %lhs, %y).
; %x (<8 x i16>) is zero-extended to i32, %y (<8 x i32>) is subtracted, the
; difference is truncated to i16, and lanes where %y is not unsigned-less-than
; zext(%x) are replaced with zero.
; The autogenerated checks below are the same instruction sequences that are
; expected for test15.
983 define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
984 ; SSE2-LABEL: test16:
985 ; SSE2: # %bb.0: # %vector.ph
986 ; SSE2-NEXT: movdqa %xmm0, %xmm3
987 ; SSE2-NEXT: pxor %xmm4, %xmm4
988 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
989 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
990 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
991 ; SSE2-NEXT: movdqa %xmm3, %xmm5
992 ; SSE2-NEXT: psubd %xmm2, %xmm3
993 ; SSE2-NEXT: pxor %xmm4, %xmm2
994 ; SSE2-NEXT: por %xmm4, %xmm5
995 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
996 ; SSE2-NEXT: movdqa %xmm1, %xmm2
997 ; SSE2-NEXT: pxor %xmm4, %xmm2
998 ; SSE2-NEXT: por %xmm0, %xmm4
999 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
1000 ; SSE2-NEXT: packssdw %xmm5, %xmm4
1001 ; SSE2-NEXT: psubd %xmm1, %xmm0
1002 ; SSE2-NEXT: pslld $16, %xmm3
1003 ; SSE2-NEXT: psrad $16, %xmm3
1004 ; SSE2-NEXT: pslld $16, %xmm0
1005 ; SSE2-NEXT: psrad $16, %xmm0
1006 ; SSE2-NEXT: packssdw %xmm3, %xmm0
1007 ; SSE2-NEXT: pand %xmm4, %xmm0
1010 ; SSSE3-LABEL: test16:
1011 ; SSSE3: # %bb.0: # %vector.ph
1012 ; SSSE3-NEXT: pxor %xmm4, %xmm4
1013 ; SSSE3-NEXT: movdqa %xmm0, %xmm3
1014 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1015 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1016 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
1017 ; SSSE3-NEXT: movdqa %xmm0, %xmm5
1018 ; SSSE3-NEXT: psubd %xmm2, %xmm0
1019 ; SSSE3-NEXT: pxor %xmm4, %xmm2
1020 ; SSSE3-NEXT: por %xmm4, %xmm5
1021 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
1022 ; SSSE3-NEXT: movdqa %xmm1, %xmm2
1023 ; SSSE3-NEXT: pxor %xmm4, %xmm2
1024 ; SSSE3-NEXT: por %xmm3, %xmm4
1025 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
1026 ; SSSE3-NEXT: packssdw %xmm5, %xmm4
1027 ; SSSE3-NEXT: psubd %xmm1, %xmm3
1028 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1029 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
1030 ; SSSE3-NEXT: pshufb %xmm1, %xmm3
1031 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
1032 ; SSSE3-NEXT: pand %xmm4, %xmm3
1033 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
1036 ; SSE41-LABEL: test16:
1037 ; SSE41: # %bb.0: # %vector.ph
1038 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
1039 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
1040 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1041 ; SSE41-NEXT: movdqa %xmm0, %xmm4
1042 ; SSE41-NEXT: pminud %xmm1, %xmm4
1043 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
1044 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
1045 ; SSE41-NEXT: pxor %xmm5, %xmm4
1046 ; SSE41-NEXT: movdqa %xmm3, %xmm6
1047 ; SSE41-NEXT: pminud %xmm2, %xmm6
1048 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
1049 ; SSE41-NEXT: pxor %xmm5, %xmm6
1050 ; SSE41-NEXT: packssdw %xmm6, %xmm4
1051 ; SSE41-NEXT: psubd %xmm2, %xmm3
1052 ; SSE41-NEXT: psubd %xmm1, %xmm0
1053 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1054 ; SSE41-NEXT: pshufb %xmm1, %xmm0
1055 ; SSE41-NEXT: pshufb %xmm1, %xmm3
1056 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
1057 ; SSE41-NEXT: pand %xmm4, %xmm0
1060 ; AVX1-LABEL: test16:
1061 ; AVX1: # %bb.0: # %vector.ph
1062 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1063 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1064 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1065 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1066 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm4
1067 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4
1068 ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm5
1069 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm0, %xmm5
1070 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
1071 ; AVX1-NEXT: vpandn %xmm0, %xmm5, %xmm0
1072 ; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm1
1073 ; AVX1-NEXT: vpandn %xmm1, %xmm4, %xmm1
1074 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1075 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1076 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1077 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1078 ; AVX1-NEXT: vzeroupper
1081 ; AVX2-LABEL: test16:
1082 ; AVX2: # %bb.0: # %vector.ph
1083 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1084 ; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm2
1085 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2
1086 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
1087 ; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0
1088 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1089 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1090 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1091 ; AVX2-NEXT: vzeroupper
1094 ; AVX512-LABEL: test16:
1095 ; AVX512: # %bb.0: # %vector.ph
1096 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1097 ; AVX512-NEXT: vpcmpnleud %ymm1, %ymm0, %k1
1098 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
1099 ; AVX512-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
1100 ; AVX512-NEXT: vzeroupper
; IR under test: result lane = trunc(zext(x) - y) when y <u zext(x), else 0.
1103 %lhs = zext <8 x i16> %x to <8 x i32>
1104 %cond = icmp ult <8 x i32> %y, %lhs
1105 %sub = sub <8 x i32> %lhs, %y
1106 %truncsub = trunc <8 x i32> %sub to <8 x i16>
1107 %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
; test17 - compare/select form of unsigned saturating subtraction of a
; broadcast scalar from a <64 x i8> vector.
; %w is splatted to all 64 lanes (insertelement + shufflevector with an
; all-zero mask); lanes where %x <u splat yield zero, all others %x - splat.
; The autogenerated checks expect the scalar broadcast followed by psubusb /
; vpsubusb on each register-sized piece: four xmm ops for the SSE runs, four
; xmm ops with 128-bit extract/insert for AVX1, two ymm ops for AVX2, and a
; single zmm op for the avx512bw run.
1111 define <64 x i8> @test17(<64 x i8> %x, i8 zeroext %w) nounwind {
1112 ; SSE2-LABEL: test17:
1113 ; SSE2: # %bb.0: # %vector.ph
1114 ; SSE2-NEXT: movd %edi, %xmm4
1115 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1116 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7]
1117 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
1118 ; SSE2-NEXT: psubusb %xmm4, %xmm0
1119 ; SSE2-NEXT: psubusb %xmm4, %xmm1
1120 ; SSE2-NEXT: psubusb %xmm4, %xmm2
1121 ; SSE2-NEXT: psubusb %xmm4, %xmm3
1124 ; SSSE3-LABEL: test17:
1125 ; SSSE3: # %bb.0: # %vector.ph
1126 ; SSSE3-NEXT: movd %edi, %xmm4
1127 ; SSSE3-NEXT: pxor %xmm5, %xmm5
1128 ; SSSE3-NEXT: pshufb %xmm5, %xmm4
1129 ; SSSE3-NEXT: psubusb %xmm4, %xmm0
1130 ; SSSE3-NEXT: psubusb %xmm4, %xmm1
1131 ; SSSE3-NEXT: psubusb %xmm4, %xmm2
1132 ; SSSE3-NEXT: psubusb %xmm4, %xmm3
1135 ; SSE41-LABEL: test17:
1136 ; SSE41: # %bb.0: # %vector.ph
1137 ; SSE41-NEXT: movd %edi, %xmm4
1138 ; SSE41-NEXT: pxor %xmm5, %xmm5
1139 ; SSE41-NEXT: pshufb %xmm5, %xmm4
1140 ; SSE41-NEXT: psubusb %xmm4, %xmm0
1141 ; SSE41-NEXT: psubusb %xmm4, %xmm1
1142 ; SSE41-NEXT: psubusb %xmm4, %xmm2
1143 ; SSE41-NEXT: psubusb %xmm4, %xmm3
1146 ; AVX1-LABEL: test17:
1147 ; AVX1: # %bb.0: # %vector.ph
1148 ; AVX1-NEXT: vmovd %edi, %xmm2
1149 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1150 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1151 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1152 ; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm3
1153 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
1154 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
1155 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1156 ; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm3
1157 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
1158 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1161 ; AVX2-LABEL: test17:
1162 ; AVX2: # %bb.0: # %vector.ph
1163 ; AVX2-NEXT: vmovd %edi, %xmm2
1164 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
1165 ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
1166 ; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1
1169 ; AVX512-LABEL: test17:
1170 ; AVX512: # %bb.0: # %vector.ph
1171 ; AVX512-NEXT: vpbroadcastb %edi, %zmm1
1172 ; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0
; IR under test: splat %w, then result lane = 0 when x <u w, else x - w.
1175 %0 = insertelement <64 x i8> undef, i8 %w, i32 0
1176 %broadcast15 = shufflevector <64 x i8> %0, <64 x i8> undef, <64 x i32> zeroinitializer
1177 %1 = icmp ult <64 x i8> %x, %broadcast15
1178 %2 = sub <64 x i8> %x, %broadcast15
1179 %res = select <64 x i1> %1, <64 x i8> zeroinitializer, <64 x i8> %2
; test18 - compare/select form of unsigned saturating subtraction of a
; broadcast scalar from a <32 x i16> vector (the 16-bit counterpart of test17).
; %w is splatted to all 32 lanes; lanes where %x <u splat yield zero, all
; others %x - splat.
; The autogenerated checks expect the scalar broadcast followed by psubusw /
; vpsubusw on each register-sized piece: four xmm ops for the SSE runs (all
; three share one check prefix here), four xmm ops with 128-bit extract/insert
; for AVX1, two ymm ops for AVX2, and a single zmm op for the avx512bw run.
1183 define <32 x i16> @test18(<32 x i16> %x, i16 zeroext %w) nounwind {
1184 ; SSE-LABEL: test18:
1185 ; SSE: # %bb.0: # %vector.ph
1186 ; SSE-NEXT: movd %edi, %xmm4
1187 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7]
1188 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
1189 ; SSE-NEXT: psubusw %xmm4, %xmm0
1190 ; SSE-NEXT: psubusw %xmm4, %xmm1
1191 ; SSE-NEXT: psubusw %xmm4, %xmm2
1192 ; SSE-NEXT: psubusw %xmm4, %xmm3
1195 ; AVX1-LABEL: test18:
1196 ; AVX1: # %bb.0: # %vector.ph
1197 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1198 ; AVX1-NEXT: vmovd %edi, %xmm3
1199 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
1200 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
1201 ; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2
1202 ; AVX1-NEXT: vpsubusw %xmm3, %xmm0, %xmm0
1203 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1204 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1205 ; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2
1206 ; AVX1-NEXT: vpsubusw %xmm3, %xmm1, %xmm1
1207 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1210 ; AVX2-LABEL: test18:
1211 ; AVX2: # %bb.0: # %vector.ph
1212 ; AVX2-NEXT: vmovd %edi, %xmm2
1213 ; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
1214 ; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
1215 ; AVX2-NEXT: vpsubusw %ymm2, %ymm1, %ymm1
1218 ; AVX512-LABEL: test18:
1219 ; AVX512: # %bb.0: # %vector.ph
1220 ; AVX512-NEXT: vpbroadcastw %edi, %zmm1
1221 ; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
; IR under test: splat %w, then result lane = 0 when x <u w, else x - w.
1224 %0 = insertelement <32 x i16> undef, i16 %w, i32 0
1225 %broadcast15 = shufflevector <32 x i16> %0, <32 x i16> undef, <32 x i32> zeroinitializer
1226 %1 = icmp ult <32 x i16> %x, %broadcast15
1227 %2 = sub <32 x i16> %x, %broadcast15
1228 %res = select <32 x i1> %1, <32 x i16> zeroinitializer, <32 x i16> %2
; psubus_8i16_max - "max then subtract" form of unsigned saturating subtract
; on <8 x i16>: the icmp ult + select picks the unsigned maximum of %x and %y,
; and %y is then subtracted from it, so lanes where %x <u %y become zero.
; The autogenerated checks expect a single psubusw / vpsubusw.
1232 define <8 x i16> @psubus_8i16_max(<8 x i16> %x, <8 x i16> %y) nounwind {
1233 ; SSE-LABEL: psubus_8i16_max:
1234 ; SSE: # %bb.0: # %vector.ph
1235 ; SSE-NEXT: psubusw %xmm1, %xmm0
1238 ; AVX-LABEL: psubus_8i16_max:
1239 ; AVX: # %bb.0: # %vector.ph
1240 ; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; IR under test: res = umax(x, y) - y, with umax spelled as icmp ult + select.
1243 %cmp = icmp ult <8 x i16> %x, %y
1244 %max = select <8 x i1> %cmp, <8 x i16> %y, <8 x i16> %x
1245 %res = sub <8 x i16> %max, %y
; psubus_16i8_max - "max then subtract" form of unsigned saturating subtract
; on <16 x i8>: the icmp ult + select picks the unsigned maximum of %x and %y,
; and %y is then subtracted from it, so lanes where %x <u %y become zero.
; The autogenerated checks expect a single psubusb / vpsubusb.
1249 define <16 x i8> @psubus_16i8_max(<16 x i8> %x, <16 x i8> %y) nounwind {
1250 ; SSE-LABEL: psubus_16i8_max:
1251 ; SSE: # %bb.0: # %vector.ph
1252 ; SSE-NEXT: psubusb %xmm1, %xmm0
1255 ; AVX-LABEL: psubus_16i8_max:
1256 ; AVX: # %bb.0: # %vector.ph
1257 ; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; IR under test: res = umax(x, y) - y, with umax spelled as icmp ult + select.
1260 %cmp = icmp ult <16 x i8> %x, %y
1261 %max = select <16 x i1> %cmp, <16 x i8> %y, <16 x i8> %x
1262 %res = sub <16 x i8> %max, %y
; psubus_16i16_max - "max then subtract" form of unsigned saturating subtract
; on a 256-bit <16 x i16> vector: res = umax(%x, %y) - %y.
; The autogenerated checks expect only psubusw / vpsubusw on register-sized
; pieces: two xmm ops for the SSE runs, two xmm ops with 128-bit extract/insert
; for AVX1, and one ymm op for the AVX2 and avx512bw runs.
1266 define <16 x i16> @psubus_16i16_max(<16 x i16> %x, <16 x i16> %y) nounwind {
1267 ; SSE-LABEL: psubus_16i16_max:
1268 ; SSE: # %bb.0: # %vector.ph
1269 ; SSE-NEXT: psubusw %xmm2, %xmm0
1270 ; SSE-NEXT: psubusw %xmm3, %xmm1
1273 ; AVX1-LABEL: psubus_16i16_max:
1274 ; AVX1: # %bb.0: # %vector.ph
1275 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1276 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1277 ; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2
1278 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1279 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1282 ; AVX2-LABEL: psubus_16i16_max:
1283 ; AVX2: # %bb.0: # %vector.ph
1284 ; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
1287 ; AVX512-LABEL: psubus_16i16_max:
1288 ; AVX512: # %bb.0: # %vector.ph
1289 ; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
; IR under test: res = umax(x, y) - y, with umax spelled as icmp ult + select.
1292 %cmp = icmp ult <16 x i16> %x, %y
1293 %max = select <16 x i1> %cmp, <16 x i16> %y, <16 x i16> %x
1294 %res = sub <16 x i16> %max, %y
; psubus_32i16_max - "max then subtract" form of unsigned saturating subtract
; on a 512-bit <32 x i16> vector: res = umax(%x, %y) - %y.
; The autogenerated checks expect only psubusw / vpsubusw on register-sized
; pieces: four xmm ops for the SSE runs, four xmm ops with 128-bit
; extract/insert for AVX1, two ymm ops for AVX2, and one zmm op for the
; avx512bw run.
1298 define <32 x i16> @psubus_32i16_max(<32 x i16> %x, <32 x i16> %y) nounwind {
1299 ; SSE-LABEL: psubus_32i16_max:
1300 ; SSE: # %bb.0: # %vector.ph
1301 ; SSE-NEXT: psubusw %xmm4, %xmm0
1302 ; SSE-NEXT: psubusw %xmm5, %xmm1
1303 ; SSE-NEXT: psubusw %xmm6, %xmm2
1304 ; SSE-NEXT: psubusw %xmm7, %xmm3
1307 ; AVX1-LABEL: psubus_32i16_max:
1308 ; AVX1: # %bb.0: # %vector.ph
1309 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
1310 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
1311 ; AVX1-NEXT: vpsubusw %xmm4, %xmm5, %xmm4
1312 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
1313 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1314 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
1315 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1316 ; AVX1-NEXT: vpsubusw %xmm2, %xmm4, %xmm2
1317 ; AVX1-NEXT: vpsubusw %xmm3, %xmm1, %xmm1
1318 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1321 ; AVX2-LABEL: psubus_32i16_max:
1322 ; AVX2: # %bb.0: # %vector.ph
1323 ; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
1324 ; AVX2-NEXT: vpsubusw %ymm3, %ymm1, %ymm1
1327 ; AVX512-LABEL: psubus_32i16_max:
1328 ; AVX512: # %bb.0: # %vector.ph
1329 ; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
; IR under test: res = umax(x, y) - y, with umax spelled as icmp ult + select.
1332 %cmp = icmp ult <32 x i16> %x, %y
1333 %max = select <32 x i1> %cmp, <32 x i16> %y, <32 x i16> %x
1334 %res = sub <32 x i16> %max, %y
; psubus_64i8_max - "max then subtract" form of unsigned saturating subtract
; on a 512-bit <64 x i8> vector: res = umax(%x, %y) - %y.
; The autogenerated checks expect only psubusb / vpsubusb on register-sized
; pieces: four xmm ops for the SSE runs, four xmm ops with 128-bit
; extract/insert for AVX1, two ymm ops for AVX2, and one zmm op for the
; avx512bw run.
1338 define <64 x i8> @psubus_64i8_max(<64 x i8> %x, <64 x i8> %y) nounwind {
1339 ; SSE-LABEL: psubus_64i8_max:
1340 ; SSE: # %bb.0: # %vector.ph
1341 ; SSE-NEXT: psubusb %xmm4, %xmm0
1342 ; SSE-NEXT: psubusb %xmm5, %xmm1
1343 ; SSE-NEXT: psubusb %xmm6, %xmm2
1344 ; SSE-NEXT: psubusb %xmm7, %xmm3
1347 ; AVX1-LABEL: psubus_64i8_max:
1348 ; AVX1: # %bb.0: # %vector.ph
1349 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
1350 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
1351 ; AVX1-NEXT: vpsubusb %xmm4, %xmm5, %xmm4
1352 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
1353 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1354 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
1355 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1356 ; AVX1-NEXT: vpsubusb %xmm2, %xmm4, %xmm2
1357 ; AVX1-NEXT: vpsubusb %xmm3, %xmm1, %xmm1
1358 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1361 ; AVX2-LABEL: psubus_64i8_max:
1362 ; AVX2: # %bb.0: # %vector.ph
1363 ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
1364 ; AVX2-NEXT: vpsubusb %ymm3, %ymm1, %ymm1
1367 ; AVX512-LABEL: psubus_64i8_max:
1368 ; AVX512: # %bb.0: # %vector.ph
1369 ; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0
; IR under test: res = umax(x, y) - y, with umax spelled as icmp ult + select.
1372 %cmp = icmp ult <64 x i8> %x, %y
1373 %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
1374 %res = sub <64 x i8> %max, %y
; psubus_32i8_max - "max then subtract" form of unsigned saturating subtract
; on a 256-bit <32 x i8> vector: res = umax(%x, %y) - %y.
; The autogenerated checks expect only psubusb / vpsubusb on register-sized
; pieces: two xmm ops for the SSE runs, two xmm ops with 128-bit extract/insert
; for AVX1, and one ymm op for the AVX2 and avx512bw runs.
1378 define <32 x i8> @psubus_32i8_max(<32 x i8> %x, <32 x i8> %y) nounwind {
1379 ; SSE-LABEL: psubus_32i8_max:
1380 ; SSE: # %bb.0: # %vector.ph
1381 ; SSE-NEXT: psubusb %xmm2, %xmm0
1382 ; SSE-NEXT: psubusb %xmm3, %xmm1
1385 ; AVX1-LABEL: psubus_32i8_max:
1386 ; AVX1: # %bb.0: # %vector.ph
1387 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1388 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1389 ; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm2
1390 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
1391 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1394 ; AVX2-LABEL: psubus_32i8_max:
1395 ; AVX2: # %bb.0: # %vector.ph
1396 ; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
1399 ; AVX512-LABEL: psubus_32i8_max:
1400 ; AVX512: # %bb.0: # %vector.ph
1401 ; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
; IR under test: res = umax(x, y) - y, with umax spelled as icmp ult + select.
1404 %cmp = icmp ult <32 x i8> %x, %y
1405 %max = select <32 x i1> %cmp, <32 x i8> %y, <32 x i8> %x
1406 %res = sub <32 x i8> %max, %y
; psubus_8i32_max - "max then subtract" saturating pattern with mixed widths.
; %x (<8 x i16>) is zero-extended to i32, the unsigned maximum of that value
; and %y (<8 x i32>) is taken via icmp ult + select, %y is subtracted from the
; maximum, and the difference is truncated to i16.
; Per the autogenerated checks below:
;  - the SSE2 run keeps the 32-bit compare/select/psubd sequence and then
;    narrows the result;
;  - the SSSE3, SSE4.1, AVX1 and AVX2 runs first clamp %y to 65535, narrow it
;    to 16-bit lanes, and then use a single psubusw / vpsubusw;
;  - the avx512bw run narrows %y with vpmovusdw and then uses vpsubusw.
1410 define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
1411 ; SSE2-LABEL: psubus_8i32_max:
1412 ; SSE2: # %bb.0: # %vector.ph
1413 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1414 ; SSE2-NEXT: pxor %xmm4, %xmm4
1415 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1416 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1417 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
1418 ; SSE2-NEXT: movdqa %xmm2, %xmm6
1419 ; SSE2-NEXT: pxor %xmm5, %xmm6
1420 ; SSE2-NEXT: movdqa %xmm3, %xmm4
1421 ; SSE2-NEXT: por %xmm5, %xmm4
1422 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
1423 ; SSE2-NEXT: pand %xmm4, %xmm3
1424 ; SSE2-NEXT: pandn %xmm2, %xmm4
1425 ; SSE2-NEXT: por %xmm3, %xmm4
1426 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1427 ; SSE2-NEXT: pxor %xmm5, %xmm3
1428 ; SSE2-NEXT: por %xmm0, %xmm5
1429 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
1430 ; SSE2-NEXT: pand %xmm5, %xmm0
1431 ; SSE2-NEXT: pandn %xmm1, %xmm5
1432 ; SSE2-NEXT: por %xmm5, %xmm0
1433 ; SSE2-NEXT: psubd %xmm1, %xmm0
1434 ; SSE2-NEXT: psubd %xmm2, %xmm4
1435 ; SSE2-NEXT: pslld $16, %xmm4
1436 ; SSE2-NEXT: psrad $16, %xmm4
1437 ; SSE2-NEXT: pslld $16, %xmm0
1438 ; SSE2-NEXT: psrad $16, %xmm0
1439 ; SSE2-NEXT: packssdw %xmm4, %xmm0
1442 ; SSSE3-LABEL: psubus_8i32_max:
1443 ; SSSE3: # %bb.0: # %vector.ph
1444 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1445 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
1446 ; SSSE3-NEXT: movdqa %xmm2, %xmm5
1447 ; SSSE3-NEXT: pxor %xmm4, %xmm5
1448 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
1449 ; SSSE3-NEXT: movdqa %xmm6, %xmm7
1450 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
1451 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
1452 ; SSSE3-NEXT: pand %xmm7, %xmm2
1453 ; SSSE3-NEXT: pandn %xmm5, %xmm7
1454 ; SSSE3-NEXT: por %xmm2, %xmm7
1455 ; SSSE3-NEXT: pshufb %xmm3, %xmm7
1456 ; SSSE3-NEXT: pxor %xmm1, %xmm4
1457 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
1458 ; SSSE3-NEXT: pand %xmm6, %xmm1
1459 ; SSSE3-NEXT: pandn %xmm5, %xmm6
1460 ; SSSE3-NEXT: por %xmm1, %xmm6
1461 ; SSSE3-NEXT: pshufb %xmm3, %xmm6
1462 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1463 ; SSSE3-NEXT: psubusw %xmm6, %xmm0
1466 ; SSE41-LABEL: psubus_8i32_max:
1467 ; SSE41: # %bb.0: # %vector.ph
1468 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
1469 ; SSE41-NEXT: pminud %xmm3, %xmm2
1470 ; SSE41-NEXT: pminud %xmm3, %xmm1
1471 ; SSE41-NEXT: packusdw %xmm2, %xmm1
1472 ; SSE41-NEXT: psubusw %xmm1, %xmm0
1475 ; AVX1-LABEL: psubus_8i32_max:
1476 ; AVX1: # %bb.0: # %vector.ph
1477 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1478 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
1479 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
1480 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
1481 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1482 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1483 ; AVX1-NEXT: vzeroupper
1486 ; AVX2-LABEL: psubus_8i32_max:
1487 ; AVX2: # %bb.0: # %vector.ph
1488 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
1489 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
1490 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1491 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1492 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1493 ; AVX2-NEXT: vzeroupper
1496 ; AVX512-LABEL: psubus_8i32_max:
1497 ; AVX512: # %bb.0: # %vector.ph
1498 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
1499 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1500 ; AVX512-NEXT: vzeroupper
; IR under test: res = trunc(umax(zext(x), y) - y), umax spelled as icmp ult + select.
1503 %lhs = zext <8 x i16> %x to <8 x i32>
1504 %cond = icmp ult <8 x i32> %lhs, %y
1505 %max = select <8 x i1> %cond, <8 x i32> %y, <8 x i32> %lhs
1506 %sub = sub <8 x i32> %max, %y
1507 %res = trunc <8 x i32> %sub to <8 x i16>
1511 define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
1512 ; SSE2-LABEL: psubus_8i64_max:
1513 ; SSE2: # %bb.0: # %vector.ph
1514 ; SSE2-NEXT: pxor %xmm5, %xmm5
1515 ; SSE2-NEXT: movdqa %xmm0, %xmm10
1516 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
1517 ; SSE2-NEXT: movdqa %xmm10, %xmm8
1518 ; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
1519 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3]
1520 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
1521 ; SSE2-NEXT: movdqa %xmm0, %xmm9
1522 ; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
1523 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
1524 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456]
1525 ; SSE2-NEXT: movdqa %xmm2, %xmm6
1526 ; SSE2-NEXT: pxor %xmm11, %xmm6
1527 ; SSE2-NEXT: movdqa %xmm0, %xmm7
1528 ; SSE2-NEXT: por %xmm11, %xmm7
1529 ; SSE2-NEXT: movdqa %xmm7, %xmm5
1530 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
1531 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
1532 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm7
1533 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
1534 ; SSE2-NEXT: pand %xmm12, %xmm7
1535 ; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3]
1536 ; SSE2-NEXT: por %xmm7, %xmm13
1537 ; SSE2-NEXT: pand %xmm13, %xmm0
1538 ; SSE2-NEXT: pandn %xmm2, %xmm13
1539 ; SSE2-NEXT: por %xmm0, %xmm13
1540 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1541 ; SSE2-NEXT: pxor %xmm11, %xmm0
1542 ; SSE2-NEXT: movdqa %xmm9, %xmm5
1543 ; SSE2-NEXT: por %xmm11, %xmm5
1544 ; SSE2-NEXT: movdqa %xmm5, %xmm7
1545 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
1546 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,2,2]
1547 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm5
1548 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1549 ; SSE2-NEXT: pand %xmm12, %xmm5
1550 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
1551 ; SSE2-NEXT: por %xmm5, %xmm0
1552 ; SSE2-NEXT: pand %xmm0, %xmm9
1553 ; SSE2-NEXT: pandn %xmm1, %xmm0
1554 ; SSE2-NEXT: por %xmm9, %xmm0
1555 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1556 ; SSE2-NEXT: pxor %xmm11, %xmm5
1557 ; SSE2-NEXT: movdqa %xmm10, %xmm7
1558 ; SSE2-NEXT: por %xmm11, %xmm7
1559 ; SSE2-NEXT: movdqa %xmm7, %xmm6
1560 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
1561 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
1562 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm7
1563 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
1564 ; SSE2-NEXT: pand %xmm9, %xmm5
1565 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
1566 ; SSE2-NEXT: por %xmm5, %xmm7
1567 ; SSE2-NEXT: pand %xmm7, %xmm10
1568 ; SSE2-NEXT: pandn %xmm4, %xmm7
1569 ; SSE2-NEXT: por %xmm10, %xmm7
1570 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1571 ; SSE2-NEXT: pxor %xmm11, %xmm5
1572 ; SSE2-NEXT: por %xmm8, %xmm11
1573 ; SSE2-NEXT: movdqa %xmm11, %xmm6
1574 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
1575 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
1576 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm11
1577 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,3,3]
1578 ; SSE2-NEXT: pand %xmm9, %xmm5
1579 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1580 ; SSE2-NEXT: por %xmm5, %xmm6
1581 ; SSE2-NEXT: pand %xmm6, %xmm8
1582 ; SSE2-NEXT: pandn %xmm3, %xmm6
1583 ; SSE2-NEXT: por %xmm8, %xmm6
1584 ; SSE2-NEXT: psubq %xmm3, %xmm6
1585 ; SSE2-NEXT: psubq %xmm4, %xmm7
1586 ; SSE2-NEXT: psubq %xmm1, %xmm0
1587 ; SSE2-NEXT: psubq %xmm2, %xmm13
1588 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,2,3]
1589 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1590 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1591 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
1592 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1593 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3]
1594 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1595 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
1596 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1597 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1598 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1601 ; SSSE3-LABEL: psubus_8i64_max:
1602 ; SSSE3: # %bb.0: # %vector.ph
1603 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
1604 ; SSSE3-NEXT: movdqa %xmm2, %xmm7
1605 ; SSSE3-NEXT: pxor %xmm5, %xmm7
1606 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991]
1607 ; SSSE3-NEXT: movdqa %xmm8, %xmm6
1608 ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6
1609 ; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
1610 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm7
1611 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
1612 ; SSSE3-NEXT: pand %xmm9, %xmm7
1613 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1614 ; SSSE3-NEXT: por %xmm7, %xmm6
1615 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535]
1616 ; SSSE3-NEXT: pand %xmm6, %xmm2
1617 ; SSSE3-NEXT: pandn %xmm9, %xmm6
1618 ; SSSE3-NEXT: por %xmm2, %xmm6
1619 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
1620 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7]
1621 ; SSSE3-NEXT: movdqa %xmm1, %xmm6
1622 ; SSSE3-NEXT: pxor %xmm5, %xmm6
1623 ; SSSE3-NEXT: movdqa %xmm8, %xmm7
1624 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
1625 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
1626 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6
1627 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1628 ; SSSE3-NEXT: pand %xmm2, %xmm6
1629 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
1630 ; SSSE3-NEXT: por %xmm6, %xmm2
1631 ; SSSE3-NEXT: pand %xmm2, %xmm1
1632 ; SSSE3-NEXT: pandn %xmm9, %xmm2
1633 ; SSSE3-NEXT: por %xmm1, %xmm2
1634 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1635 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1636 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
1637 ; SSSE3-NEXT: movdqa %xmm4, %xmm2
1638 ; SSSE3-NEXT: pxor %xmm5, %xmm2
1639 ; SSSE3-NEXT: movdqa %xmm8, %xmm6
1640 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
1641 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
1642 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2
1643 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1644 ; SSSE3-NEXT: pand %xmm7, %xmm2
1645 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1646 ; SSSE3-NEXT: por %xmm2, %xmm6
1647 ; SSSE3-NEXT: pand %xmm6, %xmm4
1648 ; SSSE3-NEXT: pandn %xmm9, %xmm6
1649 ; SSSE3-NEXT: por %xmm4, %xmm6
1650 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
1651 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
1652 ; SSSE3-NEXT: pxor %xmm3, %xmm5
1653 ; SSSE3-NEXT: movdqa %xmm8, %xmm4
1654 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
1655 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
1656 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm5
1657 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1658 ; SSSE3-NEXT: pand %xmm6, %xmm5
1659 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1660 ; SSSE3-NEXT: por %xmm5, %xmm4
1661 ; SSSE3-NEXT: pand %xmm4, %xmm3
1662 ; SSSE3-NEXT: pandn %xmm9, %xmm4
1663 ; SSSE3-NEXT: por %xmm3, %xmm4
1664 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
1665 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
1666 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
1667 ; SSSE3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
1668 ; SSSE3-NEXT: psubusw %xmm3, %xmm0
1671 ; SSE41-LABEL: psubus_8i64_max:
1672 ; SSE41: # %bb.0: # %vector.ph
1673 ; SSE41-NEXT: movdqa %xmm0, %xmm8
1674 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
1675 ; SSE41-NEXT: movdqa %xmm4, %xmm0
1676 ; SSE41-NEXT: pxor %xmm6, %xmm0
1677 ; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
1678 ; SSE41-NEXT: movdqa %xmm9, %xmm7
1679 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
1680 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
1681 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
1682 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1683 ; SSE41-NEXT: pand %xmm5, %xmm0
1684 ; SSE41-NEXT: por %xmm7, %xmm0
1685 ; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535]
1686 ; SSE41-NEXT: movapd %xmm7, %xmm5
1687 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5
1688 ; SSE41-NEXT: movdqa %xmm3, %xmm0
1689 ; SSE41-NEXT: pxor %xmm6, %xmm0
1690 ; SSE41-NEXT: movdqa %xmm9, %xmm4
1691 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
1692 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
1693 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
1694 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1695 ; SSE41-NEXT: pand %xmm10, %xmm0
1696 ; SSE41-NEXT: por %xmm4, %xmm0
1697 ; SSE41-NEXT: movapd %xmm7, %xmm4
1698 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
1699 ; SSE41-NEXT: packusdw %xmm5, %xmm4
1700 ; SSE41-NEXT: movdqa %xmm2, %xmm0
1701 ; SSE41-NEXT: pxor %xmm6, %xmm0
1702 ; SSE41-NEXT: movdqa %xmm9, %xmm3
1703 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
1704 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
1705 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
1706 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1707 ; SSE41-NEXT: pand %xmm5, %xmm0
1708 ; SSE41-NEXT: por %xmm3, %xmm0
1709 ; SSE41-NEXT: movapd %xmm7, %xmm3
1710 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
1711 ; SSE41-NEXT: pxor %xmm1, %xmm6
1712 ; SSE41-NEXT: movdqa %xmm9, %xmm2
1713 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm2
1714 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
1715 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
1716 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
1717 ; SSE41-NEXT: pand %xmm5, %xmm0
1718 ; SSE41-NEXT: por %xmm2, %xmm0
1719 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
1720 ; SSE41-NEXT: packusdw %xmm3, %xmm7
1721 ; SSE41-NEXT: packusdw %xmm4, %xmm7
1722 ; SSE41-NEXT: psubusw %xmm7, %xmm8
1723 ; SSE41-NEXT: movdqa %xmm8, %xmm0
1726 ; AVX1-LABEL: psubus_8i64_max:
1727 ; AVX1: # %bb.0: # %vector.ph
1728 ; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [65535,65535]
1729 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
1730 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
1731 ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6
1732 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [9223372036854841343,9223372036854841343]
1733 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6
1734 ; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm4
1735 ; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm6
1736 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6
1737 ; AVX1-NEXT: vblendvpd %xmm6, %xmm2, %xmm3, %xmm2
1738 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
1739 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1740 ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6
1741 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6
1742 ; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm4
1743 ; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm5
1744 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5
1745 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
1746 ; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
1747 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1748 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1749 ; AVX1-NEXT: vzeroupper
1752 ; AVX2-LABEL: psubus_8i64_max:
1753 ; AVX2: # %bb.0: # %vector.ph
1754 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
1755 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm4
1756 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343]
1757 ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
1758 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [65535,65535,65535,65535]
1759 ; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
1760 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3
1761 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
1762 ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
1763 ; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
1764 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
1765 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1766 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1767 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1768 ; AVX2-NEXT: vzeroupper
1771 ; AVX512-LABEL: psubus_8i64_max:
1772 ; AVX512: # %bb.0: # %vector.ph
1773 ; AVX512-NEXT: vpmovusqw %zmm1, %xmm1
1774 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1775 ; AVX512-NEXT: vzeroupper
1778 %lhs = zext <8 x i16> %x to <8 x i64>
1779 %cond = icmp ult <8 x i64> %lhs, %y
1780 %max = select <8 x i1> %cond, <8 x i64> %y, <8 x i64> %lhs
1781 %sub = sub <8 x i64> %max, %y
1782 %res = trunc <8 x i64> %sub to <8 x i16>
; psubus_16i32_max - zero-extends %x to <16 x i32>, selects the unsigned
; maximum of that value and %y (icmp ult + select), subtracts %y and truncates
; the difference back to <16 x i16>.
; The expectations below use [v]psubusw for the AVX1, AVX2 and AVX512 runs.
; The SSE2 and SSSE3 expectations keep a pcmpgtd-based select followed by
; psubd and packssdw, and the SSE41 expectations use pmaxud, psubd and
; packusdw.
1786 define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
1787 ; SSE2-LABEL: psubus_16i32_max:
1788 ; SSE2: # %bb.0: # %vector.ph
1789 ; SSE2-NEXT: movdqa %xmm1, %xmm8
1790 ; SSE2-NEXT: pxor %xmm7, %xmm7
1791 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
1792 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
1793 ; SSE2-NEXT: movdqa %xmm0, %xmm10
1794 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
1795 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
1796 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
1797 ; SSE2-NEXT: movdqa %xmm3, %xmm6
1798 ; SSE2-NEXT: pxor %xmm7, %xmm6
1799 ; SSE2-NEXT: movdqa %xmm0, %xmm9
1800 ; SSE2-NEXT: por %xmm7, %xmm9
1801 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm9
1802 ; SSE2-NEXT: pand %xmm9, %xmm0
1803 ; SSE2-NEXT: pandn %xmm3, %xmm9
1804 ; SSE2-NEXT: por %xmm0, %xmm9
1805 ; SSE2-NEXT: movdqa %xmm2, %xmm6
1806 ; SSE2-NEXT: pxor %xmm7, %xmm6
1807 ; SSE2-NEXT: movdqa %xmm10, %xmm0
1808 ; SSE2-NEXT: por %xmm7, %xmm0
1809 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm0
1810 ; SSE2-NEXT: pand %xmm0, %xmm10
1811 ; SSE2-NEXT: pandn %xmm2, %xmm0
1812 ; SSE2-NEXT: por %xmm10, %xmm0
1813 ; SSE2-NEXT: movdqa %xmm5, %xmm10
1814 ; SSE2-NEXT: pxor %xmm7, %xmm10
1815 ; SSE2-NEXT: movdqa %xmm8, %xmm6
1816 ; SSE2-NEXT: por %xmm7, %xmm6
1817 ; SSE2-NEXT: pcmpgtd %xmm10, %xmm6
1818 ; SSE2-NEXT: pand %xmm6, %xmm8
1819 ; SSE2-NEXT: pandn %xmm5, %xmm6
1820 ; SSE2-NEXT: por %xmm8, %xmm6
1821 ; SSE2-NEXT: movdqa %xmm4, %xmm8
1822 ; SSE2-NEXT: pxor %xmm7, %xmm8
1823 ; SSE2-NEXT: por %xmm1, %xmm7
1824 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
1825 ; SSE2-NEXT: pand %xmm7, %xmm1
1826 ; SSE2-NEXT: pandn %xmm4, %xmm7
1827 ; SSE2-NEXT: por %xmm7, %xmm1
1828 ; SSE2-NEXT: psubd %xmm4, %xmm1
1829 ; SSE2-NEXT: psubd %xmm5, %xmm6
1830 ; SSE2-NEXT: psubd %xmm2, %xmm0
1831 ; SSE2-NEXT: psubd %xmm3, %xmm9
1832 ; SSE2-NEXT: pslld $16, %xmm9
1833 ; SSE2-NEXT: psrad $16, %xmm9
1834 ; SSE2-NEXT: pslld $16, %xmm0
1835 ; SSE2-NEXT: psrad $16, %xmm0
1836 ; SSE2-NEXT: packssdw %xmm9, %xmm0
1837 ; SSE2-NEXT: pslld $16, %xmm6
1838 ; SSE2-NEXT: psrad $16, %xmm6
1839 ; SSE2-NEXT: pslld $16, %xmm1
1840 ; SSE2-NEXT: psrad $16, %xmm1
1841 ; SSE2-NEXT: packssdw %xmm6, %xmm1
1844 ; SSSE3-LABEL: psubus_16i32_max:
1845 ; SSSE3: # %bb.0: # %vector.ph
1846 ; SSSE3-NEXT: movdqa %xmm1, %xmm8
1847 ; SSSE3-NEXT: pxor %xmm7, %xmm7
1848 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
1849 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
1850 ; SSSE3-NEXT: movdqa %xmm0, %xmm10
1851 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
1852 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
1853 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
1854 ; SSSE3-NEXT: movdqa %xmm3, %xmm6
1855 ; SSSE3-NEXT: pxor %xmm7, %xmm6
1856 ; SSSE3-NEXT: movdqa %xmm0, %xmm9
1857 ; SSSE3-NEXT: por %xmm7, %xmm9
1858 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm9
1859 ; SSSE3-NEXT: pand %xmm9, %xmm0
1860 ; SSSE3-NEXT: pandn %xmm3, %xmm9
1861 ; SSSE3-NEXT: por %xmm0, %xmm9
1862 ; SSSE3-NEXT: movdqa %xmm2, %xmm6
1863 ; SSSE3-NEXT: pxor %xmm7, %xmm6
1864 ; SSSE3-NEXT: movdqa %xmm10, %xmm0
1865 ; SSSE3-NEXT: por %xmm7, %xmm0
1866 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
1867 ; SSSE3-NEXT: pand %xmm0, %xmm10
1868 ; SSSE3-NEXT: pandn %xmm2, %xmm0
1869 ; SSSE3-NEXT: por %xmm10, %xmm0
1870 ; SSSE3-NEXT: movdqa %xmm5, %xmm10
1871 ; SSSE3-NEXT: pxor %xmm7, %xmm10
1872 ; SSSE3-NEXT: movdqa %xmm8, %xmm6
1873 ; SSSE3-NEXT: por %xmm7, %xmm6
1874 ; SSSE3-NEXT: pcmpgtd %xmm10, %xmm6
1875 ; SSSE3-NEXT: pand %xmm6, %xmm8
1876 ; SSSE3-NEXT: pandn %xmm5, %xmm6
1877 ; SSSE3-NEXT: por %xmm8, %xmm6
1878 ; SSSE3-NEXT: movdqa %xmm4, %xmm8
1879 ; SSSE3-NEXT: pxor %xmm7, %xmm8
1880 ; SSSE3-NEXT: por %xmm1, %xmm7
1881 ; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7
1882 ; SSSE3-NEXT: pand %xmm7, %xmm1
1883 ; SSSE3-NEXT: pandn %xmm4, %xmm7
1884 ; SSSE3-NEXT: por %xmm7, %xmm1
1885 ; SSSE3-NEXT: psubd %xmm4, %xmm1
1886 ; SSSE3-NEXT: psubd %xmm5, %xmm6
1887 ; SSSE3-NEXT: psubd %xmm2, %xmm0
1888 ; SSSE3-NEXT: psubd %xmm3, %xmm9
1889 ; SSSE3-NEXT: pslld $16, %xmm9
1890 ; SSSE3-NEXT: psrad $16, %xmm9
1891 ; SSSE3-NEXT: pslld $16, %xmm0
1892 ; SSSE3-NEXT: psrad $16, %xmm0
1893 ; SSSE3-NEXT: packssdw %xmm9, %xmm0
1894 ; SSSE3-NEXT: pslld $16, %xmm6
1895 ; SSSE3-NEXT: psrad $16, %xmm6
1896 ; SSSE3-NEXT: pslld $16, %xmm1
1897 ; SSSE3-NEXT: psrad $16, %xmm1
1898 ; SSSE3-NEXT: packssdw %xmm6, %xmm1
1901 ; SSE41-LABEL: psubus_16i32_max:
1902 ; SSE41: # %bb.0: # %vector.ph
1903 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
1904 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
1905 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1906 ; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
1907 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
1908 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1909 ; SSE41-NEXT: pmaxud %xmm2, %xmm0
1910 ; SSE41-NEXT: pmaxud %xmm3, %xmm7
1911 ; SSE41-NEXT: pmaxud %xmm4, %xmm1
1912 ; SSE41-NEXT: pmaxud %xmm5, %xmm6
1913 ; SSE41-NEXT: psubd %xmm5, %xmm6
1914 ; SSE41-NEXT: psubd %xmm4, %xmm1
1915 ; SSE41-NEXT: psubd %xmm3, %xmm7
1916 ; SSE41-NEXT: psubd %xmm2, %xmm0
1917 ; SSE41-NEXT: pxor %xmm2, %xmm2
1918 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
1919 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0],xmm2[1],xmm7[2],xmm2[3],xmm7[4],xmm2[5],xmm7[6],xmm2[7]
1920 ; SSE41-NEXT: packusdw %xmm7, %xmm0
1921 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
1922 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0],xmm2[1],xmm6[2],xmm2[3],xmm6[4],xmm2[5],xmm6[6],xmm2[7]
1923 ; SSE41-NEXT: packusdw %xmm6, %xmm1
1926 ; AVX1-LABEL: psubus_16i32_max:
1927 ; AVX1: # %bb.0: # %vector.ph
1928 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1929 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
1930 ; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
1931 ; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1
1932 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1933 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1934 ; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
1935 ; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2
1936 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1937 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1938 ; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2
1939 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1940 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1943 ; AVX2-LABEL: psubus_16i32_max:
1944 ; AVX2: # %bb.0: # %vector.ph
1945 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535]
1946 ; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
1947 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
1948 ; AVX2-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
1949 ; AVX2-NEXT: vpminud %ymm3, %ymm2, %ymm2
1950 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
1951 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1952 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
1953 ; AVX2-NEXT: vpsubusw %xmm2, %xmm3, %xmm2
1954 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1955 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1958 ; AVX512-LABEL: psubus_16i32_max:
1959 ; AVX512: # %bb.0: # %vector.ph
1960 ; AVX512-NEXT: vpmovusdw %zmm1, %ymm1
1961 ; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
; IR under test - trunc(umax(zext(%x), %y) - %y), with the unsigned max
; spelled as icmp ult + select.
1964 %lhs = zext <16 x i16> %x to <16 x i32>
1965 %cond = icmp ult <16 x i32> %lhs, %y
1966 %max = select <16 x i1> %cond, <16 x i32> %y, <16 x i32> %lhs
1967 %sub = sub <16 x i32> %max, %y
1968 %res = trunc <16 x i32> %sub to <16 x i16>
; psubus_i16_i32_max_swapped - zero-extends %x to <8 x i32>, selects the
; unsigned maximum of that value and %y, subtracts %y and truncates to
; <8 x i16>. The compare is written as `%y ult %lhs`, with the select operands
; ordered to match, i.e. the operands are swapped relative to `%lhs ult %y`.
; Every run except SSE2 is expected to finish with a single [v]psubusw; the
; SSE2 expectations keep a pcmpgtd-based select, psubd and packssdw.
1972 define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwind {
1973 ; SSE2-LABEL: psubus_i16_i32_max_swapped:
1974 ; SSE2: # %bb.0: # %vector.ph
1975 ; SSE2-NEXT: pxor %xmm3, %xmm3
1976 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1977 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1978 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1979 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
1980 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1981 ; SSE2-NEXT: pxor %xmm5, %xmm3
1982 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1983 ; SSE2-NEXT: por %xmm5, %xmm6
1984 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm3
1985 ; SSE2-NEXT: movdqa %xmm2, %xmm6
1986 ; SSE2-NEXT: pand %xmm3, %xmm6
1987 ; SSE2-NEXT: pandn %xmm0, %xmm3
1988 ; SSE2-NEXT: por %xmm6, %xmm3
1989 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1990 ; SSE2-NEXT: pxor %xmm5, %xmm0
1991 ; SSE2-NEXT: por %xmm4, %xmm5
1992 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
1993 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1994 ; SSE2-NEXT: pand %xmm0, %xmm5
1995 ; SSE2-NEXT: pandn %xmm4, %xmm0
1996 ; SSE2-NEXT: por %xmm5, %xmm0
1997 ; SSE2-NEXT: psubd %xmm1, %xmm0
1998 ; SSE2-NEXT: psubd %xmm2, %xmm3
1999 ; SSE2-NEXT: pslld $16, %xmm3
2000 ; SSE2-NEXT: psrad $16, %xmm3
2001 ; SSE2-NEXT: pslld $16, %xmm0
2002 ; SSE2-NEXT: psrad $16, %xmm0
2003 ; SSE2-NEXT: packssdw %xmm3, %xmm0
2006 ; SSSE3-LABEL: psubus_i16_i32_max_swapped:
2007 ; SSSE3: # %bb.0: # %vector.ph
2008 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2009 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
2010 ; SSSE3-NEXT: movdqa %xmm2, %xmm5
2011 ; SSSE3-NEXT: pxor %xmm4, %xmm5
2012 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
2013 ; SSSE3-NEXT: movdqa %xmm6, %xmm7
2014 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
2015 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
2016 ; SSSE3-NEXT: pand %xmm7, %xmm2
2017 ; SSSE3-NEXT: pandn %xmm5, %xmm7
2018 ; SSSE3-NEXT: por %xmm2, %xmm7
2019 ; SSSE3-NEXT: pshufb %xmm3, %xmm7
2020 ; SSSE3-NEXT: pxor %xmm1, %xmm4
2021 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
2022 ; SSSE3-NEXT: pand %xmm6, %xmm1
2023 ; SSSE3-NEXT: pandn %xmm5, %xmm6
2024 ; SSSE3-NEXT: por %xmm1, %xmm6
2025 ; SSSE3-NEXT: pshufb %xmm3, %xmm6
2026 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2027 ; SSSE3-NEXT: psubusw %xmm6, %xmm0
2030 ; SSE41-LABEL: psubus_i16_i32_max_swapped:
2031 ; SSE41: # %bb.0: # %vector.ph
2032 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
2033 ; SSE41-NEXT: pminud %xmm3, %xmm2
2034 ; SSE41-NEXT: pminud %xmm3, %xmm1
2035 ; SSE41-NEXT: packusdw %xmm2, %xmm1
2036 ; SSE41-NEXT: psubusw %xmm1, %xmm0
2039 ; AVX1-LABEL: psubus_i16_i32_max_swapped:
2040 ; AVX1: # %bb.0: # %vector.ph
2041 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2042 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
2043 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
2044 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
2045 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2046 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2047 ; AVX1-NEXT: vzeroupper
2050 ; AVX2-LABEL: psubus_i16_i32_max_swapped:
2051 ; AVX2: # %bb.0: # %vector.ph
2052 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
2053 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
2054 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2055 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2056 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2057 ; AVX2-NEXT: vzeroupper
2060 ; AVX512-LABEL: psubus_i16_i32_max_swapped:
2061 ; AVX512: # %bb.0: # %vector.ph
2062 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
2063 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2064 ; AVX512-NEXT: vzeroupper
; IR under test - trunc(umax(zext(%x), %y) - %y), with the compare operands
; in the order (%y, %lhs).
2067 %lhs = zext <8 x i16> %x to <8 x i32>
2068 %cond = icmp ult <8 x i32> %y, %lhs
2069 %max = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y
2070 %sub = sub <8 x i32> %max, %y
2071 %res = trunc <8 x i32> %sub to <8 x i16>
; psubus_i16_i32_min - zero-extends %x to <8 x i32>, selects the unsigned
; minimum of that value and %y (icmp ult + select), subtracts the minimum from
; the zero-extended %x and truncates the difference to <8 x i16>.
; Every run except SSE2 is expected to finish with a single [v]psubusw; the
; SSE2 expectations keep a pcmpgtd-based select, psubd and packssdw.
2075 define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind {
2076 ; SSE2-LABEL: psubus_i16_i32_min:
2077 ; SSE2: # %bb.0: # %vector.ph
2078 ; SSE2-NEXT: pxor %xmm4, %xmm4
2079 ; SSE2-NEXT: movdqa %xmm0, %xmm3
2080 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2081 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
2082 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
2083 ; SSE2-NEXT: movdqa %xmm2, %xmm5
2084 ; SSE2-NEXT: pxor %xmm4, %xmm5
2085 ; SSE2-NEXT: movdqa %xmm0, %xmm6
2086 ; SSE2-NEXT: por %xmm4, %xmm6
2087 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
2088 ; SSE2-NEXT: movdqa %xmm0, %xmm6
2089 ; SSE2-NEXT: pand %xmm5, %xmm6
2090 ; SSE2-NEXT: pandn %xmm2, %xmm5
2091 ; SSE2-NEXT: por %xmm6, %xmm5
2092 ; SSE2-NEXT: movdqa %xmm1, %xmm2
2093 ; SSE2-NEXT: pxor %xmm4, %xmm2
2094 ; SSE2-NEXT: por %xmm3, %xmm4
2095 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
2096 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2097 ; SSE2-NEXT: pand %xmm2, %xmm4
2098 ; SSE2-NEXT: pandn %xmm1, %xmm2
2099 ; SSE2-NEXT: por %xmm4, %xmm2
2100 ; SSE2-NEXT: psubd %xmm2, %xmm3
2101 ; SSE2-NEXT: psubd %xmm5, %xmm0
2102 ; SSE2-NEXT: pslld $16, %xmm0
2103 ; SSE2-NEXT: psrad $16, %xmm0
2104 ; SSE2-NEXT: pslld $16, %xmm3
2105 ; SSE2-NEXT: psrad $16, %xmm3
2106 ; SSE2-NEXT: packssdw %xmm0, %xmm3
2107 ; SSE2-NEXT: movdqa %xmm3, %xmm0
2110 ; SSSE3-LABEL: psubus_i16_i32_min:
2111 ; SSSE3: # %bb.0: # %vector.ph
2112 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2113 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
2114 ; SSSE3-NEXT: movdqa %xmm2, %xmm5
2115 ; SSSE3-NEXT: pxor %xmm4, %xmm5
2116 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
2117 ; SSSE3-NEXT: movdqa %xmm6, %xmm7
2118 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
2119 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
2120 ; SSSE3-NEXT: pand %xmm7, %xmm2
2121 ; SSSE3-NEXT: pandn %xmm5, %xmm7
2122 ; SSSE3-NEXT: por %xmm2, %xmm7
2123 ; SSSE3-NEXT: pshufb %xmm3, %xmm7
2124 ; SSSE3-NEXT: pxor %xmm1, %xmm4
2125 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
2126 ; SSSE3-NEXT: pand %xmm6, %xmm1
2127 ; SSSE3-NEXT: pandn %xmm5, %xmm6
2128 ; SSSE3-NEXT: por %xmm1, %xmm6
2129 ; SSSE3-NEXT: pshufb %xmm3, %xmm6
2130 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2131 ; SSSE3-NEXT: psubusw %xmm6, %xmm0
2134 ; SSE41-LABEL: psubus_i16_i32_min:
2135 ; SSE41: # %bb.0: # %vector.ph
2136 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
2137 ; SSE41-NEXT: pminud %xmm3, %xmm2
2138 ; SSE41-NEXT: pminud %xmm3, %xmm1
2139 ; SSE41-NEXT: packusdw %xmm2, %xmm1
2140 ; SSE41-NEXT: psubusw %xmm1, %xmm0
2143 ; AVX1-LABEL: psubus_i16_i32_min:
2144 ; AVX1: # %bb.0: # %vector.ph
2145 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2146 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
2147 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
2148 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
2149 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2150 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2151 ; AVX1-NEXT: vzeroupper
2154 ; AVX2-LABEL: psubus_i16_i32_min:
2155 ; AVX2: # %bb.0: # %vector.ph
2156 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
2157 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
2158 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2159 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2160 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2161 ; AVX2-NEXT: vzeroupper
2164 ; AVX512-LABEL: psubus_i16_i32_min:
2165 ; AVX512: # %bb.0: # %vector.ph
2166 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
2167 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2168 ; AVX512-NEXT: vzeroupper
; IR under test - trunc(zext(%x) - umin(zext(%x), %y)), with the unsigned min
; spelled as icmp ult + select.
2171 %lhs = zext <8 x i16> %x to <8 x i32>
2172 %cond = icmp ult <8 x i32> %lhs, %y
2173 %min = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y
2174 %sub = sub <8 x i32> %lhs, %min
2175 %res = trunc <8 x i32> %sub to <8 x i16>
; subus_v8i8 - loads <8 x i8> vectors from %p1 and %p2, computes
; %ld1 - %ld2 in lanes where %ld1 is unsigned-greater than %ld2 and zero in
; the other lanes, and stores the result back through %p1.
; All runs expect two 64-bit loads and one [v]psubusb. The AVX512 run also
; expects a vpmovzxbw followed by a truncating vpmovwb store, where the other
; runs store with [v]movq.
2179 define void @subus_v8i8(<8 x i8>* %p1, <8 x i8>* %p2) {
2180 ; SSE-LABEL: subus_v8i8:
2182 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2183 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
2184 ; SSE-NEXT: psubusb %xmm1, %xmm0
2185 ; SSE-NEXT: movq %xmm0, (%rdi)
2188 ; AVX1-LABEL: subus_v8i8:
2190 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2191 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2192 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2193 ; AVX1-NEXT: vmovq %xmm0, (%rdi)
2196 ; AVX2-LABEL: subus_v8i8:
2198 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2199 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2200 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2201 ; AVX2-NEXT: vmovq %xmm0, (%rdi)
2204 ; AVX512-LABEL: subus_v8i8:
2206 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2207 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2208 ; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2209 ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2210 ; AVX512-NEXT: vpmovwb %xmm0, (%rdi)
; IR under test - select(%ld1 ugt %ld2, %ld1 - %ld2, 0), stored to %p1.
2212 %ld1 = load <8 x i8>, <8 x i8>* %p1, align 8
2213 %ld2 = load <8 x i8>, <8 x i8>* %p2, align 8
2214 %1 = sub <8 x i8> %ld1, %ld2
2215 %2 = icmp ugt <8 x i8> %ld1, %ld2
2216 %sh3 = select <8 x i1> %2, <8 x i8> %1, <8 x i8> zeroinitializer
2217 store <8 x i8> %sh3, <8 x i8>* %p1, align 8
; subus_v4i8 - loads <4 x i8> vectors from %p1 and %p2, computes
; %ld1 - %ld2 in lanes where %ld1 is unsigned-greater than %ld2 and zero in
; the other lanes, and stores the result back through %p1.
; All runs expect two 32-bit loads and one [v]psubusb. The AVX512 run also
; expects a vpmovzxbd followed by a truncating vpmovdb store, where the other
; runs store with [v]movd.
2221 define void @subus_v4i8(<4 x i8>* %p1, <4 x i8>* %p2) {
2222 ; SSE-LABEL: subus_v4i8:
2224 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2225 ; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2226 ; SSE-NEXT: psubusb %xmm1, %xmm0
2227 ; SSE-NEXT: movd %xmm0, (%rdi)
2230 ; AVX1-LABEL: subus_v4i8:
2232 ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2233 ; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2234 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2235 ; AVX1-NEXT: vmovd %xmm0, (%rdi)
2238 ; AVX2-LABEL: subus_v4i8:
2240 ; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2241 ; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2242 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2243 ; AVX2-NEXT: vmovd %xmm0, (%rdi)
2246 ; AVX512-LABEL: subus_v4i8:
2248 ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2249 ; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2250 ; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2251 ; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2252 ; AVX512-NEXT: vpmovdb %xmm0, (%rdi)
; IR under test - select(%ld1 ugt %ld2, %ld1 - %ld2, 0), stored to %p1.
2254 %ld1 = load <4 x i8>, <4 x i8>* %p1, align 8
2255 %ld2 = load <4 x i8>, <4 x i8>* %p2, align 8
2256 %1 = sub <4 x i8> %ld1, %ld2
2257 %2 = icmp ugt <4 x i8> %ld1, %ld2
2258 %sh3 = select <4 x i1> %2, <4 x i8> %1, <4 x i8> zeroinitializer
2259 store <4 x i8> %sh3, <4 x i8>* %p1, align 8
; subus_v2i8 - loads <2 x i8> vectors from %p1 and %p2, computes
; %ld1 - %ld2 in lanes where %ld1 is unsigned-greater than %ld2 and zero in
; the other lanes, and stores the result back through %p1.
; All runs expect two [v]movd loads and one [v]psubusb. The expected store
; differs per run - movd to %eax plus movw for SSE2 and SSSE3, [v]pextrw
; straight to memory for SSE41, AVX1 and AVX2, and vpmovzxbq plus a truncating
; vpmovqb store for AVX512.
2263 define void @subus_v2i8(<2 x i8>* %p1, <2 x i8>* %p2) {
2264 ; SSE2-LABEL: subus_v2i8:
2266 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2267 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2268 ; SSE2-NEXT: psubusb %xmm1, %xmm0
2269 ; SSE2-NEXT: movd %xmm0, %eax
2270 ; SSE2-NEXT: movw %ax, (%rdi)
2273 ; SSSE3-LABEL: subus_v2i8:
2275 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2276 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2277 ; SSSE3-NEXT: psubusb %xmm1, %xmm0
2278 ; SSSE3-NEXT: movd %xmm0, %eax
2279 ; SSSE3-NEXT: movw %ax, (%rdi)
2282 ; SSE41-LABEL: subus_v2i8:
2284 ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2285 ; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2286 ; SSE41-NEXT: psubusb %xmm1, %xmm0
2287 ; SSE41-NEXT: pextrw $0, %xmm0, (%rdi)
2290 ; AVX1-LABEL: subus_v2i8:
2292 ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2293 ; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2294 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2295 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi)
2298 ; AVX2-LABEL: subus_v2i8:
2300 ; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2301 ; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2302 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2303 ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi)
2306 ; AVX512-LABEL: subus_v2i8:
2308 ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2309 ; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2310 ; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2311 ; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
2312 ; AVX512-NEXT: vpmovqb %xmm0, (%rdi)
; IR under test - select(%ld1 ugt %ld2, %ld1 - %ld2, 0), stored to %p1.
2314 %ld1 = load <2 x i8>, <2 x i8>* %p1, align 8
2315 %ld2 = load <2 x i8>, <2 x i8>* %p2, align 8
2316 %1 = sub <2 x i8> %ld1, %ld2
2317 %2 = icmp ugt <2 x i8> %ld1, %ld2
2318 %sh3 = select <2 x i1> %2, <2 x i8> %1, <2 x i8> zeroinitializer
2319 store <2 x i8> %sh3, <2 x i8>* %p1, align 8
; subus_v4i16 - loads <4 x i16> vectors from %p1 and %p2, computes
; %ld1 - %ld2 in lanes where %ld1 is unsigned-greater than %ld2 and zero in
; the other lanes, and stores the result back through %p1.
; All runs expect two 64-bit loads and one [v]psubusw. The AVX512 run also
; expects a vpmovzxwd followed by a truncating vpmovdw store, where the other
; runs store with [v]movq.
2323 define void @subus_v4i16(<4 x i16>* %p1, <4 x i16>* %p2) {
2324 ; SSE-LABEL: subus_v4i16:
2326 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2327 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
2328 ; SSE-NEXT: psubusw %xmm1, %xmm0
2329 ; SSE-NEXT: movq %xmm0, (%rdi)
2332 ; AVX1-LABEL: subus_v4i16:
2334 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2335 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2336 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2337 ; AVX1-NEXT: vmovq %xmm0, (%rdi)
2340 ; AVX2-LABEL: subus_v4i16:
2342 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2343 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2344 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2345 ; AVX2-NEXT: vmovq %xmm0, (%rdi)
2348 ; AVX512-LABEL: subus_v4i16:
2350 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2351 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2352 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2353 ; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2354 ; AVX512-NEXT: vpmovdw %xmm0, (%rdi)
; IR under test - select(%ld1 ugt %ld2, %ld1 - %ld2, 0), stored to %p1.
2356 %ld1 = load <4 x i16>, <4 x i16>* %p1, align 8
2357 %ld2 = load <4 x i16>, <4 x i16>* %p2, align 8
2358 %1 = sub <4 x i16> %ld1, %ld2
2359 %2 = icmp ugt <4 x i16> %ld1, %ld2
2360 %sh3 = select <4 x i1> %2, <4 x i16> %1, <4 x i16> zeroinitializer
2361 store <4 x i16> %sh3, <4 x i16>* %p1, align 8
; subus_v2i16 - loads <2 x i16> vectors from %p1 and %p2, computes
; %ld1 - %ld2 in lanes where %ld1 is unsigned-greater than %ld2 and zero in
; the other lanes, and stores the result back through %p1.
; All runs expect two 32-bit loads and one [v]psubusw. The AVX512 run also
; expects a vpmovzxwq followed by a truncating vpmovqw store, where the other
; runs store with [v]movd.
2365 define void @subus_v2i16(<2 x i16>* %p1, <2 x i16>* %p2) {
2366 ; SSE-LABEL: subus_v2i16:
2368 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2369 ; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2370 ; SSE-NEXT: psubusw %xmm1, %xmm0
2371 ; SSE-NEXT: movd %xmm0, (%rdi)
2374 ; AVX1-LABEL: subus_v2i16:
2376 ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2377 ; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2378 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2379 ; AVX1-NEXT: vmovd %xmm0, (%rdi)
2382 ; AVX2-LABEL: subus_v2i16:
2384 ; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2385 ; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2386 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2387 ; AVX2-NEXT: vmovd %xmm0, (%rdi)
2390 ; AVX512-LABEL: subus_v2i16:
2392 ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2393 ; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2394 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2395 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2396 ; AVX512-NEXT: vpmovqw %xmm0, (%rdi)
; IR under test - select(%ld1 ugt %ld2, %ld1 - %ld2, 0), stored to %p1.
2398 %ld1 = load <2 x i16>, <2 x i16>* %p1, align 8
2399 %ld2 = load <2 x i16>, <2 x i16>* %p2, align 8
2400 %1 = sub <2 x i16> %ld1, %ld2
2401 %2 = icmp ugt <2 x i16> %ld1, %ld2
2402 %sh3 = select <2 x i1> %2, <2 x i16> %1, <2 x i16> zeroinitializer
2403 store <2 x i16> %sh3, <2 x i16>* %p1, align 8
; test19 - selects the unsigned maximum of %x and a splat of 70 (icmp ugt +
; select), then adds a splat of -70. Both the SSE and AVX expectations are a
; single [v]psubusb whose other operand is a RIP-relative memory load.
2407 define <16 x i8> @test19(<16 x i8> %x) {
2408 ; SSE-LABEL: test19:
2409 ; SSE: # %bb.0: # %entry
2410 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
2413 ; AVX-LABEL: test19:
2414 ; AVX: # %bb.0: # %entry
2415 ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; IR under test - umax(%x, 70) + (-70) on every i8 lane.
2418 %0 = icmp ugt <16 x i8> %x, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
2419 %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
2420 %2 = add <16 x i8> %1, <i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70>
; test20 - umax(x, C) + (-C) on <16 x i8> with a non-splat constant vector C.
; The constant used by the add is the lane-by-lane negation of the constant
; used by the compare and the select, so each lane is an unsigned saturating
; subtract by its own amount. Every target is expected to emit one (v)psubusb
; with a RIP-relative constant operand.
2424 define <16 x i8> @test20(<16 x i8> %x) {
2425 ; SSE-LABEL: test20:
2426 ; SSE: # %bb.0: # %entry
2427 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
2430 ; AVX-LABEL: test20:
2431 ; AVX: # %bb.0: # %entry
2432 ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
2435 %0 = icmp ugt <16 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70>
2436 %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70>
2437 %2 = add <16 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70>
; test21 - umax(x, C) + (-C) with a splat constant C = 700 on <8 x i16>.
; This is the 16-bit counterpart of the splat byte case: the compare and
; select form an unsigned max against 700 and the add subtracts 700 again.
; Every target is expected to emit one (v)psubusw with a RIP-relative
; constant operand.
2441 define <8 x i16> @test21(<8 x i16> %x) {
2442 ; SSE-LABEL: test21:
2443 ; SSE: # %bb.0: # %entry
2444 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
2447 ; AVX-LABEL: test21:
2448 ; AVX: # %bb.0: # %entry
2449 ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
2452 %0 = icmp ugt <8 x i16> %x, <i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700>
2453 %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700>
2454 %2 = add <8 x i16> %1, <i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700>
; test22 - umax(x, C) + (-C) on <8 x i16> with a non-splat constant vector C.
; The add constant is the lane-by-lane negation of the compare/select
; constant, giving a per-lane unsigned saturating subtract. Every target is
; expected to emit one (v)psubusw with a RIP-relative constant operand.
2458 define <8 x i16> @test22(<8 x i16> %x) {
2459 ; SSE-LABEL: test22:
2460 ; SSE: # %bb.0: # %entry
2461 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
2464 ; AVX-LABEL: test22:
2465 ; AVX: # %bb.0: # %entry
2466 ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
2469 %0 = icmp ugt <8 x i16> %x, <i16 1, i16 -22000, i16 -770, i16 98, i16 19, i16 1000, i16 3456, i16 70>
2470 %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 1, i16 -22000, i16 -770, i16 98, i16 19, i16 1000, i16 3456, i16 70>
2471 %2 = add <8 x i16> %1, <i16 -1, i16 22000, i16 770, i16 -98, i16 -19, i16 -1000, i16 -3456, i16 -70>
; test23 - umax(x, C) + (-C) with a splat constant C = 70 on a 256-bit
; <32 x i8> vector.
; Expected lowering per target, as recorded in the checks below.
;  - SSE loads the splat once into xmm2 and applies psubusb to xmm0 and xmm1.
;  - AVX1 extracts the upper 128-bit half, applies vpsubusb to both halves
;    with a shared xmm2 constant, and reassembles the ymm with vinsertf128.
;  - AVX2 and AVX512 use a single 256-bit vpsubusb with a memory operand.
2475 define <32 x i8> @test23(<32 x i8> %x) {
2476 ; SSE-LABEL: test23:
2477 ; SSE: # %bb.0: # %entry
2478 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70]
2479 ; SSE-NEXT: psubusb %xmm2, %xmm0
2480 ; SSE-NEXT: psubusb %xmm2, %xmm1
2483 ; AVX1-LABEL: test23:
2484 ; AVX1: # %bb.0: # %entry
2485 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2486 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70]
2487 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
2488 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
2489 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2492 ; AVX2-LABEL: test23:
2493 ; AVX2: # %bb.0: # %entry
2494 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
2497 ; AVX512-LABEL: test23:
2498 ; AVX512: # %bb.0: # %entry
2499 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
2502 %0 = icmp ugt <32 x i8> %x, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
2503 %1 = select <32 x i1> %0, <32 x i8> %x, <32 x i8> <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
2504 %2 = add <32 x i8> %1, <i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70>
; test24 - umax(x, C) + (-C) on <32 x i8> with a non-splat constant vector C.
; Lanes 16-31 of the constant differ from lanes 0-15 (for example 2/-23/-49
; versus 1/-22/-50, and 110 versus 100), so the two 128-bit halves need
; different constants. The add constant is the lane-by-lane negation of the
; compare/select constant.
; The SSE and AVX1 checks use a separate memory operand for each 128-bit half,
; and the AVX2 and AVX512 checks use a single 256-bit vpsubusb.
2508 define <32 x i8> @test24(<32 x i8> %x) {
2509 ; SSE-LABEL: test24:
2510 ; SSE: # %bb.0: # %entry
2511 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
2512 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm1
2515 ; AVX1-LABEL: test24:
2516 ; AVX1: # %bb.0: # %entry
2517 ; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm1
2518 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2519 ; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
2520 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2523 ; AVX2-LABEL: test24:
2524 ; AVX2: # %bb.0: # %entry
2525 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
2528 ; AVX512-LABEL: test24:
2529 ; AVX512: # %bb.0: # %entry
2530 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
2533 %0 = icmp ugt <32 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2534 %1 = select <32 x i1> %0, <32 x i8> %x, <32 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2535 %2 = add <32 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70>
; test25 - umax(x, C) + (-C) with a splat constant C = 5000 on a 256-bit
; <16 x i16> vector.
; Expected lowering per target, as recorded in the checks below.
;  - SSE loads the splat once into xmm2 and applies psubusw to xmm0 and xmm1.
;  - AVX1 splits the ymm into 128-bit halves, applies vpsubusw to each with a
;    shared xmm2 constant, and rejoins them with vinsertf128.
;  - AVX2 and AVX512 use a single 256-bit vpsubusw with a memory operand.
2539 define <16 x i16> @test25(<16 x i16> %x) {
2540 ; SSE-LABEL: test25:
2541 ; SSE: # %bb.0: # %entry
2542 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000]
2543 ; SSE-NEXT: psubusw %xmm2, %xmm0
2544 ; SSE-NEXT: psubusw %xmm2, %xmm1
2547 ; AVX1-LABEL: test25:
2548 ; AVX1: # %bb.0: # %entry
2549 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2550 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000]
2551 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
2552 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
2553 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2556 ; AVX2-LABEL: test25:
2557 ; AVX2: # %bb.0: # %entry
2558 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
2561 ; AVX512-LABEL: test25:
2562 ; AVX512: # %bb.0: # %entry
2563 ; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
2566 %0 = icmp ugt <16 x i16> %x, <i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000>
2567 %1 = select <16 x i1> %0, <16 x i16> %x, <16 x i16> <i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000>
2568 %2 = add <16 x i16> %1, <i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000>
; test26 - umax(x, C) + (-C) on <16 x i16> with a non-splat constant vector C.
; The add constant is the lane-by-lane negation of the compare/select
; constant, giving a per-lane unsigned saturating subtract.
; The SSE and AVX1 checks use a separate memory operand for each 128-bit half,
; and the AVX2 and AVX512 checks use a single 256-bit vpsubusw.
2572 define <16 x i16> @test26(<16 x i16> %x) {
2573 ; SSE-LABEL: test26:
2574 ; SSE: # %bb.0: # %entry
2575 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
2576 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm1
2579 ; AVX1-LABEL: test26:
2580 ; AVX1: # %bb.0: # %entry
2581 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm1
2582 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2583 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
2584 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2587 ; AVX2-LABEL: test26:
2588 ; AVX2: # %bb.0: # %entry
2589 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
2592 ; AVX512-LABEL: test26:
2593 ; AVX512: # %bb.0: # %entry
2594 ; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
2597 %0 = icmp ugt <16 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70>
2598 %1 = select <16 x i1> %0, <16 x i16> %x, <16 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70>
2599 %2 = add <16 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9800, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -34, i16 -55, i16 -70>
2603 define <64 x i8> @test27(<64 x i8> %x) {
2604 ; SSE-LABEL: test27:
2605 ; SSE: # %bb.0: # %entry
2606 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
2607 ; SSE-NEXT: psubusb %xmm4, %xmm0
2608 ; SSE-NEXT: psubusb %xmm4, %xmm1
2609 ; SSE-NEXT: psubusb %xmm4, %xmm2
2610 ; SSE-NEXT: psubusb %xmm4, %xmm3
2613 ; AVX1-LABEL: test27:
2614 ; AVX1: # %bb.0: # %entry
2615 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2616 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
2617 ; AVX1-NEXT: vpsubusb %xmm3, %xmm2, %xmm2
2618 ; AVX1-NEXT: vpsubusb %xmm3, %xmm0, %xmm0
2619 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2620 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2621 ; AVX1-NEXT: vpsubusb %xmm3, %xmm2, %xmm2
2622 ; AVX1-NEXT: vpsubusb %xmm3, %xmm1, %xmm1
2623 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2626 ; AVX2-LABEL: test27:
2627 ; AVX2: # %bb.0: # %entry
2628 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
2629 ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
2630 ; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1
2633 ; AVX512-LABEL: test27:
2634 ; AVX512: # %bb.0: # %entry
2635 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %zmm0, %zmm0
2638 %0 = icmp ugt <64 x i8> %x, <i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154>
2639 %1 = select <64 x i1> %0, <64 x i8> %x, <64 x i8> <i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154>
2640 %2 = add <64 x i8> %1, <i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154>
; test28 - umax(x, C) + (-C) on <64 x i8> with a non-splat constant vector C.
; Lanes 0-15 and lanes 32-47 of the constant are identical, while lanes 16-31
; and lanes 48-63 differ from each other (-114 versus -116, and 63 versus 67).
; The add constant is the lane-by-lane negation of the compare/select constant.
; Expected lowering per target, as recorded in the checks below.
;  - SSE and AVX1 load the repeated 16-byte constant once into a register and
;    reuse it for two of the four 128-bit pieces. The other two pieces each
;    take their own memory operand.
;  - AVX2 uses one 256-bit vpsubusb with a memory operand per ymm.
;  - AVX512 uses a single 512-bit vpsubusb with a memory operand.
2644 define <64 x i8> @test28(<64 x i8> %x) {
2645 ; SSE-LABEL: test28:
2646 ; SSE: # %bb.0: # %entry
2647 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1,234,206,142,179,186,123,98,63,19,234,100,25,34,55,70]
2648 ; SSE-NEXT: psubusb %xmm4, %xmm0
2649 ; SSE-NEXT: psubusb %xmm4, %xmm2
2650 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm1
2651 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm3
2654 ; AVX1-LABEL: test28:
2655 ; AVX1: # %bb.0: # %entry
2656 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,234,206,142,179,186,123,98,63,19,234,100,25,34,55,70]
2657 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm3
2658 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2659 ; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
2660 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
2661 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm2
2662 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2663 ; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm1, %xmm1
2664 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2667 ; AVX2-LABEL: test28:
2668 ; AVX2: # %bb.0: # %entry
2669 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
2670 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm1, %ymm1
2673 ; AVX512-LABEL: test28:
2674 ; AVX512: # %bb.0: # %entry
2675 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %zmm0, %zmm0
2678 %0 = icmp ugt <64 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70, i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -116, i8 -77, i8 -70, i8 123, i8 98, i8 67, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2679 %1 = select <64 x i1> %0, <64 x i8> %x, <64 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70, i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -116, i8 -77, i8 -70, i8 123, i8 98, i8 67, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2680 %2 = add <64 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70, i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 116, i8 77, i8 70, i8 -123, i8 -98, i8 -67, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70>
; test29 - umax(x, C) + (-C) on <32 x i16> with a non-splat constant vector C.
; Lanes 16-31 of the constant repeat lanes 0-15 except for two entries (9805
; versus 9800, and 346 versus 34). The add constant is the lane-by-lane
; negation of the compare/select constant.
; Expected lowering per target, as recorded in the checks below.
;  - SSE uses four psubusw, each with its own memory operand.
;  - AVX1 splits each ymm into 128-bit halves, each with a memory operand.
;  - AVX2 uses one 256-bit vpsubusw with a memory operand per ymm.
;  - AVX512 uses a single 512-bit vpsubusw with a memory operand.
2684 define <32 x i16> @test29(<32 x i16> %x) {
2685 ; SSE-LABEL: test29:
2686 ; SSE: # %bb.0: # %entry
2687 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
2688 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm1
2689 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm2
2690 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm3
2693 ; AVX1-LABEL: test29:
2694 ; AVX1: # %bb.0: # %entry
2695 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm2
2696 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2697 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
2698 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2699 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm1, %xmm2
2700 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2701 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm1, %xmm1
2702 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2705 ; AVX2-LABEL: test29:
2706 ; AVX2: # %bb.0: # %entry
2707 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
2708 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm1, %ymm1
2711 ; AVX512-LABEL: test29:
2712 ; AVX512: # %bb.0: # %entry
2713 ; AVX512-NEXT: vpsubusw {{.*}}(%rip), %zmm0, %zmm0
2716 %0 = icmp ugt <32 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70, i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9805, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 346, i16 55, i16 70>
2717 %1 = select <32 x i1> %0, <32 x i16> %x, <32 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70, i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9805, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 346, i16 55, i16 70>
2718 %2 = add <32 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9800, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -34, i16 -55, i16 -70, i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9805, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -346, i16 -55, i16 -70>
; test30 - umax(x, C) + (-C) on <8 x i16> where lanes 4-7 of every constant
; operand (compare, select and add) are undef.
; Only lanes 0-3 carry defined constants. The vector result is bitcast to
; <2 x i64> and element 0 is extracted, which spans the same 64 bits as
; lanes 0-3.
; Every target is still expected to emit one (v)psubusw with a memory operand,
; followed by a (v)movq of the low 64 bits into rax.
2723 define i64 @test30(<8 x i16> %x) {
2724 ; SSE-LABEL: test30:
2725 ; SSE: # %bb.0: # %entry
2726 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
2727 ; SSE-NEXT: movq %xmm0, %rax
2730 ; AVX-LABEL: test30:
2731 ; AVX: # %bb.0: # %entry
2732 ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
2733 ; AVX-NEXT: vmovq %xmm0, %rax
2736 %0 = icmp ugt <8 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 undef, i16 undef, i16 undef, i16 undef>
2737 %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 undef, i16 undef, i16 undef, i16 undef>
2738 %2 = add <8 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 undef, i16 undef, i16 undef, i16 undef>
2739 %3 = bitcast <8 x i16> %2 to <2 x i64>
2740 %4 = extractelement <2 x i64> %3, i32 0
2745 define i64 @test31(<2 x i64> %x) {
2746 ; SSE-LABEL: test31:
2748 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
2749 ; SSE-NEXT: movq %xmm0, %rax
2752 ; AVX-LABEL: test31:
2754 ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
2755 ; AVX-NEXT: vmovq %xmm0, %rax
2757 %t0 = bitcast <2 x i64> %x to <16 x i8>
2758 %cmp = icmp ugt <16 x i8> %t0, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
2759 %bop = add <16 x i8> %t0, <i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
2760 %sel = select <16 x i1> %cmp, <16 x i8> %bop, <16 x i8> zeroinitializer
2761 %bc = bitcast <16 x i8> %sel to <2 x i64>
2762 %ext = extractelement <2 x i64> %bc, i32 0