1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2OR3,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2OR3,SSSE3
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
; <8 x i16> sign-bit pattern, select(x <s 0, x ^ 0x8000, 0). Lanes with the
; sign bit set have it cleared and every other lane becomes zero, which is an
; unsigned saturating subtract of 32768. The checks expect one (v)psubusw with
; a constant-pool operand.
12 define <8 x i16> @test1(<8 x i16> %x) nounwind {
14 ; SSE: # %bb.0: # %vector.ph
15 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
19 ; AVX: # %bb.0: # %vector.ph
20 ; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
23 %0 = icmp slt <8 x i16> %x, zeroinitializer
24 %1 = xor <8 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
25 %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
; <8 x i16> compare-and-add pattern, select(x >u 32766, x + (-32767), 0).
; This is an unsigned saturating subtract of 32767. The checks expect one
; (v)psubusw with a constant-pool operand.
29 define <8 x i16> @test2(<8 x i16> %x) nounwind {
31 ; SSE: # %bb.0: # %vector.ph
32 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
36 ; AVX: # %bb.0: # %vector.ph
37 ; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
40 %0 = icmp ugt <8 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
41 %1 = add <8 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
42 %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
; <8 x i16> variable pattern, select(x <u splat(w), 0, x - splat(w)), where the
; scalar argument w is broadcast to every lane. The checks expect the splat to
; be materialised (movd plus shuffles, or vpbroadcastw) and followed by a
; single (v)psubusw.
46 define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
48 ; SSE: # %bb.0: # %vector.ph
49 ; SSE-NEXT: movd %edi, %xmm1
50 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
51 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
52 ; SSE-NEXT: psubusw %xmm1, %xmm0
56 ; AVX1: # %bb.0: # %vector.ph
57 ; AVX1-NEXT: vmovd %edi, %xmm1
58 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
59 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
60 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
64 ; AVX2: # %bb.0: # %vector.ph
65 ; AVX2-NEXT: vmovd %edi, %xmm1
66 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
67 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
70 ; AVX512-LABEL: test3:
71 ; AVX512: # %bb.0: # %vector.ph
72 ; AVX512-NEXT: vpbroadcastw %edi, %xmm1
73 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
76 %0 = insertelement <8 x i16> undef, i16 %w, i32 0
77 %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
78 %1 = icmp ult <8 x i16> %x, %broadcast15
79 %2 = sub <8 x i16> %x, %broadcast15
80 %res = select <8 x i1> %1, <8 x i16> zeroinitializer, <8 x i16> %2
; <16 x i8> sign-bit pattern, select(x <s 0, x ^ 0x80, 0), which is an unsigned
; saturating subtract of 128. The checks expect one (v)psubusb with a
; constant-pool operand.
84 define <16 x i8> @test4(<16 x i8> %x) nounwind {
86 ; SSE: # %bb.0: # %vector.ph
87 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
91 ; AVX: # %bb.0: # %vector.ph
92 ; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
95 %0 = icmp slt <16 x i8> %x, zeroinitializer
96 %1 = xor <16 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
97 %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
; <16 x i8> compare-and-add pattern, select(x >u 126, x + (-127), 0), which is
; an unsigned saturating subtract of 127. The checks expect one (v)psubusb with
; a constant-pool operand.
101 define <16 x i8> @test5(<16 x i8> %x) nounwind {
103 ; SSE: # %bb.0: # %vector.ph
104 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
108 ; AVX: # %bb.0: # %vector.ph
109 ; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
112 %0 = icmp ugt <16 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
113 %1 = add <16 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
114 %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
; <16 x i8> variable pattern, select(x <u splat(w), 0, x - splat(w)), where the
; scalar argument w is broadcast to every lane. The checks expect the byte
; splat to be built per subtarget (unpack plus shuffles with plain SSE2, pshufb
; with a zero mask from SSSE3 up to AVX1, vpbroadcastb with AVX2 and AVX512)
; and followed by a single (v)psubusb.
118 define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
120 ; SSE2: # %bb.0: # %vector.ph
121 ; SSE2-NEXT: movd %edi, %xmm1
122 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
123 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
124 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
125 ; SSE2-NEXT: psubusb %xmm1, %xmm0
128 ; SSSE3-LABEL: test6:
129 ; SSSE3: # %bb.0: # %vector.ph
130 ; SSSE3-NEXT: movd %edi, %xmm1
131 ; SSSE3-NEXT: pxor %xmm2, %xmm2
132 ; SSSE3-NEXT: pshufb %xmm2, %xmm1
133 ; SSSE3-NEXT: psubusb %xmm1, %xmm0
136 ; SSE41-LABEL: test6:
137 ; SSE41: # %bb.0: # %vector.ph
138 ; SSE41-NEXT: movd %edi, %xmm1
139 ; SSE41-NEXT: pxor %xmm2, %xmm2
140 ; SSE41-NEXT: pshufb %xmm2, %xmm1
141 ; SSE41-NEXT: psubusb %xmm1, %xmm0
145 ; AVX1: # %bb.0: # %vector.ph
146 ; AVX1-NEXT: vmovd %edi, %xmm1
147 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
148 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
149 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
153 ; AVX2: # %bb.0: # %vector.ph
154 ; AVX2-NEXT: vmovd %edi, %xmm1
155 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
156 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
159 ; AVX512-LABEL: test6:
160 ; AVX512: # %bb.0: # %vector.ph
161 ; AVX512-NEXT: vpbroadcastb %edi, %xmm1
162 ; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
165 %0 = insertelement <16 x i8> undef, i8 %w, i32 0
166 %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
167 %1 = icmp ult <16 x i8> %x, %broadcast15
168 %2 = sub <16 x i8> %x, %broadcast15
169 %res = select <16 x i1> %1, <16 x i8> zeroinitializer, <16 x i8> %2
; 256-bit (<16 x i16>) sign-bit pattern, select(x <s 0, x ^ 0x8000, 0), which
; is an unsigned saturating subtract of 32768. The checks expect two 128-bit
; psubusw ops sharing one splat constant on SSE and AVX1 targets (AVX1 wraps
; them in an extract/insert pair), and a single 256-bit vpsubusw on AVX2 and
; AVX512 targets.
173 define <16 x i16> @test7(<16 x i16> %x) nounwind {
175 ; SSE: # %bb.0: # %vector.ph
176 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
177 ; SSE-NEXT: psubusw %xmm2, %xmm0
178 ; SSE-NEXT: psubusw %xmm2, %xmm1
182 ; AVX1: # %bb.0: # %vector.ph
183 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
184 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
185 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
186 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
187 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
191 ; AVX2: # %bb.0: # %vector.ph
192 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
195 ; AVX512-LABEL: test7:
196 ; AVX512: # %bb.0: # %vector.ph
197 ; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
200 %0 = icmp slt <16 x i16> %x, zeroinitializer
201 %1 = xor <16 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
202 %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
; 256-bit (<16 x i16>) compare-and-add pattern, select(x >u 32766, x + (-32767), 0),
; which is an unsigned saturating subtract of 32767. The checks expect two
; 128-bit psubusw ops sharing one splat constant on SSE and AVX1 targets, and a
; single 256-bit vpsubusw on AVX2 and AVX512 targets.
206 define <16 x i16> @test8(<16 x i16> %x) nounwind {
208 ; SSE: # %bb.0: # %vector.ph
209 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
210 ; SSE-NEXT: psubusw %xmm2, %xmm0
211 ; SSE-NEXT: psubusw %xmm2, %xmm1
215 ; AVX1: # %bb.0: # %vector.ph
216 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
217 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
218 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
219 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
220 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
224 ; AVX2: # %bb.0: # %vector.ph
225 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
228 ; AVX512-LABEL: test8:
229 ; AVX512: # %bb.0: # %vector.ph
230 ; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
233 %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
234 %1 = add <16 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
235 %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
; 256-bit (<16 x i16>) compare-and-add pattern with a different constant in
; every lane. Lane i computes select(x >u 32766 - i, x + (-32767 + i), 0), an
; unsigned saturating subtract of 32767 - i. Because the two 128-bit halves
; need different constants, the SSE and AVX1 checks expect two psubusw ops that
; each take their own constant-pool operand; AVX2 and AVX512 targets expect a
; single 256-bit vpsubusw.
239 define <16 x i16> @test8a(<16 x i16> %x) nounwind {
241 ; SSE: # %bb.0: # %vector.ph
242 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
243 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
246 ; AVX1-LABEL: test8a:
247 ; AVX1: # %bb.0: # %vector.ph
248 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
249 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
250 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
251 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
254 ; AVX2-LABEL: test8a:
255 ; AVX2: # %bb.0: # %vector.ph
256 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
259 ; AVX512-LABEL: test8a:
260 ; AVX512: # %bb.0: # %vector.ph
261 ; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
264 %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32765, i16 32764, i16 32763, i16 32762, i16 32761, i16 32760, i16 32759, i16 32758, i16 32757, i16 32756, i16 32755, i16 32754, i16 32753, i16 32752, i16 32751>
265 %1 = add <16 x i16> %x, <i16 -32767, i16 -32766, i16 -32765, i16 -32764, i16 -32763, i16 -32762, i16 -32761, i16 -32760, i16 -32759, i16 -32758, i16 -32757, i16 -32756, i16 -32755, i16 -32754, i16 -32753, i16 -32752>
266 %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
; 256-bit (<16 x i16>) variable pattern, select(x <u splat(w), 0, x - splat(w)),
; where the scalar argument w is broadcast to every lane. The checks expect one
; splat that is reused by two 128-bit psubusw ops on SSE and AVX1 targets, and
; a ymm broadcast followed by a single 256-bit vpsubusw on AVX2 and AVX512
; targets.
270 define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
272 ; SSE: # %bb.0: # %vector.ph
273 ; SSE-NEXT: movd %edi, %xmm2
274 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
275 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
276 ; SSE-NEXT: psubusw %xmm2, %xmm0
277 ; SSE-NEXT: psubusw %xmm2, %xmm1
281 ; AVX1: # %bb.0: # %vector.ph
282 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
283 ; AVX1-NEXT: vmovd %edi, %xmm2
284 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
285 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
286 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
287 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
288 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
292 ; AVX2: # %bb.0: # %vector.ph
293 ; AVX2-NEXT: vmovd %edi, %xmm1
294 ; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
295 ; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
298 ; AVX512-LABEL: test9:
299 ; AVX512: # %bb.0: # %vector.ph
300 ; AVX512-NEXT: vpbroadcastw %edi, %ymm1
301 ; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
304 %0 = insertelement <16 x i16> undef, i16 %w, i32 0
305 %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
306 %1 = icmp ult <16 x i16> %x, %broadcast15
307 %2 = sub <16 x i16> %x, %broadcast15
308 %res = select <16 x i1> %1, <16 x i16> zeroinitializer, <16 x i16> %2
; 256-bit (<32 x i8>) sign-bit pattern, select(x <s 0, x ^ 0x80, 0), which is
; an unsigned saturating subtract of 128. The checks expect two 128-bit psubusb
; ops sharing one splat constant on SSE and AVX1 targets, and a single 256-bit
; vpsubusb on AVX2 and AVX512 targets.
312 define <32 x i8> @test10(<32 x i8> %x) nounwind {
314 ; SSE: # %bb.0: # %vector.ph
315 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
316 ; SSE-NEXT: psubusb %xmm2, %xmm0
317 ; SSE-NEXT: psubusb %xmm2, %xmm1
320 ; AVX1-LABEL: test10:
321 ; AVX1: # %bb.0: # %vector.ph
322 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
323 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
324 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
325 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
326 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
329 ; AVX2-LABEL: test10:
330 ; AVX2: # %bb.0: # %vector.ph
331 ; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
334 ; AVX512-LABEL: test10:
335 ; AVX512: # %bb.0: # %vector.ph
336 ; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
339 %0 = icmp slt <32 x i8> %x, zeroinitializer
340 %1 = xor <32 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
341 %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
; 256-bit (<32 x i8>) compare-and-add pattern, select(x >u 126, x + (-127), 0),
; which is an unsigned saturating subtract of 127. The checks expect two
; 128-bit psubusb ops sharing one splat constant on SSE and AVX1 targets, and a
; single 256-bit vpsubusb on AVX2 and AVX512 targets.
345 define <32 x i8> @test11(<32 x i8> %x) nounwind {
347 ; SSE: # %bb.0: # %vector.ph
348 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
349 ; SSE-NEXT: psubusb %xmm2, %xmm0
350 ; SSE-NEXT: psubusb %xmm2, %xmm1
353 ; AVX1-LABEL: test11:
354 ; AVX1: # %bb.0: # %vector.ph
355 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
356 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
357 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
358 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
359 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
362 ; AVX2-LABEL: test11:
363 ; AVX2: # %bb.0: # %vector.ph
364 ; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
367 ; AVX512-LABEL: test11:
368 ; AVX512: # %bb.0: # %vector.ph
369 ; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
372 %0 = icmp ugt <32 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
373 %1 = add <32 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
374 %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
; 256-bit (<32 x i8>) compare-and-add pattern with a different constant in
; every lane. Lane i computes select(x >u 126 - i, x + (-127 + i), 0), an
; unsigned saturating subtract of 127 - i. Because the two 128-bit halves need
; different constants, the SSE and AVX1 checks expect two psubusb ops that each
; take their own constant-pool operand; AVX2 and AVX512 targets expect a single
; 256-bit vpsubusb.
378 define <32 x i8> @test11a(<32 x i8> %x) nounwind {
379 ; SSE-LABEL: test11a:
380 ; SSE: # %bb.0: # %vector.ph
381 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
382 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
385 ; AVX1-LABEL: test11a:
386 ; AVX1: # %bb.0: # %vector.ph
387 ; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
388 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
389 ; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
390 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
393 ; AVX2-LABEL: test11a:
394 ; AVX2: # %bb.0: # %vector.ph
395 ; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
398 ; AVX512-LABEL: test11a:
399 ; AVX512: # %bb.0: # %vector.ph
400 ; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
403 %0 = icmp ugt <32 x i8> %x, <i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 117, i8 116, i8 115, i8 114, i8 113, i8 112, i8 111, i8 110, i8 109, i8 108, i8 107, i8 106, i8 105, i8 104, i8 103, i8 102, i8 101, i8 100, i8 99, i8 98, i8 97, i8 96, i8 95>
404 %1 = add <32 x i8> %x, <i8 -127, i8 -126, i8 -125, i8 -124, i8 -123, i8 -122, i8 -121, i8 -120, i8 -119, i8 -118, i8 -117, i8 -116, i8 -115, i8 -114, i8 -113, i8 -112, i8 -111, i8 -110, i8 -109, i8 -108, i8 -107, i8 -106, i8 -105, i8 -104, i8 -103, i8 -102, i8 -101, i8 -100, i8 -99, i8 -98, i8 -97, i8 -96>
405 %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
; 256-bit (<32 x i8>) variable pattern, select(x <u splat(w), 0, x - splat(w)),
; where the scalar argument w is broadcast to every lane. The checks expect one
; byte splat that is reused by two 128-bit psubusb ops on SSE and AVX1 targets,
; and a ymm vpbroadcastb followed by a single 256-bit vpsubusb on AVX2 and
; AVX512 targets.
409 define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
410 ; SSE2-LABEL: test12:
411 ; SSE2: # %bb.0: # %vector.ph
412 ; SSE2-NEXT: movd %edi, %xmm2
413 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
414 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
415 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
416 ; SSE2-NEXT: psubusb %xmm2, %xmm0
417 ; SSE2-NEXT: psubusb %xmm2, %xmm1
420 ; SSSE3-LABEL: test12:
421 ; SSSE3: # %bb.0: # %vector.ph
422 ; SSSE3-NEXT: movd %edi, %xmm2
423 ; SSSE3-NEXT: pxor %xmm3, %xmm3
424 ; SSSE3-NEXT: pshufb %xmm3, %xmm2
425 ; SSSE3-NEXT: psubusb %xmm2, %xmm0
426 ; SSSE3-NEXT: psubusb %xmm2, %xmm1
429 ; SSE41-LABEL: test12:
430 ; SSE41: # %bb.0: # %vector.ph
431 ; SSE41-NEXT: movd %edi, %xmm2
432 ; SSE41-NEXT: pxor %xmm3, %xmm3
433 ; SSE41-NEXT: pshufb %xmm3, %xmm2
434 ; SSE41-NEXT: psubusb %xmm2, %xmm0
435 ; SSE41-NEXT: psubusb %xmm2, %xmm1
438 ; AVX1-LABEL: test12:
439 ; AVX1: # %bb.0: # %vector.ph
440 ; AVX1-NEXT: vmovd %edi, %xmm1
441 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
442 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
443 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
444 ; AVX1-NEXT: vpsubusb %xmm1, %xmm2, %xmm2
445 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
446 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
449 ; AVX2-LABEL: test12:
450 ; AVX2: # %bb.0: # %vector.ph
451 ; AVX2-NEXT: vmovd %edi, %xmm1
452 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
453 ; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
456 ; AVX512-LABEL: test12:
457 ; AVX512: # %bb.0: # %vector.ph
458 ; AVX512-NEXT: vpbroadcastb %edi, %ymm1
459 ; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
462 %0 = insertelement <32 x i8> undef, i8 %w, i32 0
463 %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
464 %1 = icmp ult <32 x i8> %x, %broadcast15
465 %2 = sub <32 x i8> %x, %broadcast15
466 %res = select <32 x i1> %1, <32 x i8> zeroinitializer, <32 x i8> %2
; Mixed-width pattern. x (<8 x i16>) is zero-extended to i32, the i32 vector y
; is subtracted from it, and the difference is truncated back to i16; lanes
; where zext(x) <u y are forced to zero:
;   select(zext(x) <u y, 0, trunc(zext(x) - y))
; The checks expect y to be clamped to 65535 and narrowed to i16 first, then a
; single (v)psubusw. The clamp is pminud + packusdw with SSE4.1, AVX1 and AVX2,
; vpmovusdw with AVX512, and a pcmpgtd-based select sequence on plain SSE2 and
; SSSE3, which have no pminud.
470 define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
471 ; SSE2-LABEL: test13:
472 ; SSE2: # %bb.0: # %vector.ph
473 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
474 ; SSE2-NEXT: movdqa %xmm2, %xmm4
475 ; SSE2-NEXT: pxor %xmm3, %xmm4
476 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
477 ; SSE2-NEXT: movdqa %xmm5, %xmm6
478 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
479 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
480 ; SSE2-NEXT: pand %xmm6, %xmm2
481 ; SSE2-NEXT: pxor %xmm4, %xmm6
482 ; SSE2-NEXT: por %xmm2, %xmm6
483 ; SSE2-NEXT: pslld $16, %xmm6
484 ; SSE2-NEXT: psrad $16, %xmm6
485 ; SSE2-NEXT: pxor %xmm1, %xmm3
486 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
487 ; SSE2-NEXT: pxor %xmm5, %xmm4
488 ; SSE2-NEXT: pand %xmm1, %xmm5
489 ; SSE2-NEXT: por %xmm4, %xmm5
490 ; SSE2-NEXT: pslld $16, %xmm5
491 ; SSE2-NEXT: psrad $16, %xmm5
492 ; SSE2-NEXT: packssdw %xmm6, %xmm5
493 ; SSE2-NEXT: psubusw %xmm5, %xmm0
496 ; SSSE3-LABEL: test13:
497 ; SSSE3: # %bb.0: # %vector.ph
498 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
499 ; SSSE3-NEXT: movdqa %xmm2, %xmm4
500 ; SSSE3-NEXT: pxor %xmm3, %xmm4
501 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
502 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
503 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
504 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
505 ; SSSE3-NEXT: pand %xmm6, %xmm2
506 ; SSSE3-NEXT: pandn %xmm4, %xmm6
507 ; SSSE3-NEXT: por %xmm2, %xmm6
508 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
509 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
510 ; SSSE3-NEXT: pxor %xmm1, %xmm3
511 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
512 ; SSSE3-NEXT: pand %xmm5, %xmm1
513 ; SSSE3-NEXT: pandn %xmm4, %xmm5
514 ; SSSE3-NEXT: por %xmm1, %xmm5
515 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
516 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
517 ; SSSE3-NEXT: psubusw %xmm5, %xmm0
520 ; SSE41-LABEL: test13:
521 ; SSE41: # %bb.0: # %vector.ph
522 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
523 ; SSE41-NEXT: pminud %xmm3, %xmm2
524 ; SSE41-NEXT: pminud %xmm3, %xmm1
525 ; SSE41-NEXT: packusdw %xmm2, %xmm1
526 ; SSE41-NEXT: psubusw %xmm1, %xmm0
529 ; AVX1-LABEL: test13:
530 ; AVX1: # %bb.0: # %vector.ph
531 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
532 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
533 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
534 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
535 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
536 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
537 ; AVX1-NEXT: vzeroupper
540 ; AVX2-LABEL: test13:
541 ; AVX2: # %bb.0: # %vector.ph
542 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
543 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
544 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
545 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
546 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
547 ; AVX2-NEXT: vzeroupper
550 ; AVX512-LABEL: test13:
551 ; AVX512: # %bb.0: # %vector.ph
552 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
553 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
554 ; AVX512-NEXT: vzeroupper
557 %lhs = zext <8 x i16> %x to <8 x i32>
558 %cond = icmp ult <8 x i32> %lhs, %y
559 %sub = sub <8 x i32> %lhs, %y
560 %trunc = trunc <8 x i32> %sub to <8 x i16>
561 %res = select <8 x i1> %cond, <8 x i16> zeroinitializer, <8 x i16> %trunc
565 ; FIXME: match this to UMIN+TRUNC+PSUBUS
; Mixed-width pattern in which the wide i32 vector y is the minuend and the
; zero-extended <16 x i8> x is the subtrahend:
;   select(y <u zext(x), 0, trunc(y - zext(x)))
; None of the check blocks below contains a psubus instruction. Each one
; truncates y to bytes, applies a plain (v)psubb, and zeroes the lanes selected
; by the i32 unsigned compare (pandn of a packed compare mask, or a
; zero-masked vpsubb under a k-register with AVX512).
566 define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
567 ; SSE2OR3-LABEL: test14:
568 ; SSE2OR3: # %bb.0: # %vector.ph
569 ; SSE2OR3-NEXT: pxor %xmm8, %xmm8
570 ; SSE2OR3-NEXT: movdqa %xmm0, %xmm6
571 ; SSE2OR3-NEXT: movdqa %xmm4, %xmm9
572 ; SSE2OR3-NEXT: movdqa %xmm3, %xmm10
573 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm7
574 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
575 ; SSE2OR3-NEXT: pand %xmm5, %xmm4
576 ; SSE2OR3-NEXT: pand %xmm5, %xmm3
577 ; SSE2OR3-NEXT: packuswb %xmm4, %xmm3
578 ; SSE2OR3-NEXT: movdqa %xmm1, %xmm4
579 ; SSE2OR3-NEXT: pand %xmm5, %xmm2
580 ; SSE2OR3-NEXT: pand %xmm5, %xmm1
581 ; SSE2OR3-NEXT: packuswb %xmm2, %xmm1
582 ; SSE2OR3-NEXT: packuswb %xmm3, %xmm1
583 ; SSE2OR3-NEXT: psubb %xmm0, %xmm1
584 ; SSE2OR3-NEXT: movdqa %xmm0, %xmm2
585 ; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
586 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm0
587 ; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
588 ; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
589 ; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
590 ; SSE2OR3-NEXT: movdqa %xmm6, %xmm3
591 ; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
592 ; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
593 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
594 ; SSE2OR3-NEXT: pxor %xmm5, %xmm9
595 ; SSE2OR3-NEXT: por %xmm5, %xmm6
596 ; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm6
597 ; SSE2OR3-NEXT: pxor %xmm5, %xmm10
598 ; SSE2OR3-NEXT: por %xmm5, %xmm3
599 ; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm3
600 ; SSE2OR3-NEXT: packssdw %xmm6, %xmm3
601 ; SSE2OR3-NEXT: pxor %xmm5, %xmm7
602 ; SSE2OR3-NEXT: por %xmm5, %xmm2
603 ; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm2
604 ; SSE2OR3-NEXT: pxor %xmm5, %xmm4
605 ; SSE2OR3-NEXT: por %xmm5, %xmm0
606 ; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm0
607 ; SSE2OR3-NEXT: packssdw %xmm2, %xmm0
608 ; SSE2OR3-NEXT: packsswb %xmm3, %xmm0
609 ; SSE2OR3-NEXT: pandn %xmm1, %xmm0
612 ; SSE41-LABEL: test14:
613 ; SSE41: # %bb.0: # %vector.ph
614 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
615 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
616 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
617 ; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
618 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
619 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
620 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
621 ; SSE41-NEXT: pmaxud %xmm4, %xmm6
622 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
623 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm9
624 ; SSE41-NEXT: pxor %xmm9, %xmm6
625 ; SSE41-NEXT: pmaxud %xmm3, %xmm7
626 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
627 ; SSE41-NEXT: pxor %xmm9, %xmm7
628 ; SSE41-NEXT: packssdw %xmm6, %xmm7
629 ; SSE41-NEXT: pmaxud %xmm1, %xmm5
630 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm5
631 ; SSE41-NEXT: pxor %xmm9, %xmm5
632 ; SSE41-NEXT: pmaxud %xmm2, %xmm8
633 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm8
634 ; SSE41-NEXT: pxor %xmm9, %xmm8
635 ; SSE41-NEXT: packssdw %xmm8, %xmm5
636 ; SSE41-NEXT: packsswb %xmm7, %xmm5
637 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
638 ; SSE41-NEXT: pand %xmm6, %xmm4
639 ; SSE41-NEXT: pand %xmm6, %xmm3
640 ; SSE41-NEXT: packusdw %xmm4, %xmm3
641 ; SSE41-NEXT: pand %xmm6, %xmm2
642 ; SSE41-NEXT: pand %xmm6, %xmm1
643 ; SSE41-NEXT: packusdw %xmm2, %xmm1
644 ; SSE41-NEXT: packuswb %xmm3, %xmm1
645 ; SSE41-NEXT: psubb %xmm0, %xmm1
646 ; SSE41-NEXT: pandn %xmm1, %xmm5
647 ; SSE41-NEXT: movdqa %xmm5, %xmm0
650 ; AVX1-LABEL: test14:
651 ; AVX1: # %bb.0: # %vector.ph
652 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
653 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
654 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
655 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
656 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
657 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
658 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
659 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
660 ; AVX1-NEXT: vpmaxud %xmm6, %xmm7, %xmm6
661 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6
662 ; AVX1-NEXT: vpcmpeqd %xmm7, %xmm7, %xmm7
663 ; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6
664 ; AVX1-NEXT: vpmaxud %xmm5, %xmm2, %xmm5
665 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm5
666 ; AVX1-NEXT: vpxor %xmm7, %xmm5, %xmm5
667 ; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
668 ; AVX1-NEXT: vpmaxud %xmm4, %xmm1, %xmm4
669 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm4
670 ; AVX1-NEXT: vpxor %xmm7, %xmm4, %xmm4
671 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
672 ; AVX1-NEXT: vpmaxud %xmm3, %xmm6, %xmm3
673 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm6, %xmm3
674 ; AVX1-NEXT: vpxor %xmm7, %xmm3, %xmm3
675 ; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
676 ; AVX1-NEXT: vpacksswb %xmm5, %xmm3, %xmm3
677 ; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
678 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
679 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
680 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
681 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
682 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
683 ; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
684 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
685 ; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0
686 ; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
687 ; AVX1-NEXT: vzeroupper
690 ; AVX2-LABEL: test14:
691 ; AVX2: # %bb.0: # %vector.ph
692 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
693 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
694 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
695 ; AVX2-NEXT: vpmaxud %ymm4, %ymm1, %ymm4
696 ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm4
697 ; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
698 ; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4
699 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
700 ; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
701 ; AVX2-NEXT: vpmaxud %ymm3, %ymm2, %ymm3
702 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm3
703 ; AVX2-NEXT: vpxor %ymm5, %ymm3, %ymm3
704 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
705 ; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3
706 ; AVX2-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
707 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
708 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
709 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
710 ; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
711 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
712 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
713 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
714 ; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0
715 ; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
716 ; AVX2-NEXT: vzeroupper
719 ; AVX512-LABEL: test14:
720 ; AVX512: # %bb.0: # %vector.ph
721 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
722 ; AVX512-NEXT: vpcmpnltud %zmm2, %zmm1, %k1
723 ; AVX512-NEXT: vpmovdb %zmm1, %xmm1
724 ; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1} {z}
725 ; AVX512-NEXT: vzeroupper
728 %rhs = zext <16 x i8> %x to <16 x i32>
729 %cond = icmp ult <16 x i32> %y, %rhs
730 %sub = sub <16 x i32> %y, %rhs
731 %truncsub = trunc <16 x i32> %sub to <16 x i8>
732 %res = select <16 x i1> %cond, <16 x i8> zeroinitializer, <16 x i8> %truncsub
; Mixed-width pattern written with the predicate and select arms the other way
; round: the truncated difference is kept where zext(x) >u y and every other
; lane is zero:
;   select(zext(x) >u y, trunc(zext(x) - y), 0)
; The checks expect y to be clamped to 65535 and narrowed to i16 first, then a
; single (v)psubusw. The clamp is pminud + packusdw with SSE4.1, AVX1 and AVX2,
; vpmovusdw with AVX512, and a pcmpgtd-based select sequence on plain SSE2 and
; SSSE3.
736 define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
737 ; SSE2-LABEL: test15:
738 ; SSE2: # %bb.0: # %vector.ph
739 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
740 ; SSE2-NEXT: movdqa %xmm2, %xmm4
741 ; SSE2-NEXT: pxor %xmm3, %xmm4
742 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
743 ; SSE2-NEXT: movdqa %xmm5, %xmm6
744 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
745 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
746 ; SSE2-NEXT: pand %xmm6, %xmm2
747 ; SSE2-NEXT: pxor %xmm4, %xmm6
748 ; SSE2-NEXT: por %xmm2, %xmm6
749 ; SSE2-NEXT: pslld $16, %xmm6
750 ; SSE2-NEXT: psrad $16, %xmm6
751 ; SSE2-NEXT: pxor %xmm1, %xmm3
752 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
753 ; SSE2-NEXT: pxor %xmm5, %xmm4
754 ; SSE2-NEXT: pand %xmm1, %xmm5
755 ; SSE2-NEXT: por %xmm4, %xmm5
756 ; SSE2-NEXT: pslld $16, %xmm5
757 ; SSE2-NEXT: psrad $16, %xmm5
758 ; SSE2-NEXT: packssdw %xmm6, %xmm5
759 ; SSE2-NEXT: psubusw %xmm5, %xmm0
762 ; SSSE3-LABEL: test15:
763 ; SSSE3: # %bb.0: # %vector.ph
764 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
765 ; SSSE3-NEXT: movdqa %xmm2, %xmm4
766 ; SSSE3-NEXT: pxor %xmm3, %xmm4
767 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
768 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
769 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
770 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
771 ; SSSE3-NEXT: pand %xmm6, %xmm2
772 ; SSSE3-NEXT: pandn %xmm4, %xmm6
773 ; SSSE3-NEXT: por %xmm2, %xmm6
774 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
775 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
776 ; SSSE3-NEXT: pxor %xmm1, %xmm3
777 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
778 ; SSSE3-NEXT: pand %xmm5, %xmm1
779 ; SSSE3-NEXT: pandn %xmm4, %xmm5
780 ; SSSE3-NEXT: por %xmm1, %xmm5
781 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
782 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
783 ; SSSE3-NEXT: psubusw %xmm5, %xmm0
786 ; SSE41-LABEL: test15:
787 ; SSE41: # %bb.0: # %vector.ph
788 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
789 ; SSE41-NEXT: pminud %xmm3, %xmm2
790 ; SSE41-NEXT: pminud %xmm3, %xmm1
791 ; SSE41-NEXT: packusdw %xmm2, %xmm1
792 ; SSE41-NEXT: psubusw %xmm1, %xmm0
795 ; AVX1-LABEL: test15:
796 ; AVX1: # %bb.0: # %vector.ph
797 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
798 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
799 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
800 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
801 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
802 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
803 ; AVX1-NEXT: vzeroupper
806 ; AVX2-LABEL: test15:
807 ; AVX2: # %bb.0: # %vector.ph
808 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
809 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
810 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
811 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
812 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
813 ; AVX2-NEXT: vzeroupper
816 ; AVX512-LABEL: test15:
817 ; AVX512: # %bb.0: # %vector.ph
818 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
819 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
820 ; AVX512-NEXT: vzeroupper
823 %lhs = zext <8 x i16> %x to <8 x i32>
824 %cond = icmp ugt <8 x i32> %lhs, %y
825 %sub = sub <8 x i32> %lhs, %y
826 %truncsub = trunc <8 x i32> %sub to <8 x i16>
827 %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
; test16 - same computation as test15, except the compare is spelled with its
; operands commuted (icmp ult %y, %lhs instead of icmp ugt %lhs, %y):
;   lhs = zext(x) to i32
;   res = (y <u lhs) ? trunc(lhs - y) : 0
; The assertions below record the current, non-saturating lowering: a wrapping
; (v)psubw of the truncated %y whose result is then masked by the widened
; compare (pand, or a zero-masked vpsubw under a k-register). None of the run
; lines produce psubus here, which is what the FIXME below is about.
831 ; FIXME: match this to UMIN+TRUNC+PSUBUS
832 define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
833 ; SSE2-LABEL: test16:
834 ; SSE2: # %bb.0: # %vector.ph
835 ; SSE2-NEXT: pxor %xmm3, %xmm3
836 ; SSE2-NEXT: movdqa %xmm0, %xmm4
837 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
838 ; SSE2-NEXT: movdqa %xmm0, %xmm5
839 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
840 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
841 ; SSE2-NEXT: movdqa %xmm2, %xmm6
842 ; SSE2-NEXT: pxor %xmm3, %xmm6
843 ; SSE2-NEXT: por %xmm3, %xmm5
844 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
845 ; SSE2-NEXT: movdqa %xmm1, %xmm6
846 ; SSE2-NEXT: pxor %xmm3, %xmm6
847 ; SSE2-NEXT: por %xmm3, %xmm4
848 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
849 ; SSE2-NEXT: packssdw %xmm5, %xmm4
850 ; SSE2-NEXT: pslld $16, %xmm2
851 ; SSE2-NEXT: psrad $16, %xmm2
852 ; SSE2-NEXT: pslld $16, %xmm1
853 ; SSE2-NEXT: psrad $16, %xmm1
854 ; SSE2-NEXT: packssdw %xmm2, %xmm1
855 ; SSE2-NEXT: psubw %xmm1, %xmm0
856 ; SSE2-NEXT: pand %xmm4, %xmm0
859 ; SSSE3-LABEL: test16:
860 ; SSSE3: # %bb.0: # %vector.ph
861 ; SSSE3-NEXT: pxor %xmm3, %xmm3
862 ; SSSE3-NEXT: movdqa %xmm0, %xmm4
863 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
864 ; SSSE3-NEXT: movdqa %xmm0, %xmm5
865 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
866 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
867 ; SSSE3-NEXT: movdqa %xmm2, %xmm6
868 ; SSSE3-NEXT: pxor %xmm3, %xmm6
869 ; SSSE3-NEXT: por %xmm3, %xmm5
870 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
871 ; SSSE3-NEXT: movdqa %xmm1, %xmm6
872 ; SSSE3-NEXT: pxor %xmm3, %xmm6
873 ; SSSE3-NEXT: por %xmm3, %xmm4
874 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
875 ; SSSE3-NEXT: packssdw %xmm5, %xmm4
876 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
877 ; SSSE3-NEXT: pshufb %xmm3, %xmm2
878 ; SSSE3-NEXT: pshufb %xmm3, %xmm1
879 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
880 ; SSSE3-NEXT: psubw %xmm1, %xmm0
881 ; SSSE3-NEXT: pand %xmm4, %xmm0
884 ; SSE41-LABEL: test16:
885 ; SSE41: # %bb.0: # %vector.ph
886 ; SSE41-NEXT: pxor %xmm3, %xmm3
887 ; SSE41-NEXT: movdqa %xmm0, %xmm4
888 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
889 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
890 ; SSE41-NEXT: pmaxud %xmm2, %xmm4
891 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm4
892 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
893 ; SSE41-NEXT: pxor %xmm6, %xmm4
894 ; SSE41-NEXT: pmaxud %xmm1, %xmm5
895 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm5
896 ; SSE41-NEXT: pxor %xmm6, %xmm5
897 ; SSE41-NEXT: packssdw %xmm4, %xmm5
898 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
899 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
900 ; SSE41-NEXT: packusdw %xmm2, %xmm1
901 ; SSE41-NEXT: psubw %xmm1, %xmm0
902 ; SSE41-NEXT: pand %xmm5, %xmm0
905 ; AVX1-LABEL: test16:
906 ; AVX1: # %bb.0: # %vector.ph
907 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
908 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
909 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
910 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
911 ; AVX1-NEXT: vpmaxud %xmm2, %xmm4, %xmm2
912 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm4, %xmm2
913 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
914 ; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
915 ; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm3
916 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3
917 ; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
918 ; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2
919 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
920 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
921 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
922 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
923 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
924 ; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
925 ; AVX1-NEXT: vzeroupper
928 ; AVX2-LABEL: test16:
929 ; AVX2: # %bb.0: # %vector.ph
930 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
931 ; AVX2-NEXT: vpmaxud %ymm2, %ymm1, %ymm2
932 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm2
933 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
934 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
935 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
936 ; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
937 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
938 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
939 ; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
940 ; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
941 ; AVX2-NEXT: vzeroupper
944 ; AVX512-LABEL: test16:
945 ; AVX512: # %bb.0: # %vector.ph
946 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
947 ; AVX512-NEXT: vpcmpltud %ymm2, %ymm1, %k1
948 ; AVX512-NEXT: vpmovdw %ymm1, %xmm1
949 ; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z}
950 ; AVX512-NEXT: vzeroupper
953 %lhs = zext <8 x i16> %x to <8 x i32>
954 %cond = icmp ult <8 x i32> %y, %lhs
955 %sub = sub <8 x i32> %lhs, %y
956 %truncsub = trunc <8 x i32> %sub to <8 x i16>
957 %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
; test17 - saturating subtract of a splatted scalar from a 512-bit byte vector:
;   b   = splat(%w) across all 64 lanes (insertelement + zero-mask shuffle)
;   res = (x <u b) ? 0 : x - b
; The assertions below record that every run line broadcasts %w and then uses
; (v)psubusb, split into as many 128/256/512-bit pieces as the target's widest
; legal vector requires.
961 define <64 x i8> @test17(<64 x i8> %x, i8 zeroext %w) nounwind {
962 ; SSE2-LABEL: test17:
963 ; SSE2: # %bb.0: # %vector.ph
964 ; SSE2-NEXT: movd %edi, %xmm4
965 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
966 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
967 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
968 ; SSE2-NEXT: psubusb %xmm4, %xmm0
969 ; SSE2-NEXT: psubusb %xmm4, %xmm1
970 ; SSE2-NEXT: psubusb %xmm4, %xmm2
971 ; SSE2-NEXT: psubusb %xmm4, %xmm3
974 ; SSSE3-LABEL: test17:
975 ; SSSE3: # %bb.0: # %vector.ph
976 ; SSSE3-NEXT: movd %edi, %xmm4
977 ; SSSE3-NEXT: pxor %xmm5, %xmm5
978 ; SSSE3-NEXT: pshufb %xmm5, %xmm4
979 ; SSSE3-NEXT: psubusb %xmm4, %xmm0
980 ; SSSE3-NEXT: psubusb %xmm4, %xmm1
981 ; SSSE3-NEXT: psubusb %xmm4, %xmm2
982 ; SSSE3-NEXT: psubusb %xmm4, %xmm3
985 ; SSE41-LABEL: test17:
986 ; SSE41: # %bb.0: # %vector.ph
987 ; SSE41-NEXT: movd %edi, %xmm4
988 ; SSE41-NEXT: pxor %xmm5, %xmm5
989 ; SSE41-NEXT: pshufb %xmm5, %xmm4
990 ; SSE41-NEXT: psubusb %xmm4, %xmm0
991 ; SSE41-NEXT: psubusb %xmm4, %xmm1
992 ; SSE41-NEXT: psubusb %xmm4, %xmm2
993 ; SSE41-NEXT: psubusb %xmm4, %xmm3
996 ; AVX1-LABEL: test17:
997 ; AVX1: # %bb.0: # %vector.ph
998 ; AVX1-NEXT: vmovd %edi, %xmm2
999 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1000 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1001 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1002 ; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm3
1003 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
1004 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
1005 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1006 ; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm3
1007 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
1008 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1011 ; AVX2-LABEL: test17:
1012 ; AVX2: # %bb.0: # %vector.ph
1013 ; AVX2-NEXT: vmovd %edi, %xmm2
1014 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
1015 ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
1016 ; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1
1019 ; AVX512-LABEL: test17:
1020 ; AVX512: # %bb.0: # %vector.ph
1021 ; AVX512-NEXT: vpbroadcastb %edi, %zmm1
1022 ; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0
1025 %0 = insertelement <64 x i8> undef, i8 %w, i32 0
1026 %broadcast15 = shufflevector <64 x i8> %0, <64 x i8> undef, <64 x i32> zeroinitializer
1027 %1 = icmp ult <64 x i8> %x, %broadcast15
1028 %2 = sub <64 x i8> %x, %broadcast15
1029 %res = select <64 x i1> %1, <64 x i8> zeroinitializer, <64 x i8> %2
; test18 - 16-bit counterpart of test17: saturating subtract of a splatted
; scalar from a 512-bit word vector:
;   b   = splat(%w) across all 32 lanes (insertelement + zero-mask shuffle)
;   res = (x <u b) ? 0 : x - b
; The assertions below record that every run line broadcasts %w and then uses
; (v)psubusw on each legal-width piece of %x.
1033 define <32 x i16> @test18(<32 x i16> %x, i16 zeroext %w) nounwind {
1034 ; SSE-LABEL: test18:
1035 ; SSE: # %bb.0: # %vector.ph
1036 ; SSE-NEXT: movd %edi, %xmm4
1037 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
1038 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
1039 ; SSE-NEXT: psubusw %xmm4, %xmm0
1040 ; SSE-NEXT: psubusw %xmm4, %xmm1
1041 ; SSE-NEXT: psubusw %xmm4, %xmm2
1042 ; SSE-NEXT: psubusw %xmm4, %xmm3
1045 ; AVX1-LABEL: test18:
1046 ; AVX1: # %bb.0: # %vector.ph
1047 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1048 ; AVX1-NEXT: vmovd %edi, %xmm3
1049 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
1050 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
1051 ; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2
1052 ; AVX1-NEXT: vpsubusw %xmm3, %xmm0, %xmm0
1053 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1054 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1055 ; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2
1056 ; AVX1-NEXT: vpsubusw %xmm3, %xmm1, %xmm1
1057 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1060 ; AVX2-LABEL: test18:
1061 ; AVX2: # %bb.0: # %vector.ph
1062 ; AVX2-NEXT: vmovd %edi, %xmm2
1063 ; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
1064 ; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
1065 ; AVX2-NEXT: vpsubusw %ymm2, %ymm1, %ymm1
1068 ; AVX512-LABEL: test18:
1069 ; AVX512: # %bb.0: # %vector.ph
1070 ; AVX512-NEXT: vpbroadcastw %edi, %zmm1
1071 ; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
1074 %0 = insertelement <32 x i16> undef, i16 %w, i32 0
1075 %broadcast15 = shufflevector <32 x i16> %0, <32 x i16> undef, <32 x i32> zeroinitializer
1076 %1 = icmp ult <32 x i16> %x, %broadcast15
1077 %2 = sub <32 x i16> %x, %broadcast15
1078 %res = select <32 x i1> %1, <32 x i16> zeroinitializer, <32 x i16> %2
; psubus_8i16_max - saturating subtract expressed through an unsigned max:
;   max = (x <u y) ? y : x
;   res = max - y
; The assertions below record that this lowers to (v)psubusw of %y (xmm1)
; from %x (xmm0).
1082 define <8 x i16> @psubus_8i16_max(<8 x i16> %x, <8 x i16> %y) nounwind {
1083 ; SSE-LABEL: psubus_8i16_max:
1084 ; SSE: # %bb.0: # %vector.ph
1085 ; SSE-NEXT: psubusw %xmm1, %xmm0
1088 ; AVX-LABEL: psubus_8i16_max:
1089 ; AVX: # %bb.0: # %vector.ph
1090 ; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1093 %cmp = icmp ult <8 x i16> %x, %y
1094 %max = select <8 x i1> %cmp, <8 x i16> %y, <8 x i16> %x
1095 %res = sub <8 x i16> %max, %y
; psubus_16i8_max - byte-lane variant of the unsigned-max pattern:
;   max = (x <u y) ? y : x
;   res = max - y
; The assertions below record that this lowers to (v)psubusb of %y (xmm1)
; from %x (xmm0).
1099 define <16 x i8> @psubus_16i8_max(<16 x i8> %x, <16 x i8> %y) nounwind {
1100 ; SSE-LABEL: psubus_16i8_max:
1101 ; SSE: # %bb.0: # %vector.ph
1102 ; SSE-NEXT: psubusb %xmm1, %xmm0
1105 ; AVX-LABEL: psubus_16i8_max:
1106 ; AVX: # %bb.0: # %vector.ph
1107 ; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
1110 %cmp = icmp ult <16 x i8> %x, %y
1111 %max = select <16 x i1> %cmp, <16 x i8> %y, <16 x i8> %x
1112 %res = sub <16 x i8> %max, %y
; psubus_16i16_max - 256-bit word variant of the unsigned-max pattern:
;   max = (x <u y) ? y : x
;   res = max - y
; The assertions below record (v)psubusw on every run line: two 128-bit
; subtracts where only xmm integer ops are used, one ymm subtract otherwise.
1116 define <16 x i16> @psubus_16i16_max(<16 x i16> %x, <16 x i16> %y) nounwind {
1117 ; SSE-LABEL: psubus_16i16_max:
1118 ; SSE: # %bb.0: # %vector.ph
1119 ; SSE-NEXT: psubusw %xmm2, %xmm0
1120 ; SSE-NEXT: psubusw %xmm3, %xmm1
1123 ; AVX1-LABEL: psubus_16i16_max:
1124 ; AVX1: # %bb.0: # %vector.ph
1125 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1126 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1127 ; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2
1128 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1129 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1132 ; AVX2-LABEL: psubus_16i16_max:
1133 ; AVX2: # %bb.0: # %vector.ph
1134 ; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
1137 ; AVX512-LABEL: psubus_16i16_max:
1138 ; AVX512: # %bb.0: # %vector.ph
1139 ; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
1142 %cmp = icmp ult <16 x i16> %x, %y
1143 %max = select <16 x i1> %cmp, <16 x i16> %y, <16 x i16> %x
1144 %res = sub <16 x i16> %max, %y
; psubus_32i16_max - 512-bit word variant of the unsigned-max pattern:
;   max = (x <u y) ? y : x
;   res = max - y
; The assertions below record (v)psubusw on every run line: four xmm
; subtracts, two ymm subtracts, or a single zmm subtract depending on the
; widest vector the target provides.
1148 define <32 x i16> @psubus_32i16_max(<32 x i16> %x, <32 x i16> %y) nounwind {
1149 ; SSE-LABEL: psubus_32i16_max:
1150 ; SSE: # %bb.0: # %vector.ph
1151 ; SSE-NEXT: psubusw %xmm4, %xmm0
1152 ; SSE-NEXT: psubusw %xmm5, %xmm1
1153 ; SSE-NEXT: psubusw %xmm6, %xmm2
1154 ; SSE-NEXT: psubusw %xmm7, %xmm3
1157 ; AVX1-LABEL: psubus_32i16_max:
1158 ; AVX1: # %bb.0: # %vector.ph
1159 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
1160 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
1161 ; AVX1-NEXT: vpsubusw %xmm4, %xmm5, %xmm4
1162 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
1163 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1164 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
1165 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1166 ; AVX1-NEXT: vpsubusw %xmm2, %xmm4, %xmm2
1167 ; AVX1-NEXT: vpsubusw %xmm3, %xmm1, %xmm1
1168 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1171 ; AVX2-LABEL: psubus_32i16_max:
1172 ; AVX2: # %bb.0: # %vector.ph
1173 ; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
1174 ; AVX2-NEXT: vpsubusw %ymm3, %ymm1, %ymm1
1177 ; AVX512-LABEL: psubus_32i16_max:
1178 ; AVX512: # %bb.0: # %vector.ph
1179 ; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
1182 %cmp = icmp ult <32 x i16> %x, %y
1183 %max = select <32 x i1> %cmp, <32 x i16> %y, <32 x i16> %x
1184 %res = sub <32 x i16> %max, %y
; psubus_64i8_max - 512-bit byte variant of the unsigned-max pattern:
;   max = (x <u y) ? y : x
;   res = max - y
; The assertions below record (v)psubusb on every run line: four xmm
; subtracts, two ymm subtracts, or a single zmm subtract depending on the
; widest vector the target provides.
1188 define <64 x i8> @psubus_64i8_max(<64 x i8> %x, <64 x i8> %y) nounwind {
1189 ; SSE-LABEL: psubus_64i8_max:
1190 ; SSE: # %bb.0: # %vector.ph
1191 ; SSE-NEXT: psubusb %xmm4, %xmm0
1192 ; SSE-NEXT: psubusb %xmm5, %xmm1
1193 ; SSE-NEXT: psubusb %xmm6, %xmm2
1194 ; SSE-NEXT: psubusb %xmm7, %xmm3
1197 ; AVX1-LABEL: psubus_64i8_max:
1198 ; AVX1: # %bb.0: # %vector.ph
1199 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
1200 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
1201 ; AVX1-NEXT: vpsubusb %xmm4, %xmm5, %xmm4
1202 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
1203 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1204 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
1205 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1206 ; AVX1-NEXT: vpsubusb %xmm2, %xmm4, %xmm2
1207 ; AVX1-NEXT: vpsubusb %xmm3, %xmm1, %xmm1
1208 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1211 ; AVX2-LABEL: psubus_64i8_max:
1212 ; AVX2: # %bb.0: # %vector.ph
1213 ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
1214 ; AVX2-NEXT: vpsubusb %ymm3, %ymm1, %ymm1
1217 ; AVX512-LABEL: psubus_64i8_max:
1218 ; AVX512: # %bb.0: # %vector.ph
1219 ; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0
1222 %cmp = icmp ult <64 x i8> %x, %y
1223 %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
1224 %res = sub <64 x i8> %max, %y
; psubus_32i8_max - 256-bit byte variant of the unsigned-max pattern:
;   max = (x <u y) ? y : x
;   res = max - y
; The assertions below record (v)psubusb on every run line: two 128-bit
; subtracts where only xmm integer ops are used, one ymm subtract otherwise.
1228 define <32 x i8> @psubus_32i8_max(<32 x i8> %x, <32 x i8> %y) nounwind {
1229 ; SSE-LABEL: psubus_32i8_max:
1230 ; SSE: # %bb.0: # %vector.ph
1231 ; SSE-NEXT: psubusb %xmm2, %xmm0
1232 ; SSE-NEXT: psubusb %xmm3, %xmm1
1235 ; AVX1-LABEL: psubus_32i8_max:
1236 ; AVX1: # %bb.0: # %vector.ph
1237 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1238 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1239 ; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm2
1240 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
1241 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1244 ; AVX2-LABEL: psubus_32i8_max:
1245 ; AVX2: # %bb.0: # %vector.ph
1246 ; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
1249 ; AVX512-LABEL: psubus_32i8_max:
1250 ; AVX512: # %bb.0: # %vector.ph
1251 ; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
1254 %cmp = icmp ult <32 x i8> %x, %y
1255 %max = select <32 x i1> %cmp, <32 x i8> %y, <32 x i8> %x
1256 %res = sub <32 x i8> %max, %y
; psubus_8i32_max - unsigned-max pattern performed in a wider type and then
; truncated back:
;   lhs = zext(x) to i32
;   max = (lhs <u y) ? y : lhs
;   res = trunc(max - y) to i16
; The assertions below record that every run line narrows %y to 16-bit lanes
; with unsigned saturation (pminud against 65535 + packusdw, vpmovusdw, or a
; compare-and-blend sequence on targets without pminud) and then uses
; (v)psubusw on %x.
1260 define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
1261 ; SSE2-LABEL: psubus_8i32_max:
1262 ; SSE2: # %bb.0: # %vector.ph
1263 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
1264 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1265 ; SSE2-NEXT: pxor %xmm3, %xmm4
1266 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
1267 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1268 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
1269 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
1270 ; SSE2-NEXT: pand %xmm6, %xmm2
1271 ; SSE2-NEXT: pxor %xmm4, %xmm6
1272 ; SSE2-NEXT: por %xmm2, %xmm6
1273 ; SSE2-NEXT: pslld $16, %xmm6
1274 ; SSE2-NEXT: psrad $16, %xmm6
1275 ; SSE2-NEXT: pxor %xmm1, %xmm3
1276 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
1277 ; SSE2-NEXT: pxor %xmm5, %xmm4
1278 ; SSE2-NEXT: pand %xmm1, %xmm5
1279 ; SSE2-NEXT: por %xmm4, %xmm5
1280 ; SSE2-NEXT: pslld $16, %xmm5
1281 ; SSE2-NEXT: psrad $16, %xmm5
1282 ; SSE2-NEXT: packssdw %xmm6, %xmm5
1283 ; SSE2-NEXT: psubusw %xmm5, %xmm0
1286 ; SSSE3-LABEL: psubus_8i32_max:
1287 ; SSSE3: # %bb.0: # %vector.ph
1288 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
1289 ; SSSE3-NEXT: movdqa %xmm2, %xmm4
1290 ; SSSE3-NEXT: pxor %xmm3, %xmm4
1291 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
1292 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
1293 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
1294 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
1295 ; SSSE3-NEXT: pand %xmm6, %xmm2
1296 ; SSSE3-NEXT: pandn %xmm4, %xmm6
1297 ; SSSE3-NEXT: por %xmm2, %xmm6
1298 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1299 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
1300 ; SSSE3-NEXT: pxor %xmm1, %xmm3
1301 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
1302 ; SSSE3-NEXT: pand %xmm5, %xmm1
1303 ; SSSE3-NEXT: pandn %xmm4, %xmm5
1304 ; SSSE3-NEXT: por %xmm1, %xmm5
1305 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
1306 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
1307 ; SSSE3-NEXT: psubusw %xmm5, %xmm0
1310 ; SSE41-LABEL: psubus_8i32_max:
1311 ; SSE41: # %bb.0: # %vector.ph
1312 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
1313 ; SSE41-NEXT: pminud %xmm3, %xmm2
1314 ; SSE41-NEXT: pminud %xmm3, %xmm1
1315 ; SSE41-NEXT: packusdw %xmm2, %xmm1
1316 ; SSE41-NEXT: psubusw %xmm1, %xmm0
1319 ; AVX1-LABEL: psubus_8i32_max:
1320 ; AVX1: # %bb.0: # %vector.ph
1321 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1322 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
1323 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
1324 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
1325 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1326 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1327 ; AVX1-NEXT: vzeroupper
1330 ; AVX2-LABEL: psubus_8i32_max:
1331 ; AVX2: # %bb.0: # %vector.ph
1332 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
1333 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
1334 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1335 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1336 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1337 ; AVX2-NEXT: vzeroupper
1340 ; AVX512-LABEL: psubus_8i32_max:
1341 ; AVX512: # %bb.0: # %vector.ph
1342 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
1343 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1344 ; AVX512-NEXT: vzeroupper
1347 %lhs = zext <8 x i16> %x to <8 x i32>
1348 %cond = icmp ult <8 x i32> %lhs, %y
1349 %max = select <8 x i1> %cond, <8 x i32> %y, <8 x i32> %lhs
1350 %sub = sub <8 x i32> %max, %y
1351 %res = trunc <8 x i32> %sub to <8 x i16>
; psubus_8i64_max - unsigned-max pattern performed in i64 and truncated back
; to i16:
;   lhs = zext(x) to i64
;   max = (lhs <u y) ? y : lhs
;   res = trunc(max - y) to i16
; The assertions below record that every run line ends in (v)psubusw on %x.
; Before that, %y is clamped to 65535 and narrowed to 16-bit lanes: a single
; vpmovusqw with AVX-512, otherwise a 64-bit unsigned compare (emulated with
; 32-bit compares on the SSE run lines) feeding a blend against a 65535 splat,
; followed by packs/shuffles.
1355 define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
1356 ; SSE2OR3-LABEL: psubus_8i64_max:
1357 ; SSE2OR3: # %bb.0: # %vector.ph
1358 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
1359 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm7
1360 ; SSE2OR3-NEXT: pxor %xmm5, %xmm7
1361 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991]
1362 ; SSE2OR3-NEXT: movdqa %xmm8, %xmm6
1363 ; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6
1364 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
1365 ; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm7
1366 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
1367 ; SSE2OR3-NEXT: pand %xmm9, %xmm7
1368 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1369 ; SSE2OR3-NEXT: por %xmm7, %xmm6
1370 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535]
1371 ; SSE2OR3-NEXT: pand %xmm6, %xmm2
1372 ; SSE2OR3-NEXT: pandn %xmm9, %xmm6
1373 ; SSE2OR3-NEXT: por %xmm2, %xmm6
1374 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
1375 ; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7]
1376 ; SSE2OR3-NEXT: movdqa %xmm1, %xmm6
1377 ; SSE2OR3-NEXT: pxor %xmm5, %xmm6
1378 ; SSE2OR3-NEXT: movdqa %xmm8, %xmm7
1379 ; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm7
1380 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
1381 ; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm6
1382 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1383 ; SSE2OR3-NEXT: pand %xmm2, %xmm6
1384 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
1385 ; SSE2OR3-NEXT: por %xmm6, %xmm2
1386 ; SSE2OR3-NEXT: pand %xmm2, %xmm1
1387 ; SSE2OR3-NEXT: pandn %xmm9, %xmm2
1388 ; SSE2OR3-NEXT: por %xmm1, %xmm2
1389 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1390 ; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1391 ; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
1392 ; SSE2OR3-NEXT: movdqa %xmm4, %xmm2
1393 ; SSE2OR3-NEXT: pxor %xmm5, %xmm2
1394 ; SSE2OR3-NEXT: movdqa %xmm8, %xmm6
1395 ; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm6
1396 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
1397 ; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm2
1398 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1399 ; SSE2OR3-NEXT: pand %xmm7, %xmm2
1400 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1401 ; SSE2OR3-NEXT: por %xmm2, %xmm6
1402 ; SSE2OR3-NEXT: pand %xmm6, %xmm4
1403 ; SSE2OR3-NEXT: pandn %xmm9, %xmm6
1404 ; SSE2OR3-NEXT: por %xmm4, %xmm6
1405 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
1406 ; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
1407 ; SSE2OR3-NEXT: pxor %xmm3, %xmm5
1408 ; SSE2OR3-NEXT: movdqa %xmm8, %xmm4
1409 ; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm4
1410 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
1411 ; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm5
1412 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1413 ; SSE2OR3-NEXT: pand %xmm6, %xmm5
1414 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1415 ; SSE2OR3-NEXT: por %xmm5, %xmm4
1416 ; SSE2OR3-NEXT: pand %xmm4, %xmm3
1417 ; SSE2OR3-NEXT: pandn %xmm9, %xmm4
1418 ; SSE2OR3-NEXT: por %xmm3, %xmm4
1419 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
1420 ; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
1421 ; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
1422 ; SSE2OR3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
1423 ; SSE2OR3-NEXT: psubusw %xmm3, %xmm0
1424 ; SSE2OR3-NEXT: retq
1426 ; SSE41-LABEL: psubus_8i64_max:
1427 ; SSE41: # %bb.0: # %vector.ph
1428 ; SSE41-NEXT: movdqa %xmm0, %xmm8
1429 ; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
1430 ; SSE41-NEXT: movdqa %xmm4, %xmm0
1431 ; SSE41-NEXT: pxor %xmm9, %xmm0
1432 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991]
1433 ; SSE41-NEXT: movdqa %xmm5, %xmm7
1434 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
1435 ; SSE41-NEXT: movdqa %xmm5, %xmm6
1436 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
1437 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
1438 ; SSE41-NEXT: pand %xmm7, %xmm0
1439 ; SSE41-NEXT: por %xmm6, %xmm0
1440 ; SSE41-NEXT: movapd {{.*#+}} xmm6 = [65535,65535]
1441 ; SSE41-NEXT: movapd %xmm6, %xmm10
1442 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm10
1443 ; SSE41-NEXT: movdqa %xmm3, %xmm0
1444 ; SSE41-NEXT: pxor %xmm9, %xmm0
1445 ; SSE41-NEXT: movdqa %xmm5, %xmm4
1446 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
1447 ; SSE41-NEXT: movdqa %xmm5, %xmm7
1448 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
1449 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
1450 ; SSE41-NEXT: pand %xmm4, %xmm0
1451 ; SSE41-NEXT: por %xmm7, %xmm0
1452 ; SSE41-NEXT: movapd %xmm6, %xmm4
1453 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
1454 ; SSE41-NEXT: packusdw %xmm10, %xmm4
1455 ; SSE41-NEXT: movdqa %xmm2, %xmm0
1456 ; SSE41-NEXT: pxor %xmm9, %xmm0
1457 ; SSE41-NEXT: movdqa %xmm5, %xmm3
1458 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
1459 ; SSE41-NEXT: movdqa %xmm5, %xmm7
1460 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
1461 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
1462 ; SSE41-NEXT: pand %xmm3, %xmm0
1463 ; SSE41-NEXT: por %xmm7, %xmm0
1464 ; SSE41-NEXT: movapd %xmm6, %xmm3
1465 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
1466 ; SSE41-NEXT: pxor %xmm1, %xmm9
1467 ; SSE41-NEXT: movdqa %xmm5, %xmm2
1468 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm2
1469 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm5
1470 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
1471 ; SSE41-NEXT: pand %xmm2, %xmm0
1472 ; SSE41-NEXT: por %xmm5, %xmm0
1473 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
1474 ; SSE41-NEXT: packusdw %xmm3, %xmm6
1475 ; SSE41-NEXT: packusdw %xmm4, %xmm6
1476 ; SSE41-NEXT: psubusw %xmm6, %xmm8
1477 ; SSE41-NEXT: movdqa %xmm8, %xmm0
1480 ; AVX1-LABEL: psubus_8i64_max:
1481 ; AVX1: # %bb.0: # %vector.ph
1482 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1483 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
1484 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
1485 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
1486 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
1487 ; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535]
1488 ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
1489 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
1490 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
1491 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2
1492 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1493 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1494 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
1495 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
1496 ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
1497 ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4
1498 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
1499 ; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm7, %xmm1
1500 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1501 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1502 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1503 ; AVX1-NEXT: vzeroupper
1506 ; AVX2-LABEL: psubus_8i64_max:
1507 ; AVX2: # %bb.0: # %vector.ph
1508 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
1509 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm4
1510 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343]
1511 ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
1512 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [65535,65535,65535,65535]
1513 ; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
1514 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3
1515 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
1516 ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
1517 ; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
1518 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1519 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1520 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
1521 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1522 ; AVX2-NEXT: vzeroupper
1525 ; AVX512-LABEL: psubus_8i64_max:
1526 ; AVX512: # %bb.0: # %vector.ph
1527 ; AVX512-NEXT: vpmovusqw %zmm1, %xmm1
1528 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1529 ; AVX512-NEXT: vzeroupper
1532 %lhs = zext <8 x i16> %x to <8 x i64>
1533 %cond = icmp ult <8 x i64> %lhs, %y
1534 %max = select <8 x i1> %cond, <8 x i64> %y, <8 x i64> %lhs
1535 %sub = sub <8 x i64> %max, %y
1536 %res = trunc <8 x i64> %sub to <8 x i16>
; psubus_16i32_max - 16-lane version of the widened unsigned-max pattern:
;   lhs = zext(x) to i32
;   max = (lhs <u y) ? y : lhs
;   res = trunc(max - y) to i16
; The assertions below record that every run line narrows %y to 16-bit lanes
; with unsigned saturation (pminud against 65535 + packusdw, vpmovusdw, or a
; compare-and-blend sequence on targets without pminud) and then uses
; (v)psubusw on each legal-width piece of %x.
1540 define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
1541 ; SSE2OR3-LABEL: psubus_16i32_max:
1542 ; SSE2OR3: # %bb.0: # %vector.ph
1543 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
1544 ; SSE2OR3-NEXT: movdqa %xmm3, %xmm8
1545 ; SSE2OR3-NEXT: pxor %xmm9, %xmm8
1546 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183]
1547 ; SSE2OR3-NEXT: movdqa %xmm10, %xmm6
1548 ; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm6
1549 ; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8
1550 ; SSE2OR3-NEXT: pand %xmm6, %xmm3
1551 ; SSE2OR3-NEXT: pxor %xmm8, %xmm6
1552 ; SSE2OR3-NEXT: por %xmm3, %xmm6
1553 ; SSE2OR3-NEXT: pslld $16, %xmm6
1554 ; SSE2OR3-NEXT: psrad $16, %xmm6
1555 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm3
1556 ; SSE2OR3-NEXT: pxor %xmm9, %xmm3
1557 ; SSE2OR3-NEXT: movdqa %xmm10, %xmm7
1558 ; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm7
1559 ; SSE2OR3-NEXT: pand %xmm7, %xmm2
1560 ; SSE2OR3-NEXT: pxor %xmm8, %xmm7
1561 ; SSE2OR3-NEXT: por %xmm2, %xmm7
1562 ; SSE2OR3-NEXT: pslld $16, %xmm7
1563 ; SSE2OR3-NEXT: psrad $16, %xmm7
1564 ; SSE2OR3-NEXT: packssdw %xmm6, %xmm7
1565 ; SSE2OR3-NEXT: psubusw %xmm7, %xmm0
1566 ; SSE2OR3-NEXT: movdqa %xmm5, %xmm2
1567 ; SSE2OR3-NEXT: pxor %xmm9, %xmm2
1568 ; SSE2OR3-NEXT: movdqa %xmm10, %xmm3
1569 ; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm3
1570 ; SSE2OR3-NEXT: pand %xmm3, %xmm5
1571 ; SSE2OR3-NEXT: pxor %xmm8, %xmm3
1572 ; SSE2OR3-NEXT: por %xmm5, %xmm3
1573 ; SSE2OR3-NEXT: pslld $16, %xmm3
1574 ; SSE2OR3-NEXT: psrad $16, %xmm3
1575 ; SSE2OR3-NEXT: pxor %xmm4, %xmm9
1576 ; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10
1577 ; SSE2OR3-NEXT: pxor %xmm10, %xmm8
1578 ; SSE2OR3-NEXT: pand %xmm4, %xmm10
1579 ; SSE2OR3-NEXT: por %xmm8, %xmm10
1580 ; SSE2OR3-NEXT: pslld $16, %xmm10
1581 ; SSE2OR3-NEXT: psrad $16, %xmm10
1582 ; SSE2OR3-NEXT: packssdw %xmm3, %xmm10
1583 ; SSE2OR3-NEXT: psubusw %xmm10, %xmm1
1584 ; SSE2OR3-NEXT: retq
1586 ; SSE41-LABEL: psubus_16i32_max:
1587 ; SSE41: # %bb.0: # %vector.ph
1588 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535]
1589 ; SSE41-NEXT: pminud %xmm6, %xmm3
1590 ; SSE41-NEXT: pminud %xmm6, %xmm2
1591 ; SSE41-NEXT: packusdw %xmm3, %xmm2
1592 ; SSE41-NEXT: psubusw %xmm2, %xmm0
1593 ; SSE41-NEXT: pminud %xmm6, %xmm5
1594 ; SSE41-NEXT: pminud %xmm6, %xmm4
1595 ; SSE41-NEXT: packusdw %xmm5, %xmm4
1596 ; SSE41-NEXT: psubusw %xmm4, %xmm1
1599 ; AVX1-LABEL: psubus_16i32_max:
1600 ; AVX1: # %bb.0: # %vector.ph
1601 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1602 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
1603 ; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
1604 ; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2
1605 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1606 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1607 ; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2
1608 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1609 ; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
1610 ; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1
1611 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1612 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1613 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1616 ; AVX2-LABEL: psubus_16i32_max:
1617 ; AVX2: # %bb.0: # %vector.ph
1618 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535]
1619 ; AVX2-NEXT: vpminud %ymm3, %ymm2, %ymm2
1620 ; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
1621 ; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
1622 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
1623 ; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
1626 ; AVX512-LABEL: psubus_16i32_max:
1627 ; AVX512: # %bb.0: # %vector.ph
1628 ; AVX512-NEXT: vpmovusdw %zmm1, %ymm1
1629 ; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
1632 %lhs = zext <16 x i16> %x to <16 x i32>
1633 %cond = icmp ult <16 x i32> %lhs, %y
1634 %max = select <16 x i1> %cond, <16 x i32> %y, <16 x i32> %lhs
1635 %sub = sub <16 x i32> %max, %y
1636 %res = trunc <16 x i32> %sub to <16 x i16>
; Saturating-subtract pattern built from an unsigned max whose compare operands
; are swapped: trunc(umax(zext(x), y) - y), with the max spelled as
; select(y <u zext(x), zext(x), y).
; Every run line below expects a single (v)psubusw on the i16 lanes, fed by y
; after it has been saturated to 16 bits and narrowed from i32 to i16.
1640 define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwind {
1641 ; SSE2-LABEL: psubus_i16_i32_max_swapped:
1642 ; SSE2: # %bb.0: # %vector.ph
1643 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
1644 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1645 ; SSE2-NEXT: pxor %xmm3, %xmm4
1646 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
1647 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1648 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
1649 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
1650 ; SSE2-NEXT: pand %xmm6, %xmm2
1651 ; SSE2-NEXT: pxor %xmm4, %xmm6
1652 ; SSE2-NEXT: por %xmm2, %xmm6
1653 ; SSE2-NEXT: pslld $16, %xmm6
1654 ; SSE2-NEXT: psrad $16, %xmm6
1655 ; SSE2-NEXT: pxor %xmm1, %xmm3
1656 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
1657 ; SSE2-NEXT: pxor %xmm5, %xmm4
1658 ; SSE2-NEXT: pand %xmm1, %xmm5
1659 ; SSE2-NEXT: por %xmm4, %xmm5
1660 ; SSE2-NEXT: pslld $16, %xmm5
1661 ; SSE2-NEXT: psrad $16, %xmm5
1662 ; SSE2-NEXT: packssdw %xmm6, %xmm5
1663 ; SSE2-NEXT: psubusw %xmm5, %xmm0
1666 ; SSSE3-LABEL: psubus_i16_i32_max_swapped:
1667 ; SSSE3: # %bb.0: # %vector.ph
1668 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
1669 ; SSSE3-NEXT: movdqa %xmm2, %xmm4
1670 ; SSSE3-NEXT: pxor %xmm3, %xmm4
1671 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
1672 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
1673 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
1674 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
1675 ; SSSE3-NEXT: pand %xmm6, %xmm2
1676 ; SSSE3-NEXT: pandn %xmm4, %xmm6
1677 ; SSSE3-NEXT: por %xmm2, %xmm6
1678 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1679 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
1680 ; SSSE3-NEXT: pxor %xmm1, %xmm3
1681 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
1682 ; SSSE3-NEXT: pand %xmm5, %xmm1
1683 ; SSSE3-NEXT: pandn %xmm4, %xmm5
1684 ; SSSE3-NEXT: por %xmm1, %xmm5
1685 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
1686 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
1687 ; SSSE3-NEXT: psubusw %xmm5, %xmm0
1690 ; SSE41-LABEL: psubus_i16_i32_max_swapped:
1691 ; SSE41: # %bb.0: # %vector.ph
1692 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
1693 ; SSE41-NEXT: pminud %xmm3, %xmm2
1694 ; SSE41-NEXT: pminud %xmm3, %xmm1
1695 ; SSE41-NEXT: packusdw %xmm2, %xmm1
1696 ; SSE41-NEXT: psubusw %xmm1, %xmm0
1699 ; AVX1-LABEL: psubus_i16_i32_max_swapped:
1700 ; AVX1: # %bb.0: # %vector.ph
1701 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1702 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
1703 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
1704 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
1705 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1706 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1707 ; AVX1-NEXT: vzeroupper
1710 ; AVX2-LABEL: psubus_i16_i32_max_swapped:
1711 ; AVX2: # %bb.0: # %vector.ph
1712 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
1713 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
1714 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1715 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1716 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1717 ; AVX2-NEXT: vzeroupper
1720 ; AVX512-LABEL: psubus_i16_i32_max_swapped:
1721 ; AVX512: # %bb.0: # %vector.ph
1722 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
1723 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1724 ; AVX512-NEXT: vzeroupper
; IR under test: x is zero-extended to i32 so it can be compared with y; the
; compare is (y <u lhs), i.e. the operand order is reversed relative to a
; (lhs <u y) formulation, and the select arms are swapped to match.
1727 %lhs = zext <8 x i16> %x to <8 x i32>
1728 %cond = icmp ult <8 x i32> %y, %lhs
1729 %max = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y
; max - y is zero whenever y >=u lhs, otherwise lhs - y; then narrowed to i16.
1730 %sub = sub <8 x i32> %max, %y
1731 %res = trunc <8 x i32> %sub to <8 x i16>
; Saturating-subtract pattern built from an unsigned min:
; trunc(zext(x) - umin(zext(x), y)), with the min spelled as
; select(zext(x) <u y, zext(x), y).
; Every run line below expects a single (v)psubusw on the i16 lanes, fed by y
; after it has been saturated to 16 bits and narrowed from i32 to i16.
1735 define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind {
1736 ; SSE2-LABEL: psubus_i16_i32_min:
1737 ; SSE2: # %bb.0: # %vector.ph
1738 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
1739 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1740 ; SSE2-NEXT: pxor %xmm3, %xmm4
1741 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
1742 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1743 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
1744 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
1745 ; SSE2-NEXT: pand %xmm6, %xmm2
1746 ; SSE2-NEXT: pxor %xmm4, %xmm6
1747 ; SSE2-NEXT: por %xmm2, %xmm6
1748 ; SSE2-NEXT: pslld $16, %xmm6
1749 ; SSE2-NEXT: psrad $16, %xmm6
1750 ; SSE2-NEXT: pxor %xmm1, %xmm3
1751 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
1752 ; SSE2-NEXT: pxor %xmm5, %xmm4
1753 ; SSE2-NEXT: pand %xmm1, %xmm5
1754 ; SSE2-NEXT: por %xmm4, %xmm5
1755 ; SSE2-NEXT: pslld $16, %xmm5
1756 ; SSE2-NEXT: psrad $16, %xmm5
1757 ; SSE2-NEXT: packssdw %xmm6, %xmm5
1758 ; SSE2-NEXT: psubusw %xmm5, %xmm0
1761 ; SSSE3-LABEL: psubus_i16_i32_min:
1762 ; SSSE3: # %bb.0: # %vector.ph
1763 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
1764 ; SSSE3-NEXT: movdqa %xmm2, %xmm4
1765 ; SSSE3-NEXT: pxor %xmm3, %xmm4
1766 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
1767 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
1768 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
1769 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
1770 ; SSSE3-NEXT: pand %xmm6, %xmm2
1771 ; SSSE3-NEXT: pandn %xmm4, %xmm6
1772 ; SSSE3-NEXT: por %xmm2, %xmm6
1773 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1774 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
1775 ; SSSE3-NEXT: pxor %xmm1, %xmm3
1776 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
1777 ; SSSE3-NEXT: pand %xmm5, %xmm1
1778 ; SSSE3-NEXT: pandn %xmm4, %xmm5
1779 ; SSSE3-NEXT: por %xmm1, %xmm5
1780 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
1781 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
1782 ; SSSE3-NEXT: psubusw %xmm5, %xmm0
1785 ; SSE41-LABEL: psubus_i16_i32_min:
1786 ; SSE41: # %bb.0: # %vector.ph
1787 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
1788 ; SSE41-NEXT: pminud %xmm3, %xmm2
1789 ; SSE41-NEXT: pminud %xmm3, %xmm1
1790 ; SSE41-NEXT: packusdw %xmm2, %xmm1
1791 ; SSE41-NEXT: psubusw %xmm1, %xmm0
1794 ; AVX1-LABEL: psubus_i16_i32_min:
1795 ; AVX1: # %bb.0: # %vector.ph
1796 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1797 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
1798 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
1799 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
1800 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1801 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1802 ; AVX1-NEXT: vzeroupper
1805 ; AVX2-LABEL: psubus_i16_i32_min:
1806 ; AVX2: # %bb.0: # %vector.ph
1807 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
1808 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
1809 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1810 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1811 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1812 ; AVX2-NEXT: vzeroupper
1815 ; AVX512-LABEL: psubus_i16_i32_min:
1816 ; AVX512: # %bb.0: # %vector.ph
1817 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
1818 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1819 ; AVX512-NEXT: vzeroupper
; IR under test: x is zero-extended to i32 so it can be compared with y.
1822 %lhs = zext <8 x i16> %x to <8 x i32>
1823 %cond = icmp ult <8 x i32> %lhs, %y
1824 %min = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y
; lhs - min is zero whenever lhs <u y, otherwise lhs - y; then narrowed to i16.
1825 %sub = sub <8 x i32> %lhs, %min
1826 %res = trunc <8 x i32> %sub to <8 x i16>
; Narrower-than-128-bit case: loads <8 x i8> from %p1 and %p2, computes
; select(a >u b, a - b, 0) per lane and stores the result back through %p1.
; All run lines expect one (v)psubusb between 64-bit loads and a 64-bit store.
1830 define void @subus_v8i8(<8 x i8>* %p1, <8 x i8>* %p2) {
1831 ; SSE-LABEL: subus_v8i8:
1833 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1834 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
1835 ; SSE-NEXT: psubusb %xmm1, %xmm0
1836 ; SSE-NEXT: movq %xmm0, (%rdi)
1839 ; AVX-LABEL: subus_v8i8:
1841 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1842 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1843 ; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
1844 ; AVX-NEXT: vmovq %xmm0, (%rdi)
1846 %ld1 = load <8 x i8>, <8 x i8>* %p1, align 8
1847 %ld2 = load <8 x i8>, <8 x i8>* %p2, align 8
; The plain (wrapping) difference is kept only in lanes where ld1 >u ld2;
; all other lanes are forced to zero.
1848 %1 = sub <8 x i8> %ld1, %ld2
1849 %2 = icmp ugt <8 x i8> %ld1, %ld2
1850 %sh3 = select <8 x i1> %2, <8 x i8> %1, <8 x i8> zeroinitializer
1851 store <8 x i8> %sh3, <8 x i8>* %p1, align 8
; Narrower-than-128-bit case: loads <4 x i8> from %p1 and %p2, computes
; select(a >u b, a - b, 0) per lane and stores the result back through %p1.
; All run lines expect one (v)psubusb followed by a 32-bit (v)movd store.
1855 define void @subus_v4i8(<4 x i8>* %p1, <4 x i8>* %p2) {
1856 ; SSE-LABEL: subus_v4i8:
1858 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1859 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
1860 ; SSE-NEXT: psubusb %xmm1, %xmm0
1861 ; SSE-NEXT: movd %xmm0, (%rdi)
1864 ; AVX-LABEL: subus_v4i8:
1866 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1867 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1868 ; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
1869 ; AVX-NEXT: vmovd %xmm0, (%rdi)
1871 %ld1 = load <4 x i8>, <4 x i8>* %p1, align 8
1872 %ld2 = load <4 x i8>, <4 x i8>* %p2, align 8
; The plain (wrapping) difference is kept only in lanes where ld1 >u ld2;
; all other lanes are forced to zero.
1873 %1 = sub <4 x i8> %ld1, %ld2
1874 %2 = icmp ugt <4 x i8> %ld1, %ld2
1875 %sh3 = select <4 x i1> %2, <4 x i8> %1, <4 x i8> zeroinitializer
1876 store <4 x i8> %sh3, <4 x i8>* %p1, align 8
; Narrower-than-128-bit case: loads <2 x i8> from %p1 and %p2, computes
; select(a >u b, a - b, 0) per lane and stores the result back through %p1.
; All run lines expect one (v)psubusb; the 16-bit store differs by target
; (movd + movw for the SSE2/SSSE3 runs, (v)pextrw to memory otherwise).
1880 define void @subus_v2i8(<2 x i8>* %p1, <2 x i8>* %p2) {
1881 ; SSE2OR3-LABEL: subus_v2i8:
1883 ; SSE2OR3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1884 ; SSE2OR3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
1885 ; SSE2OR3-NEXT: psubusb %xmm1, %xmm0
1886 ; SSE2OR3-NEXT: movd %xmm0, %eax
1887 ; SSE2OR3-NEXT: movw %ax, (%rdi)
1888 ; SSE2OR3-NEXT: retq
1890 ; SSE41-LABEL: subus_v2i8:
1892 ; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1893 ; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
1894 ; SSE41-NEXT: psubusb %xmm1, %xmm0
1895 ; SSE41-NEXT: pextrw $0, %xmm0, (%rdi)
1898 ; AVX-LABEL: subus_v2i8:
1900 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1901 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1902 ; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
1903 ; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
1905 %ld1 = load <2 x i8>, <2 x i8>* %p1, align 8
1906 %ld2 = load <2 x i8>, <2 x i8>* %p2, align 8
; The plain (wrapping) difference is kept only in lanes where ld1 >u ld2;
; all other lanes are forced to zero.
1907 %1 = sub <2 x i8> %ld1, %ld2
1908 %2 = icmp ugt <2 x i8> %ld1, %ld2
1909 %sh3 = select <2 x i1> %2, <2 x i8> %1, <2 x i8> zeroinitializer
1910 store <2 x i8> %sh3, <2 x i8>* %p1, align 8
; Narrower-than-128-bit case: loads <4 x i16> from %p1 and %p2, computes
; select(a >u b, a - b, 0) per lane and stores the result back through %p1.
; All run lines expect one (v)psubusw between 64-bit loads and a 64-bit store.
1914 define void @subus_v4i16(<4 x i16>* %p1, <4 x i16>* %p2) {
1915 ; SSE-LABEL: subus_v4i16:
1917 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1918 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
1919 ; SSE-NEXT: psubusw %xmm1, %xmm0
1920 ; SSE-NEXT: movq %xmm0, (%rdi)
1923 ; AVX-LABEL: subus_v4i16:
1925 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1926 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1927 ; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1928 ; AVX-NEXT: vmovq %xmm0, (%rdi)
1930 %ld1 = load <4 x i16>, <4 x i16>* %p1, align 8
1931 %ld2 = load <4 x i16>, <4 x i16>* %p2, align 8
; The plain (wrapping) difference is kept only in lanes where ld1 >u ld2;
; all other lanes are forced to zero.
1932 %1 = sub <4 x i16> %ld1, %ld2
1933 %2 = icmp ugt <4 x i16> %ld1, %ld2
1934 %sh3 = select <4 x i1> %2, <4 x i16> %1, <4 x i16> zeroinitializer
1935 store <4 x i16> %sh3, <4 x i16>* %p1, align 8
; Narrower-than-128-bit case: loads <2 x i16> from %p1 and %p2, computes
; select(a >u b, a - b, 0) per lane and stores the result back through %p1.
; All run lines expect one (v)psubusw followed by a 32-bit (v)movd store.
1939 define void @subus_v2i16(<2 x i16>* %p1, <2 x i16>* %p2) {
1940 ; SSE-LABEL: subus_v2i16:
1942 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1943 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
1944 ; SSE-NEXT: psubusw %xmm1, %xmm0
1945 ; SSE-NEXT: movd %xmm0, (%rdi)
1948 ; AVX-LABEL: subus_v2i16:
1950 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1951 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1952 ; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1953 ; AVX-NEXT: vmovd %xmm0, (%rdi)
1955 %ld1 = load <2 x i16>, <2 x i16>* %p1, align 8
1956 %ld2 = load <2 x i16>, <2 x i16>* %p2, align 8
; The plain (wrapping) difference is kept only in lanes where ld1 >u ld2;
; all other lanes are forced to zero.
1957 %1 = sub <2 x i16> %ld1, %ld2
1958 %2 = icmp ugt <2 x i16> %ld1, %ld2
1959 %sh3 = select <2 x i1> %2, <2 x i16> %1, <2 x i16> zeroinitializer
1960 store <2 x i16> %sh3, <2 x i16>* %p1, align 8
; umax(x, C) + (-C) on <16 x i8> with a splat constant C = 70.
; All run lines expect a single (v)psubusb with a constant-pool operand.
1964 define <16 x i8> @test19(<16 x i8> %x) {
1965 ; SSE-LABEL: test19:
1966 ; SSE: # %bb.0: # %entry
1967 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1970 ; AVX-LABEL: test19:
1971 ; AVX: # %bb.0: # %entry
1972 ; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; The unsigned max is written as icmp ugt + select; the add uses the negated
; constant, so the result is x - 70 when x >u 70 and 0 otherwise.
1975 %0 = icmp ugt <16 x i8> %x, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
1976 %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
1977 %2 = add <16 x i8> %1, <i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70>
; umax(x, C) + (-C) on <16 x i8> with a non-splat constant vector C.
; All run lines expect a single (v)psubusb with a constant-pool operand.
1981 define <16 x i8> @test20(<16 x i8> %x) {
1982 ; SSE-LABEL: test20:
1983 ; SSE: # %bb.0: # %entry
1984 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1987 ; AVX-LABEL: test20:
1988 ; AVX: # %bb.0: # %entry
1989 ; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; Each lane of the addend is the negation of the matching lane of the
; compare/select constant; negative entries are compared as unsigned bytes
; because the predicate is ugt.
1992 %0 = icmp ugt <16 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70>
1993 %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70>
1994 %2 = add <16 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70>
; umax(x, C) + (-C) on <8 x i16> with a splat constant C = 700.
; All run lines expect a single (v)psubusw with a constant-pool operand.
1998 define <8 x i16> @test21(<8 x i16> %x) {
1999 ; SSE-LABEL: test21:
2000 ; SSE: # %bb.0: # %entry
2001 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2004 ; AVX-LABEL: test21:
2005 ; AVX: # %bb.0: # %entry
2006 ; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; The unsigned max is written as icmp ugt + select; the add uses the negated
; constant, so the result is x - 700 when x >u 700 and 0 otherwise.
2009 %0 = icmp ugt <8 x i16> %x, <i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700>
2010 %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700>
2011 %2 = add <8 x i16> %1, <i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700>
; umax(x, C) + (-C) on <8 x i16> with a non-splat constant vector C.
; All run lines expect a single (v)psubusw with a constant-pool operand.
2015 define <8 x i16> @test22(<8 x i16> %x) {
2016 ; SSE-LABEL: test22:
2017 ; SSE: # %bb.0: # %entry
2018 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2021 ; AVX-LABEL: test22:
2022 ; AVX: # %bb.0: # %entry
2023 ; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; Each lane of the addend is the negation of the matching lane of the
; compare/select constant; negative entries are compared as unsigned values
; because the predicate is ugt.
2026 %0 = icmp ugt <8 x i16> %x, <i16 1, i16 -22000, i16 -770, i16 98, i16 19, i16 1000, i16 3456, i16 70>
2027 %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 1, i16 -22000, i16 -770, i16 98, i16 19, i16 1000, i16 3456, i16 70>
2028 %2 = add <8 x i16> %1, <i16 -1, i16 22000, i16 770, i16 -98, i16 -19, i16 -1000, i16 -3456, i16 -70>
; 256-bit variant: umax(x, C) + (-C) on <32 x i8> with a splat constant C = 70.
; The SSE and AVX1 run lines expect two 128-bit (v)psubusb sharing one
; materialized constant; the AVX2 and AVX512 run lines expect one ymm vpsubusb
; with a constant-pool operand.
2032 define <32 x i8> @test23(<32 x i8> %x) {
2033 ; SSE-LABEL: test23:
2034 ; SSE: # %bb.0: # %entry
2035 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70]
2036 ; SSE-NEXT: psubusb %xmm2, %xmm0
2037 ; SSE-NEXT: psubusb %xmm2, %xmm1
2040 ; AVX1-LABEL: test23:
2041 ; AVX1: # %bb.0: # %entry
2042 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2043 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70]
2044 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
2045 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
2046 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2049 ; AVX2-LABEL: test23:
2050 ; AVX2: # %bb.0: # %entry
2051 ; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2054 ; AVX512-LABEL: test23:
2055 ; AVX512: # %bb.0: # %entry
2056 ; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; The unsigned max is written as icmp ugt + select; the add uses the negated
; constant, so the result is x - 70 when x >u 70 and 0 otherwise.
2059 %0 = icmp ugt <32 x i8> %x, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
2060 %1 = select <32 x i1> %0, <32 x i8> %x, <32 x i8> <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
2061 %2 = add <32 x i8> %1, <i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70>
; 256-bit variant: umax(x, C) + (-C) on <32 x i8> with a non-splat constant C
; whose two 16-byte halves differ.
; The SSE and AVX1 run lines expect two 128-bit (v)psubusb, each with its own
; constant-pool operand; the AVX2 and AVX512 run lines expect one ymm vpsubusb.
2065 define <32 x i8> @test24(<32 x i8> %x) {
2066 ; SSE-LABEL: test24:
2067 ; SSE: # %bb.0: # %entry
2068 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2069 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2072 ; AVX1-LABEL: test24:
2073 ; AVX1: # %bb.0: # %entry
2074 ; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2075 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2076 ; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2077 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2080 ; AVX2-LABEL: test24:
2081 ; AVX2: # %bb.0: # %entry
2082 ; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2085 ; AVX512-LABEL: test24:
2086 ; AVX512: # %bb.0: # %entry
2087 ; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; Each lane of the addend is the negation of the matching lane of the
; compare/select constant.
2090 %0 = icmp ugt <32 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2091 %1 = select <32 x i1> %0, <32 x i8> %x, <32 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2092 %2 = add <32 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70>
; 256-bit variant: umax(x, C) + (-C) on <16 x i16> with a splat constant
; C = 5000.
; The SSE and AVX1 run lines expect two 128-bit (v)psubusw sharing one
; materialized constant; the AVX2 and AVX512 run lines expect one ymm vpsubusw
; with a constant-pool operand.
2096 define <16 x i16> @test25(<16 x i16> %x) {
2097 ; SSE-LABEL: test25:
2098 ; SSE: # %bb.0: # %entry
2099 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000]
2100 ; SSE-NEXT: psubusw %xmm2, %xmm0
2101 ; SSE-NEXT: psubusw %xmm2, %xmm1
2104 ; AVX1-LABEL: test25:
2105 ; AVX1: # %bb.0: # %entry
2106 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2107 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000]
2108 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
2109 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
2110 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2113 ; AVX2-LABEL: test25:
2114 ; AVX2: # %bb.0: # %entry
2115 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2118 ; AVX512-LABEL: test25:
2119 ; AVX512: # %bb.0: # %entry
2120 ; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; The unsigned max is written as icmp ugt + select; the add uses the negated
; constant, so the result is x - 5000 when x >u 5000 and 0 otherwise.
2123 %0 = icmp ugt <16 x i16> %x, <i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000>
2124 %1 = select <16 x i1> %0, <16 x i16> %x, <16 x i16> <i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000>
2125 %2 = add <16 x i16> %1, <i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000>
; 256-bit variant: umax(x, C) + (-C) on <16 x i16> with a non-splat constant C.
; The SSE and AVX1 run lines expect two 128-bit (v)psubusw, each with its own
; constant-pool operand; the AVX2 and AVX512 run lines expect one ymm vpsubusw.
2129 define <16 x i16> @test26(<16 x i16> %x) {
2130 ; SSE-LABEL: test26:
2131 ; SSE: # %bb.0: # %entry
2132 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2133 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2136 ; AVX1-LABEL: test26:
2137 ; AVX1: # %bb.0: # %entry
2138 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2139 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2140 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2141 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2144 ; AVX2-LABEL: test26:
2145 ; AVX2: # %bb.0: # %entry
2146 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2149 ; AVX512-LABEL: test26:
2150 ; AVX512: # %bb.0: # %entry
2151 ; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; Each lane of the addend is the negation of the matching lane of the
; compare/select constant.
2154 %0 = icmp ugt <16 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70>
2155 %1 = select <16 x i1> %0, <16 x i16> %x, <16 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70>
2156 %2 = add <16 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9800, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -34, i16 -55, i16 -70>
2160 define <64 x i8> @test27(<64 x i8> %x) {
2161 ; SSE-LABEL: test27:
2162 ; SSE: # %bb.0: # %entry
2163 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
2164 ; SSE-NEXT: psubusb %xmm4, %xmm0
2165 ; SSE-NEXT: psubusb %xmm4, %xmm1
2166 ; SSE-NEXT: psubusb %xmm4, %xmm2
2167 ; SSE-NEXT: psubusb %xmm4, %xmm3
2170 ; AVX1-LABEL: test27:
2171 ; AVX1: # %bb.0: # %entry
2172 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2173 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
2174 ; AVX1-NEXT: vpsubusb %xmm3, %xmm2, %xmm2
2175 ; AVX1-NEXT: vpsubusb %xmm3, %xmm0, %xmm0
2176 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2177 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2178 ; AVX1-NEXT: vpsubusb %xmm3, %xmm2, %xmm2
2179 ; AVX1-NEXT: vpsubusb %xmm3, %xmm1, %xmm1
2180 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2183 ; AVX2-LABEL: test27:
2184 ; AVX2: # %bb.0: # %entry
2185 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
2186 ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
2187 ; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1
2190 ; AVX512-LABEL: test27:
2191 ; AVX512: # %bb.0: # %entry
2192 ; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2195 %0 = icmp ugt <64 x i8> %x, <i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154>
2196 %1 = select <64 x i1> %0, <64 x i8> %x, <64 x i8> <i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154>
2197 %2 = add <64 x i8> %1, <i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154>
; 512-bit variant: umax(x, C) + (-C) on <64 x i8> with a non-splat constant C.
; The first and third 16-byte groups of C are identical, which is why the SSE
; and AVX1 expectations materialize that group once in a register and reuse
; it; the second and fourth groups differ from each other (-114 vs -116 and
; 63 vs 67) and are expected as separate constant-pool operands.
; The AVX2 run expects two ymm vpsubusb; the AVX512 runs expect one zmm
; vpsubusb.
2201 define <64 x i8> @test28(<64 x i8> %x) {
2202 ; SSE-LABEL: test28:
2203 ; SSE: # %bb.0: # %entry
2204 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1,234,206,142,179,186,123,98,63,19,234,100,25,34,55,70]
2205 ; SSE-NEXT: psubusb %xmm4, %xmm0
2206 ; SSE-NEXT: psubusb %xmm4, %xmm2
2207 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2208 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2211 ; AVX1-LABEL: test28:
2212 ; AVX1: # %bb.0: # %entry
2213 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,234,206,142,179,186,123,98,63,19,234,100,25,34,55,70]
2214 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm3
2215 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2216 ; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2217 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
2218 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm2
2219 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2220 ; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2221 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2224 ; AVX2-LABEL: test28:
2225 ; AVX2: # %bb.0: # %entry
2226 ; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2227 ; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2230 ; AVX512-LABEL: test28:
2231 ; AVX512: # %bb.0: # %entry
2232 ; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; Each lane of the addend is the negation of the matching lane of the
; compare/select constant.
2235 %0 = icmp ugt <64 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70, i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -116, i8 -77, i8 -70, i8 123, i8 98, i8 67, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2236 %1 = select <64 x i1> %0, <64 x i8> %x, <64 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70, i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -116, i8 -77, i8 -70, i8 123, i8 98, i8 67, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2237 %2 = add <64 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70, i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 116, i8 77, i8 70, i8 -123, i8 -98, i8 -67, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70>
; 512-bit variant: umax(x, C) + (-C) on <32 x i16> with a non-splat constant C
; whose two 16-element halves differ (9800 vs 9805 and 34 vs 346).
; The SSE run expects four xmm psubusw, AVX1 four xmm vpsubusw, AVX2 two ymm
; vpsubusw, and the AVX512 runs one zmm vpsubusw, each with a constant-pool
; operand.
2241 define <32 x i16> @test29(<32 x i16> %x) {
2242 ; SSE-LABEL: test29:
2243 ; SSE: # %bb.0: # %entry
2244 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2245 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2246 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2247 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2250 ; AVX1-LABEL: test29:
2251 ; AVX1: # %bb.0: # %entry
2252 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
2253 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2254 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2255 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2256 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
2257 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2258 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2259 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2262 ; AVX2-LABEL: test29:
2263 ; AVX2: # %bb.0: # %entry
2264 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2265 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2268 ; AVX512-LABEL: test29:
2269 ; AVX512: # %bb.0: # %entry
2270 ; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; Each lane of the addend is the negation of the matching lane of the
; compare/select constant.
2273 %0 = icmp ugt <32 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70, i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9805, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 346, i16 55, i16 70>
2274 %1 = select <32 x i1> %0, <32 x i16> %x, <32 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70, i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9805, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 346, i16 55, i16 70>
2275 %2 = add <32 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9800, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -34, i16 -55, i16 -70, i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9805, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -346, i16 -55, i16 -70>
; umax(x, C) + (-C) on <8 x i16> where only the low four lanes of the constants
; are defined (the upper four are undef) and only the low 64 bits of the result
; are used, via bitcast to <2 x i64> and extraction of element 0.
; All run lines expect a (v)psubusw with a constant-pool operand followed by a
; 64-bit move of the low half into rax.
2280 define i64 @test30(<8 x i16> %x) {
2281 ; SSE-LABEL: test30:
2282 ; SSE: # %bb.0: # %entry
2283 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2284 ; SSE-NEXT: movq %xmm0, %rax
2287 ; AVX-LABEL: test30:
2288 ; AVX: # %bb.0: # %entry
2289 ; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2290 ; AVX-NEXT: vmovq %xmm0, %rax
; In the four defined lanes the addend is the negation of the compare/select
; constant.
2293 %0 = icmp ugt <8 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 undef, i16 undef, i16 undef, i16 undef>
2294 %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 undef, i16 undef, i16 undef, i16 undef>
2295 %2 = add <8 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 undef, i16 undef, i16 undef, i16 undef>
; Only the low four i16 lanes (the low i64) reach the result.
2296 %3 = bitcast <8 x i16> %2 to <2 x i64>
2297 %4 = extractelement <2 x i64> %3, i32 0
; select(x >u 70, x + (-71), 0) on the bytes of a <2 x i64> argument, where
; only the low eight lanes of the constants are defined (the upper eight are
; undef) and only the low 64 bits of the result are extracted.
; Because x >u 70 is the same condition as x >=u 71, this equals an unsigned
; saturating subtract of 71; all run lines expect a (v)psubusb with a
; constant-pool operand followed by a 64-bit move into rax.
2302 define i64 @test31(<2 x i64> %x) {
2303 ; SSE-LABEL: test31:
2305 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2306 ; SSE-NEXT: movq %xmm0, %rax
2309 ; AVX-LABEL: test31:
2311 ; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2312 ; AVX-NEXT: vmovq %xmm0, %rax
; Reinterpret the two i64 elements as sixteen bytes before the byte-wise ops.
2314 %t0 = bitcast <2 x i64> %x to <16 x i8>
; Note that the compare constant (70) and the addend (-71) differ by one.
2315 %cmp = icmp ugt <16 x i8> %t0, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
2316 %bop = add <16 x i8> %t0, <i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
2317 %sel = select <16 x i1> %cmp, <16 x i8> %bop, <16 x i8> zeroinitializer
; Only the low eight bytes (the low i64) reach the result.
2318 %bc = bitcast <16 x i8> %sel to <2 x i64>
2319 %ext = extractelement <2 x i64> %bc, i32 0
2323 ; v8i16/v8i32 - sub(x,trunc(umin(zext(x),y)))
; The IR at the end of this function zero-extends %a0 to i32, takes the
; unsigned minimum of that value and %a1 (icmp ult + select), truncates the
; minimum back to i16 and subtracts it from %a0. The minimum is never larger
; than zext(%a0), so the truncation cannot drop set bits.
; Each check block below contains exactly one psubusw/vpsubusw and no plain
; subtract; the instructions before it narrow %a1 to 16-bit lanes.
2324 define <8 x i16> @test32(<8 x i16> %a0, <8 x i32> %a1) {
2325 ; SSE2-LABEL: test32:
2327 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
2328 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2329 ; SSE2-NEXT: pxor %xmm3, %xmm4
2330 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
2331 ; SSE2-NEXT: movdqa %xmm5, %xmm6
2332 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
2333 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
2334 ; SSE2-NEXT: pand %xmm6, %xmm2
2335 ; SSE2-NEXT: pxor %xmm4, %xmm6
2336 ; SSE2-NEXT: por %xmm2, %xmm6
2337 ; SSE2-NEXT: pslld $16, %xmm6
2338 ; SSE2-NEXT: psrad $16, %xmm6
2339 ; SSE2-NEXT: pxor %xmm1, %xmm3
2340 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
2341 ; SSE2-NEXT: pxor %xmm5, %xmm4
2342 ; SSE2-NEXT: pand %xmm1, %xmm5
2343 ; SSE2-NEXT: por %xmm4, %xmm5
2344 ; SSE2-NEXT: pslld $16, %xmm5
2345 ; SSE2-NEXT: psrad $16, %xmm5
2346 ; SSE2-NEXT: packssdw %xmm6, %xmm5
2347 ; SSE2-NEXT: psubusw %xmm5, %xmm0
2350 ; SSSE3-LABEL: test32:
2352 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
2353 ; SSSE3-NEXT: movdqa %xmm2, %xmm4
2354 ; SSSE3-NEXT: pxor %xmm3, %xmm4
2355 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
2356 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
2357 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
2358 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
2359 ; SSSE3-NEXT: pand %xmm6, %xmm2
2360 ; SSSE3-NEXT: pandn %xmm4, %xmm6
2361 ; SSSE3-NEXT: por %xmm2, %xmm6
2362 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2363 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
2364 ; SSSE3-NEXT: pxor %xmm1, %xmm3
2365 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
2366 ; SSSE3-NEXT: pand %xmm5, %xmm1
2367 ; SSSE3-NEXT: pandn %xmm4, %xmm5
2368 ; SSSE3-NEXT: por %xmm1, %xmm5
2369 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
2370 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
2371 ; SSSE3-NEXT: psubusw %xmm5, %xmm0
2374 ; SSE41-LABEL: test32:
2376 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
2377 ; SSE41-NEXT: pminud %xmm3, %xmm2
2378 ; SSE41-NEXT: pminud %xmm3, %xmm1
2379 ; SSE41-NEXT: packusdw %xmm2, %xmm1
2380 ; SSE41-NEXT: psubusw %xmm1, %xmm0
2383 ; AVX1-LABEL: test32:
2385 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2386 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
2387 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
2388 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
2389 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2390 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2391 ; AVX1-NEXT: vzeroupper
2394 ; AVX2-LABEL: test32:
2396 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
2397 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
2398 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2399 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2400 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2401 ; AVX2-NEXT: vzeroupper
2404 ; AVX512-LABEL: test32:
2406 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
2407 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2408 ; AVX512-NEXT: vzeroupper
; Widen %a0 so it can be compared against the 32-bit lanes of %a1.
2410 %zext = zext <8 x i16> %a0 to <8 x i32>
; icmp ult + select together form the unsigned minimum of %zext and %a1.
2411 %icmp = icmp ult <8 x i32> %zext, %a1
2412 %umin = select <8 x i1> %icmp, <8 x i32> %zext, <8 x i32> %a1
; Narrow the minimum back to the element width of %a0.
2413 %trunc = trunc <8 x i32> %umin to <8 x i16>
; Subtract the narrowed minimum from the original, un-extended %a0.
2414 %sub = sub <8 x i16> %a0, %trunc
2418 ; v8i32/v8i64 - sub(x,trunc(umin(y,zext(x))))
; The IR at the end of this function zero-extends %a0 to i64, takes the
; unsigned minimum of %a1 and that value (icmp ult + select, with %a1 as the
; first operand), truncates the minimum to i32 and subtracts it from %a0.
; Expected subtraction sequence per prefix, after %a1 has been narrowed to
; 32-bit lanes:
;   SSE2OR3                - psubd, then pcmpgtd + pand to mask the result
;   SSE41 / AVX1 / AVX2-*  - pmaxud followed by psubd
;   AVX512                 - vpmovusqd, then vpmaxud + vpsubd
2419 define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) {
2420 ; SSE2OR3-LABEL: test33:
2422 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
2423 ; SSE2OR3-NEXT: movdqa %xmm3, %xmm6
2424 ; SSE2OR3-NEXT: pxor %xmm9, %xmm6
2425 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259455,9223372039002259455]
2426 ; SSE2OR3-NEXT: movdqa %xmm10, %xmm7
2427 ; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm7
2428 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
2429 ; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm6
2430 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
2431 ; SSE2OR3-NEXT: pand %xmm8, %xmm6
2432 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
2433 ; SSE2OR3-NEXT: por %xmm6, %xmm7
2434 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
2435 ; SSE2OR3-NEXT: pand %xmm7, %xmm3
2436 ; SSE2OR3-NEXT: pandn %xmm8, %xmm7
2437 ; SSE2OR3-NEXT: por %xmm3, %xmm7
2438 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm3
2439 ; SSE2OR3-NEXT: pxor %xmm9, %xmm3
2440 ; SSE2OR3-NEXT: movdqa %xmm10, %xmm6
2441 ; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm6
2442 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm11 = xmm6[0,0,2,2]
2443 ; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm3
2444 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2445 ; SSE2OR3-NEXT: pand %xmm11, %xmm3
2446 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
2447 ; SSE2OR3-NEXT: por %xmm3, %xmm6
2448 ; SSE2OR3-NEXT: pand %xmm6, %xmm2
2449 ; SSE2OR3-NEXT: pandn %xmm8, %xmm6
2450 ; SSE2OR3-NEXT: por %xmm2, %xmm6
2451 ; SSE2OR3-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
2452 ; SSE2OR3-NEXT: movdqa %xmm0, %xmm2
2453 ; SSE2OR3-NEXT: psubd %xmm6, %xmm2
2454 ; SSE2OR3-NEXT: pxor %xmm9, %xmm6
2455 ; SSE2OR3-NEXT: pxor %xmm9, %xmm0
2456 ; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm0
2457 ; SSE2OR3-NEXT: pand %xmm2, %xmm0
2458 ; SSE2OR3-NEXT: movdqa %xmm5, %xmm2
2459 ; SSE2OR3-NEXT: pxor %xmm9, %xmm2
2460 ; SSE2OR3-NEXT: movdqa %xmm10, %xmm3
2461 ; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm3
2462 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
2463 ; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm2
2464 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2465 ; SSE2OR3-NEXT: pand %xmm6, %xmm2
2466 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2467 ; SSE2OR3-NEXT: por %xmm2, %xmm3
2468 ; SSE2OR3-NEXT: pand %xmm3, %xmm5
2469 ; SSE2OR3-NEXT: pandn %xmm8, %xmm3
2470 ; SSE2OR3-NEXT: por %xmm5, %xmm3
2471 ; SSE2OR3-NEXT: movdqa %xmm4, %xmm2
2472 ; SSE2OR3-NEXT: pxor %xmm9, %xmm2
2473 ; SSE2OR3-NEXT: movdqa %xmm10, %xmm5
2474 ; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm5
2475 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
2476 ; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm2
2477 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2478 ; SSE2OR3-NEXT: pand %xmm6, %xmm2
2479 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
2480 ; SSE2OR3-NEXT: por %xmm2, %xmm5
2481 ; SSE2OR3-NEXT: pand %xmm5, %xmm4
2482 ; SSE2OR3-NEXT: pandn %xmm8, %xmm5
2483 ; SSE2OR3-NEXT: por %xmm4, %xmm5
2484 ; SSE2OR3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2]
2485 ; SSE2OR3-NEXT: movdqa %xmm1, %xmm2
2486 ; SSE2OR3-NEXT: psubd %xmm5, %xmm2
2487 ; SSE2OR3-NEXT: pxor %xmm9, %xmm5
2488 ; SSE2OR3-NEXT: pxor %xmm9, %xmm1
2489 ; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm1
2490 ; SSE2OR3-NEXT: pand %xmm2, %xmm1
2491 ; SSE2OR3-NEXT: retq
2493 ; SSE41-LABEL: test33:
2495 ; SSE41-NEXT: movdqa %xmm0, %xmm8
2496 ; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
2497 ; SSE41-NEXT: movdqa %xmm3, %xmm0
2498 ; SSE41-NEXT: pxor %xmm9, %xmm0
2499 ; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259455,9223372039002259455]
2500 ; SSE41-NEXT: movdqa %xmm11, %xmm10
2501 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
2502 ; SSE41-NEXT: movdqa %xmm11, %xmm7
2503 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
2504 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
2505 ; SSE41-NEXT: pand %xmm10, %xmm0
2506 ; SSE41-NEXT: por %xmm7, %xmm0
2507 ; SSE41-NEXT: movapd {{.*#+}} xmm7 = [4294967295,4294967295]
2508 ; SSE41-NEXT: movapd %xmm7, %xmm10
2509 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10
2510 ; SSE41-NEXT: movdqa %xmm2, %xmm0
2511 ; SSE41-NEXT: pxor %xmm9, %xmm0
2512 ; SSE41-NEXT: movdqa %xmm11, %xmm3
2513 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
2514 ; SSE41-NEXT: movdqa %xmm11, %xmm6
2515 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
2516 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
2517 ; SSE41-NEXT: pand %xmm3, %xmm0
2518 ; SSE41-NEXT: por %xmm6, %xmm0
2519 ; SSE41-NEXT: movapd %xmm7, %xmm3
2520 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
2521 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2]
2522 ; SSE41-NEXT: pmaxud %xmm3, %xmm8
2523 ; SSE41-NEXT: psubd %xmm3, %xmm8
2524 ; SSE41-NEXT: movdqa %xmm5, %xmm0
2525 ; SSE41-NEXT: pxor %xmm9, %xmm0
2526 ; SSE41-NEXT: movdqa %xmm11, %xmm2
2527 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
2528 ; SSE41-NEXT: movdqa %xmm11, %xmm3
2529 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
2530 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
2531 ; SSE41-NEXT: pand %xmm2, %xmm0
2532 ; SSE41-NEXT: por %xmm3, %xmm0
2533 ; SSE41-NEXT: movapd %xmm7, %xmm2
2534 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
2535 ; SSE41-NEXT: pxor %xmm4, %xmm9
2536 ; SSE41-NEXT: movdqa %xmm11, %xmm3
2537 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm3
2538 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm11
2539 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
2540 ; SSE41-NEXT: pand %xmm3, %xmm0
2541 ; SSE41-NEXT: por %xmm11, %xmm0
2542 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7
2543 ; SSE41-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[0,2]
2544 ; SSE41-NEXT: pmaxud %xmm7, %xmm1
2545 ; SSE41-NEXT: psubd %xmm7, %xmm1
2546 ; SSE41-NEXT: movdqa %xmm8, %xmm0
2549 ; AVX1-LABEL: test33:
2551 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
2552 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
2553 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
2554 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
2555 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
2556 ; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [4294967295,4294967295]
2557 ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
2558 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
2559 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
2560 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2
2561 ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
2562 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
2563 ; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm3
2564 ; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
2565 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2566 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
2567 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
2568 ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
2569 ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4
2570 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
2571 ; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm7, %xmm1
2572 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
2573 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
2574 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
2575 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2578 ; AVX2-SLOW-LABEL: test33:
2579 ; AVX2-SLOW: # %bb.0:
2580 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
2581 ; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm2, %ymm4
2582 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
2583 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
2584 ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
2585 ; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
2586 ; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3
2587 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
2588 ; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
2589 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
2590 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2591 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6]
2592 ; AVX2-SLOW-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
2593 ; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0
2594 ; AVX2-SLOW-NEXT: retq
2596 ; AVX2-FAST-ALL-LABEL: test33:
2597 ; AVX2-FAST-ALL: # %bb.0:
2598 ; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
2599 ; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4
2600 ; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
2601 ; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
2602 ; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
2603 ; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1
2604 ; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
2605 ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1
2606 ; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm2, %ymm3
2607 ; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
2608 ; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2
2609 ; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm2
2610 ; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2611 ; AVX2-FAST-ALL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
2612 ; AVX2-FAST-ALL-NEXT: vpsubd %ymm1, %ymm0, %ymm0
2613 ; AVX2-FAST-ALL-NEXT: retq
2615 ; AVX2-FAST-PERLANE-LABEL: test33:
2616 ; AVX2-FAST-PERLANE: # %bb.0:
2617 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
2618 ; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm2, %ymm4
2619 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
2620 ; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
2621 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
2622 ; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
2623 ; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm3
2624 ; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
2625 ; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
2626 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
2627 ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2628 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6]
2629 ; AVX2-FAST-PERLANE-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
2630 ; AVX2-FAST-PERLANE-NEXT: vpsubd %ymm1, %ymm0, %ymm0
2631 ; AVX2-FAST-PERLANE-NEXT: retq
2633 ; AVX512-LABEL: test33:
2635 ; AVX512-NEXT: vpmovusqd %zmm1, %ymm1
2636 ; AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
2637 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; Widen %a0 so it can be compared against the 64-bit lanes of %a1.
2639 %zext = zext <8 x i32> %a0 to <8 x i64>
; icmp ult + select together form the unsigned minimum of %a1 and %zext
; (%a1 is the first operand of both the compare and the select).
2640 %icmp = icmp ult <8 x i64> %a1, %zext
2641 %umin = select <8 x i1> %icmp, <8 x i64> %a1, <8 x i64> %zext
; Narrow the minimum back to the element width of %a0.
2642 %trunc = trunc <8 x i64> %umin to <8 x i32>
; Subtract the narrowed minimum from the original, un-extended %a0.
2643 %sub = sub <8 x i32> %a0, %trunc
2647 ; v8i32/v8i64 - sub(and(x,1),trunc(umin(zext(and(x,1)),y)))
2648 define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
2649 ; SSE2OR3-LABEL: test34:
2651 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1]
2652 ; SSE2OR3-NEXT: pand %xmm6, %xmm1
2653 ; SSE2OR3-NEXT: pand %xmm6, %xmm0
2654 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
2655 ; SSE2OR3-NEXT: movdqa %xmm3, %xmm6
2656 ; SSE2OR3-NEXT: pxor %xmm9, %xmm6
2657 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259455,9223372039002259455]
2658 ; SSE2OR3-NEXT: movdqa %xmm10, %xmm7
2659 ; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm7
2660 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
2661 ; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm6
2662 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
2663 ; SSE2OR3-NEXT: pand %xmm8, %xmm6
2664 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
2665 ; SSE2OR3-NEXT: por %xmm6, %xmm7
2666 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
2667 ; SSE2OR3-NEXT: pand %xmm7, %xmm3
2668 ; SSE2OR3-NEXT: pandn %xmm8, %xmm7
2669 ; SSE2OR3-NEXT: por %xmm3, %xmm7
2670 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm3
2671 ; SSE2OR3-NEXT: pxor %xmm9, %xmm3
2672 ; SSE2OR3-NEXT: movdqa %xmm10, %xmm6
2673 ; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm6
2674 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm11 = xmm6[0,0,2,2]
2675 ; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm3
2676 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2677 ; SSE2OR3-NEXT: pand %xmm11, %xmm3
2678 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
2679 ; SSE2OR3-NEXT: por %xmm3, %xmm6
2680 ; SSE2OR3-NEXT: pand %xmm6, %xmm2
2681 ; SSE2OR3-NEXT: pandn %xmm8, %xmm6
2682 ; SSE2OR3-NEXT: por %xmm2, %xmm6
2683 ; SSE2OR3-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
2684 ; SSE2OR3-NEXT: movdqa %xmm0, %xmm2
2685 ; SSE2OR3-NEXT: psubd %xmm6, %xmm2
2686 ; SSE2OR3-NEXT: pxor %xmm9, %xmm6
2687 ; SSE2OR3-NEXT: por %xmm9, %xmm0
2688 ; SSE2OR3-NEXT: pcmpgtd %xmm6, %xmm0
2689 ; SSE2OR3-NEXT: pand %xmm2, %xmm0
2690 ; SSE2OR3-NEXT: movdqa %xmm5, %xmm2
2691 ; SSE2OR3-NEXT: pxor %xmm9, %xmm2
2692 ; SSE2OR3-NEXT: movdqa %xmm10, %xmm3
2693 ; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm3
2694 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
2695 ; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm2
2696 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2697 ; SSE2OR3-NEXT: pand %xmm6, %xmm2
2698 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2699 ; SSE2OR3-NEXT: por %xmm2, %xmm3
2700 ; SSE2OR3-NEXT: pand %xmm3, %xmm5
2701 ; SSE2OR3-NEXT: pandn %xmm8, %xmm3
2702 ; SSE2OR3-NEXT: por %xmm5, %xmm3
2703 ; SSE2OR3-NEXT: movdqa %xmm4, %xmm2
2704 ; SSE2OR3-NEXT: pxor %xmm9, %xmm2
2705 ; SSE2OR3-NEXT: movdqa %xmm10, %xmm5
2706 ; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm5
2707 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
2708 ; SSE2OR3-NEXT: pcmpeqd %xmm10, %xmm2
2709 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2710 ; SSE2OR3-NEXT: pand %xmm6, %xmm2
2711 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
2712 ; SSE2OR3-NEXT: por %xmm2, %xmm5
2713 ; SSE2OR3-NEXT: pand %xmm5, %xmm4
2714 ; SSE2OR3-NEXT: pandn %xmm8, %xmm5
2715 ; SSE2OR3-NEXT: por %xmm4, %xmm5
2716 ; SSE2OR3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2]
2717 ; SSE2OR3-NEXT: movdqa %xmm1, %xmm2
2718 ; SSE2OR3-NEXT: psubd %xmm5, %xmm2
2719 ; SSE2OR3-NEXT: pxor %xmm9, %xmm5
2720 ; SSE2OR3-NEXT: por %xmm9, %xmm1
2721 ; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm1
2722 ; SSE2OR3-NEXT: pand %xmm2, %xmm1
2723 ; SSE2OR3-NEXT: retq
2725 ; SSE41-LABEL: test34:
2727 ; SSE41-NEXT: movdqa %xmm0, %xmm8
2728 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
2729 ; SSE41-NEXT: pand %xmm0, %xmm1
2730 ; SSE41-NEXT: pand %xmm0, %xmm8
2731 ; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
2732 ; SSE41-NEXT: movdqa %xmm3, %xmm0
2733 ; SSE41-NEXT: pxor %xmm9, %xmm0
2734 ; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259455,9223372039002259455]
2735 ; SSE41-NEXT: movdqa %xmm11, %xmm10
2736 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
2737 ; SSE41-NEXT: movdqa %xmm11, %xmm7
2738 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
2739 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
2740 ; SSE41-NEXT: pand %xmm10, %xmm0
2741 ; SSE41-NEXT: por %xmm7, %xmm0
2742 ; SSE41-NEXT: movapd {{.*#+}} xmm7 = [4294967295,4294967295]
2743 ; SSE41-NEXT: movapd %xmm7, %xmm10
2744 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10
2745 ; SSE41-NEXT: movdqa %xmm2, %xmm0
2746 ; SSE41-NEXT: pxor %xmm9, %xmm0
2747 ; SSE41-NEXT: movdqa %xmm11, %xmm3
2748 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
2749 ; SSE41-NEXT: movdqa %xmm11, %xmm6
2750 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
2751 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
2752 ; SSE41-NEXT: pand %xmm3, %xmm0
2753 ; SSE41-NEXT: por %xmm6, %xmm0
2754 ; SSE41-NEXT: movapd %xmm7, %xmm3
2755 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
2756 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2]
2757 ; SSE41-NEXT: pmaxud %xmm3, %xmm8
2758 ; SSE41-NEXT: psubd %xmm3, %xmm8
2759 ; SSE41-NEXT: movdqa %xmm5, %xmm0
2760 ; SSE41-NEXT: pxor %xmm9, %xmm0
2761 ; SSE41-NEXT: movdqa %xmm11, %xmm2
2762 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
2763 ; SSE41-NEXT: movdqa %xmm11, %xmm3
2764 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
2765 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
2766 ; SSE41-NEXT: pand %xmm2, %xmm0
2767 ; SSE41-NEXT: por %xmm3, %xmm0
2768 ; SSE41-NEXT: movapd %xmm7, %xmm2
2769 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
2770 ; SSE41-NEXT: pxor %xmm4, %xmm9
2771 ; SSE41-NEXT: movdqa %xmm11, %xmm3
2772 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm3
2773 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm11
2774 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2]
2775 ; SSE41-NEXT: pand %xmm3, %xmm0
2776 ; SSE41-NEXT: por %xmm11, %xmm0
2777 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7
2778 ; SSE41-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[0,2]
2779 ; SSE41-NEXT: pmaxud %xmm7, %xmm1
2780 ; SSE41-NEXT: psubd %xmm7, %xmm1
2781 ; SSE41-NEXT: movdqa %xmm8, %xmm0
2784 ; AVX1-LABEL: test34:
2786 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2787 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
2788 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
2789 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
2790 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
2791 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
2792 ; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [4294967295,4294967295]
2793 ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
2794 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
2795 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
2796 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2
2797 ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
2798 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
2799 ; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm3
2800 ; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
2801 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
2802 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
2803 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
2804 ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
2805 ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4
2806 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
2807 ; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm7, %xmm1
2808 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
2809 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
2810 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
2811 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2814 ; AVX2-SLOW-LABEL: test34:
2815 ; AVX2-SLOW: # %bb.0:
2816 ; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
2817 ; AVX2-SLOW-NEXT: vpand %ymm3, %ymm0, %ymm0
2818 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
2819 ; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm2, %ymm4
2820 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
2821 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
2822 ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
2823 ; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
2824 ; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3
2825 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
2826 ; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
2827 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
2828 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2829 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6]
2830 ; AVX2-SLOW-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
2831 ; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0
2832 ; AVX2-SLOW-NEXT: retq
2834 ; AVX2-FAST-ALL-LABEL: test34:
2835 ; AVX2-FAST-ALL: # %bb.0:
2836 ; AVX2-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
2837 ; AVX2-FAST-ALL-NEXT: vpand %ymm3, %ymm0, %ymm0
2838 ; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
2839 ; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4
2840 ; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
2841 ; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
2842 ; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
2843 ; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1
2844 ; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
2845 ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1
2846 ; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm2, %ymm3
2847 ; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
2848 ; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2
2849 ; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm2
2850 ; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2851 ; AVX2-FAST-ALL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
2852 ; AVX2-FAST-ALL-NEXT: vpsubd %ymm1, %ymm0, %ymm0
2853 ; AVX2-FAST-ALL-NEXT: retq
2855 ; AVX2-FAST-PERLANE-LABEL: test34:
2856 ; AVX2-FAST-PERLANE: # %bb.0:
2857 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
2858 ; AVX2-FAST-PERLANE-NEXT: vpand %ymm3, %ymm0, %ymm0
2859 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
2860 ; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm2, %ymm4
2861 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
2862 ; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
2863 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
2864 ; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
2865 ; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm3
2866 ; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
2867 ; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
2868 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
2869 ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2870 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6]
2871 ; AVX2-FAST-PERLANE-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
2872 ; AVX2-FAST-PERLANE-NEXT: vpsubd %ymm1, %ymm0, %ymm0
2873 ; AVX2-FAST-PERLANE-NEXT: retq
2875 ; AVX512-LABEL: test34:
2877 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
2878 ; AVX512-NEXT: vpmovusqd %zmm1, %ymm1
2879 ; AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
2880 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
2882 %mask = and <8 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2883 %zext = zext <8 x i32> %mask to <8 x i64>
2884 %icmp = icmp ult <8 x i64> %zext, %a1
2885 %umin = select <8 x i1> %icmp, <8 x i64> %zext, <8 x i64> %a1
2886 %trunc = trunc <8 x i64> %umin to <8 x i32>
2887 %sub = sub <8 x i32> %mask, %trunc