1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2OR3,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2OR3,SSSE3
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
12 declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>)
14 define <8 x i16> @test1(<8 x i16> %x) nounwind {
16 ; SSE: # %bb.0: # %vector.ph
17 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
21 ; AVX: # %bb.0: # %vector.ph
22 ; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
25 %0 = icmp slt <8 x i16> %x, zeroinitializer
26 %1 = xor <8 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
27 %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
31 ; This is logically equivalent to the above.
32 ; usubsat X, (1 << (BW-1)) <--> (X ^ (1 << (BW-1))) & (ashr X, (BW-1))
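; A scalar sketch of the identity (illustrative only, not one of the checked tests),
; using i16 where (1 << (BW-1)) is -32768 (0x8000):
;   %mask = ashr i16 %x, 15       ; all-ones iff the sign bit of %x is set
;   %flip = xor i16 %x, -32768    ; flips the sign bit, i.e. subtracts 0x8000 mod 2^16
;   %r    = and i16 %mask, %flip  ; 0 when %x < 0x8000, otherwise %x - 0x8000
; e.g. %x = 0xC000 gives %mask = 0xFFFF, %flip = 0x4000, %r = 0x4000 = usubsat(0xC000, 0x8000).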
34 define <8 x i16> @ashr_xor_and(<8 x i16> %x) nounwind {
35 ; SSE-LABEL: ashr_xor_and:
37 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
40 ; AVX-LABEL: ashr_xor_and:
42 ; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
44 %signsplat = ashr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
45 %flipsign = xor <8 x i16> %x, <i16 undef, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>
46 %res = and <8 x i16> %signsplat, %flipsign
50 define <8 x i16> @ashr_add_and(<8 x i16> %x) nounwind {
51 ; SSE-LABEL: ashr_add_and:
53 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
56 ; AVX-LABEL: ashr_add_and:
58 ; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
60 %signsplat = ashr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
61 %flipsign = add <8 x i16> %x, <i16 undef, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>
62 %res = and <8 x i16> %signsplat, %flipsign
66 ; negative test - extra uses may lead to extra instructions when custom-lowered
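; Both the ashr and the xor results are stored below, so they must be materialized
; regardless; the checks show the generic compare/xor/and sequence instead of psubusb.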
68 define <16 x i8> @ashr_xor_and_commute_uses(<16 x i8> %x, ptr %p1, ptr %p2) nounwind {
69 ; SSE-LABEL: ashr_xor_and_commute_uses:
71 ; SSE-NEXT: pxor %xmm1, %xmm1
72 ; SSE-NEXT: pcmpgtb %xmm0, %xmm1
73 ; SSE-NEXT: movdqa %xmm1, (%rdi)
74 ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
75 ; SSE-NEXT: movdqa %xmm0, (%rsi)
76 ; SSE-NEXT: pand %xmm1, %xmm0
79 ; AVX1-LABEL: ashr_xor_and_commute_uses:
81 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
82 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
83 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
84 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
85 ; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
86 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
89 ; AVX2-LABEL: ashr_xor_and_commute_uses:
91 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
92 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
93 ; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
94 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
95 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
96 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
99 ; AVX512-LABEL: ashr_xor_and_commute_uses:
101 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
102 ; AVX512-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
103 ; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
104 ; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
105 ; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
106 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
108 %signsplat = ashr <16 x i8> %x, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
109 store <16 x i8> %signsplat, ptr %p1
110 %flipsign = xor <16 x i8> %x, <i8 undef, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
111 store <16 x i8> %flipsign, ptr %p2
112 %res = and <16 x i8> %flipsign, %signsplat
116 define <4 x i32> @ashr_xor_and_custom(<4 x i32> %x) nounwind {
117 ; SSE2OR3-LABEL: ashr_xor_and_custom:
119 ; SSE2OR3-NEXT: movdqa %xmm0, %xmm1
120 ; SSE2OR3-NEXT: psrad $31, %xmm1
121 ; SSE2OR3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
122 ; SSE2OR3-NEXT: pand %xmm1, %xmm0
125 ; SSE41-LABEL: ashr_xor_and_custom:
127 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
128 ; SSE41-NEXT: pmaxud %xmm1, %xmm0
129 ; SSE41-NEXT: psubd %xmm1, %xmm0
132 ; AVX1-LABEL: ashr_xor_and_custom:
134 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
135 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
136 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
139 ; AVX2-LABEL: ashr_xor_and_custom:
141 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
142 ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
143 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
146 ; AVX512-LABEL: ashr_xor_and_custom:
148 ; AVX512-NEXT: vpsrad $31, %xmm0, %xmm1
149 ; AVX512-NEXT: vpternlogd $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
151 %signsplat = ashr <4 x i32> %x, <i32 undef, i32 31, i32 31, i32 31>
152 %flipsign = xor <4 x i32> %x, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
153 %res = and <4 x i32> %flipsign, %signsplat
157 define <4 x i32> @ashr_add_and_custom(<4 x i32> %x) nounwind {
158 ; SSE2OR3-LABEL: ashr_add_and_custom:
160 ; SSE2OR3-NEXT: movdqa %xmm0, %xmm1
161 ; SSE2OR3-NEXT: psrad $31, %xmm1
162 ; SSE2OR3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
163 ; SSE2OR3-NEXT: pand %xmm1, %xmm0
166 ; SSE41-LABEL: ashr_add_and_custom:
168 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
169 ; SSE41-NEXT: pmaxud %xmm1, %xmm0
170 ; SSE41-NEXT: psubd %xmm1, %xmm0
173 ; AVX1-LABEL: ashr_add_and_custom:
175 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
176 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
177 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
180 ; AVX2-LABEL: ashr_add_and_custom:
182 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
183 ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
184 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
187 ; AVX512-LABEL: ashr_add_and_custom:
189 ; AVX512-NEXT: vpsrad $31, %xmm0, %xmm1
190 ; AVX512-NEXT: vpternlogd $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
192 %signsplat = ashr <4 x i32> %x, <i32 undef, i32 31, i32 31, i32 31>
193 %flipsign = add <4 x i32> %x, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
194 %res = and <4 x i32> %flipsign, %signsplat
198 ; usubsat X, (1 << (BW-1)) <--> (X ^ (1 << (BW-1))) & (ashr X, (BW-1))
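; The same fold applies per element for i32: the constant below is 0x80000000
; (2147483648), with the last lane of the intrinsic's constant operand left undef.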
200 define <4 x i32> @usubsat_custom(<4 x i32> %x) nounwind {
201 ; SSE2OR3-LABEL: usubsat_custom:
203 ; SSE2OR3-NEXT: movdqa %xmm0, %xmm1
204 ; SSE2OR3-NEXT: psrad $31, %xmm1
205 ; SSE2OR3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
206 ; SSE2OR3-NEXT: pand %xmm1, %xmm0
209 ; SSE41-LABEL: usubsat_custom:
211 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,u]
212 ; SSE41-NEXT: pmaxud %xmm1, %xmm0
213 ; SSE41-NEXT: psubd %xmm1, %xmm0
216 ; AVX1-LABEL: usubsat_custom:
218 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
219 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
220 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
223 ; AVX2-LABEL: usubsat_custom:
225 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
226 ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
227 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
230 ; AVX512-LABEL: usubsat_custom:
232 ; AVX512-NEXT: vpsrad $31, %xmm0, %xmm1
233 ; AVX512-NEXT: vpternlogd $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
235 %res = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> <i32 2147483648, i32 2147483648, i32 2147483648, i32 undef>)
239 define <8 x i16> @test2(<8 x i16> %x) nounwind {
241 ; SSE: # %bb.0: # %vector.ph
242 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
246 ; AVX: # %bb.0: # %vector.ph
247 ; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
250 %0 = icmp ugt <8 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
251 %1 = add <8 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
252 %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
256 define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
258 ; SSE: # %bb.0: # %vector.ph
259 ; SSE-NEXT: movd %edi, %xmm1
260 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
261 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
262 ; SSE-NEXT: psubusw %xmm1, %xmm0
266 ; AVX1: # %bb.0: # %vector.ph
267 ; AVX1-NEXT: vmovd %edi, %xmm1
268 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
269 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
270 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
274 ; AVX2: # %bb.0: # %vector.ph
275 ; AVX2-NEXT: vmovd %edi, %xmm1
276 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
277 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
280 ; AVX512-LABEL: test3:
281 ; AVX512: # %bb.0: # %vector.ph
282 ; AVX512-NEXT: vpbroadcastw %edi, %xmm1
283 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
286 %0 = insertelement <8 x i16> undef, i16 %w, i32 0
287 %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
288 %1 = icmp ult <8 x i16> %x, %broadcast15
289 %2 = sub <8 x i16> %x, %broadcast15
290 %res = select <8 x i1> %1, <8 x i16> zeroinitializer, <8 x i16> %2
294 define <16 x i8> @test4(<16 x i8> %x) nounwind {
296 ; SSE: # %bb.0: # %vector.ph
297 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
301 ; AVX: # %bb.0: # %vector.ph
302 ; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
305 %0 = icmp slt <16 x i8> %x, zeroinitializer
306 %1 = xor <16 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
307 %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
311 define <16 x i8> @test5(<16 x i8> %x) nounwind {
313 ; SSE: # %bb.0: # %vector.ph
314 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
318 ; AVX: # %bb.0: # %vector.ph
319 ; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
322 %0 = icmp ugt <16 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
323 %1 = add <16 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
324 %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
328 define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
330 ; SSE2: # %bb.0: # %vector.ph
331 ; SSE2-NEXT: movd %edi, %xmm1
332 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
333 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
334 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
335 ; SSE2-NEXT: psubusb %xmm1, %xmm0
338 ; SSSE3-LABEL: test6:
339 ; SSSE3: # %bb.0: # %vector.ph
340 ; SSSE3-NEXT: movd %edi, %xmm1
341 ; SSSE3-NEXT: pxor %xmm2, %xmm2
342 ; SSSE3-NEXT: pshufb %xmm2, %xmm1
343 ; SSSE3-NEXT: psubusb %xmm1, %xmm0
346 ; SSE41-LABEL: test6:
347 ; SSE41: # %bb.0: # %vector.ph
348 ; SSE41-NEXT: movd %edi, %xmm1
349 ; SSE41-NEXT: pxor %xmm2, %xmm2
350 ; SSE41-NEXT: pshufb %xmm2, %xmm1
351 ; SSE41-NEXT: psubusb %xmm1, %xmm0
355 ; AVX1: # %bb.0: # %vector.ph
356 ; AVX1-NEXT: vmovd %edi, %xmm1
357 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
358 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
359 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
363 ; AVX2: # %bb.0: # %vector.ph
364 ; AVX2-NEXT: vmovd %edi, %xmm1
365 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
366 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
369 ; AVX512-LABEL: test6:
370 ; AVX512: # %bb.0: # %vector.ph
371 ; AVX512-NEXT: vpbroadcastb %edi, %xmm1
372 ; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
375 %0 = insertelement <16 x i8> undef, i8 %w, i32 0
376 %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
377 %1 = icmp ult <16 x i8> %x, %broadcast15
378 %2 = sub <16 x i8> %x, %broadcast15
379 %res = select <16 x i1> %1, <16 x i8> zeroinitializer, <16 x i8> %2
383 define <16 x i16> @test7(<16 x i16> %x) nounwind {
385 ; SSE: # %bb.0: # %vector.ph
386 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
387 ; SSE-NEXT: psubusw %xmm2, %xmm0
388 ; SSE-NEXT: psubusw %xmm2, %xmm1
392 ; AVX1: # %bb.0: # %vector.ph
393 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
394 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
395 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
396 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
397 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
401 ; AVX2: # %bb.0: # %vector.ph
402 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
405 ; AVX512-LABEL: test7:
406 ; AVX512: # %bb.0: # %vector.ph
407 ; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
410 %0 = icmp slt <16 x i16> %x, zeroinitializer
411 %1 = xor <16 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
412 %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
416 define <16 x i16> @ashr_xor_and_v16i16(<16 x i16> %x) nounwind {
417 ; SSE-LABEL: ashr_xor_and_v16i16:
419 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
420 ; SSE-NEXT: psubusw %xmm2, %xmm0
421 ; SSE-NEXT: psubusw %xmm2, %xmm1
424 ; AVX1-LABEL: ashr_xor_and_v16i16:
426 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
427 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
428 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
429 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
430 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
433 ; AVX2-LABEL: ashr_xor_and_v16i16:
435 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
438 ; AVX512-LABEL: ashr_xor_and_v16i16:
440 ; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
442 %signsplat = ashr <16 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
443 %flipsign = xor <16 x i16> %x, <i16 undef, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>
444 %res = and <16 x i16> %signsplat, %flipsign
448 define <16 x i16> @ashr_add_and_v16i16(<16 x i16> %x) nounwind {
449 ; SSE-LABEL: ashr_add_and_v16i16:
451 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
452 ; SSE-NEXT: psubusw %xmm2, %xmm0
453 ; SSE-NEXT: psubusw %xmm2, %xmm1
456 ; AVX1-LABEL: ashr_add_and_v16i16:
458 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
459 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
460 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
461 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
462 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
465 ; AVX2-LABEL: ashr_add_and_v16i16:
467 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
470 ; AVX512-LABEL: ashr_add_and_v16i16:
472 ; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
474 %signsplat = ashr <16 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
475 %flipsign = add <16 x i16> %x, <i16 undef, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768, i16 32768>
476 %res = and <16 x i16> %signsplat, %flipsign
480 define <16 x i16> @test8(<16 x i16> %x) nounwind {
482 ; SSE: # %bb.0: # %vector.ph
483 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
484 ; SSE-NEXT: psubusw %xmm2, %xmm0
485 ; SSE-NEXT: psubusw %xmm2, %xmm1
489 ; AVX1: # %bb.0: # %vector.ph
490 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
491 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
492 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
493 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
494 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
498 ; AVX2: # %bb.0: # %vector.ph
499 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
502 ; AVX512-LABEL: test8:
503 ; AVX512: # %bb.0: # %vector.ph
504 ; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
507 %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
508 %1 = add <16 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
509 %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
513 define <16 x i16> @test8a(<16 x i16> %x) nounwind {
515 ; SSE: # %bb.0: # %vector.ph
516 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
517 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
520 ; AVX1-LABEL: test8a:
521 ; AVX1: # %bb.0: # %vector.ph
522 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
523 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
524 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
525 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
528 ; AVX2-LABEL: test8a:
529 ; AVX2: # %bb.0: # %vector.ph
530 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
533 ; AVX512-LABEL: test8a:
534 ; AVX512: # %bb.0: # %vector.ph
535 ; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
538 %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32765, i16 32764, i16 32763, i16 32762, i16 32761, i16 32760, i16 32759, i16 32758, i16 32757, i16 32756, i16 32755, i16 32754, i16 32753, i16 32752, i16 32751>
539 %1 = add <16 x i16> %x, <i16 -32767, i16 -32766, i16 -32765, i16 -32764, i16 -32763, i16 -32762, i16 -32761, i16 -32760, i16 -32759, i16 -32758, i16 -32757, i16 -32756, i16 -32755, i16 -32754, i16 -32753, i16 -32752>
540 %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
544 define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
546 ; SSE: # %bb.0: # %vector.ph
547 ; SSE-NEXT: movd %edi, %xmm2
548 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
549 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
550 ; SSE-NEXT: psubusw %xmm2, %xmm0
551 ; SSE-NEXT: psubusw %xmm2, %xmm1
555 ; AVX1: # %bb.0: # %vector.ph
556 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
557 ; AVX1-NEXT: vmovd %edi, %xmm2
558 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
559 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
560 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
561 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
562 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
566 ; AVX2: # %bb.0: # %vector.ph
567 ; AVX2-NEXT: vmovd %edi, %xmm1
568 ; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
569 ; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
572 ; AVX512-LABEL: test9:
573 ; AVX512: # %bb.0: # %vector.ph
574 ; AVX512-NEXT: vpbroadcastw %edi, %ymm1
575 ; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
578 %0 = insertelement <16 x i16> undef, i16 %w, i32 0
579 %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
580 %1 = icmp ult <16 x i16> %x, %broadcast15
581 %2 = sub <16 x i16> %x, %broadcast15
582 %res = select <16 x i1> %1, <16 x i16> zeroinitializer, <16 x i16> %2
586 define <32 x i8> @test10(<32 x i8> %x) nounwind {
588 ; SSE: # %bb.0: # %vector.ph
589 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
590 ; SSE-NEXT: psubusb %xmm2, %xmm0
591 ; SSE-NEXT: psubusb %xmm2, %xmm1
594 ; AVX1-LABEL: test10:
595 ; AVX1: # %bb.0: # %vector.ph
596 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
597 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
598 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
599 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
600 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
603 ; AVX2-LABEL: test10:
604 ; AVX2: # %bb.0: # %vector.ph
605 ; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
608 ; AVX512-LABEL: test10:
609 ; AVX512: # %bb.0: # %vector.ph
610 ; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
613 %0 = icmp slt <32 x i8> %x, zeroinitializer
614 %1 = xor <32 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
615 %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
619 define <32 x i8> @test11(<32 x i8> %x) nounwind {
621 ; SSE: # %bb.0: # %vector.ph
622 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
623 ; SSE-NEXT: psubusb %xmm2, %xmm0
624 ; SSE-NEXT: psubusb %xmm2, %xmm1
627 ; AVX1-LABEL: test11:
628 ; AVX1: # %bb.0: # %vector.ph
629 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
630 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
631 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
632 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
633 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
636 ; AVX2-LABEL: test11:
637 ; AVX2: # %bb.0: # %vector.ph
638 ; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
641 ; AVX512-LABEL: test11:
642 ; AVX512: # %bb.0: # %vector.ph
643 ; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
646 %0 = icmp ugt <32 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
647 %1 = add <32 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
648 %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
652 define <32 x i8> @test11a(<32 x i8> %x) nounwind {
653 ; SSE-LABEL: test11a:
654 ; SSE: # %bb.0: # %vector.ph
655 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
656 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
659 ; AVX1-LABEL: test11a:
660 ; AVX1: # %bb.0: # %vector.ph
661 ; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
662 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
663 ; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
664 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
667 ; AVX2-LABEL: test11a:
668 ; AVX2: # %bb.0: # %vector.ph
669 ; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
672 ; AVX512-LABEL: test11a:
673 ; AVX512: # %bb.0: # %vector.ph
674 ; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
677 %0 = icmp ugt <32 x i8> %x, <i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 117, i8 116, i8 115, i8 114, i8 113, i8 112, i8 111, i8 110, i8 109, i8 108, i8 107, i8 106, i8 105, i8 104, i8 103, i8 102, i8 101, i8 100, i8 99, i8 98, i8 97, i8 96, i8 95>
678 %1 = add <32 x i8> %x, <i8 -127, i8 -126, i8 -125, i8 -124, i8 -123, i8 -122, i8 -121, i8 -120, i8 -119, i8 -118, i8 -117, i8 -116, i8 -115, i8 -114, i8 -113, i8 -112, i8 -111, i8 -110, i8 -109, i8 -108, i8 -107, i8 -106, i8 -105, i8 -104, i8 -103, i8 -102, i8 -101, i8 -100, i8 -99, i8 -98, i8 -97, i8 -96>
679 %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
683 define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
684 ; SSE2-LABEL: test12:
685 ; SSE2: # %bb.0: # %vector.ph
686 ; SSE2-NEXT: movd %edi, %xmm2
687 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
688 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
689 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
690 ; SSE2-NEXT: psubusb %xmm2, %xmm0
691 ; SSE2-NEXT: psubusb %xmm2, %xmm1
694 ; SSSE3-LABEL: test12:
695 ; SSSE3: # %bb.0: # %vector.ph
696 ; SSSE3-NEXT: movd %edi, %xmm2
697 ; SSSE3-NEXT: pxor %xmm3, %xmm3
698 ; SSSE3-NEXT: pshufb %xmm3, %xmm2
699 ; SSSE3-NEXT: psubusb %xmm2, %xmm0
700 ; SSSE3-NEXT: psubusb %xmm2, %xmm1
703 ; SSE41-LABEL: test12:
704 ; SSE41: # %bb.0: # %vector.ph
705 ; SSE41-NEXT: movd %edi, %xmm2
706 ; SSE41-NEXT: pxor %xmm3, %xmm3
707 ; SSE41-NEXT: pshufb %xmm3, %xmm2
708 ; SSE41-NEXT: psubusb %xmm2, %xmm0
709 ; SSE41-NEXT: psubusb %xmm2, %xmm1
712 ; AVX1-LABEL: test12:
713 ; AVX1: # %bb.0: # %vector.ph
714 ; AVX1-NEXT: vmovd %edi, %xmm1
715 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
716 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
717 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
718 ; AVX1-NEXT: vpsubusb %xmm1, %xmm2, %xmm2
719 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
720 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
723 ; AVX2-LABEL: test12:
724 ; AVX2: # %bb.0: # %vector.ph
725 ; AVX2-NEXT: vmovd %edi, %xmm1
726 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
727 ; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
730 ; AVX512-LABEL: test12:
731 ; AVX512: # %bb.0: # %vector.ph
732 ; AVX512-NEXT: vpbroadcastb %edi, %ymm1
733 ; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
736 %0 = insertelement <32 x i8> undef, i8 %w, i32 0
737 %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
738 %1 = icmp ult <32 x i8> %x, %broadcast15
739 %2 = sub <32 x i8> %x, %broadcast15
740 %res = select <32 x i1> %1, <32 x i8> zeroinitializer, <32 x i8> %2
744 define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
745 ; SSE2-LABEL: test13:
746 ; SSE2: # %bb.0: # %vector.ph
747 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
748 ; SSE2-NEXT: movdqa %xmm2, %xmm4
749 ; SSE2-NEXT: pxor %xmm3, %xmm4
750 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
751 ; SSE2-NEXT: movdqa %xmm5, %xmm6
752 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
753 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
754 ; SSE2-NEXT: pand %xmm6, %xmm2
755 ; SSE2-NEXT: pxor %xmm4, %xmm6
756 ; SSE2-NEXT: por %xmm2, %xmm6
757 ; SSE2-NEXT: pslld $16, %xmm6
758 ; SSE2-NEXT: psrad $16, %xmm6
759 ; SSE2-NEXT: pxor %xmm1, %xmm3
760 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
761 ; SSE2-NEXT: pxor %xmm5, %xmm4
762 ; SSE2-NEXT: pand %xmm1, %xmm5
763 ; SSE2-NEXT: por %xmm4, %xmm5
764 ; SSE2-NEXT: pslld $16, %xmm5
765 ; SSE2-NEXT: psrad $16, %xmm5
766 ; SSE2-NEXT: packssdw %xmm6, %xmm5
767 ; SSE2-NEXT: psubusw %xmm5, %xmm0
770 ; SSSE3-LABEL: test13:
771 ; SSSE3: # %bb.0: # %vector.ph
772 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
773 ; SSSE3-NEXT: movdqa %xmm2, %xmm4
774 ; SSSE3-NEXT: pxor %xmm3, %xmm4
775 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
776 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
777 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
778 ; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
779 ; SSSE3-NEXT: pand %xmm6, %xmm2
780 ; SSSE3-NEXT: pxor %xmm4, %xmm6
781 ; SSSE3-NEXT: por %xmm2, %xmm6
782 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
783 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
784 ; SSSE3-NEXT: pxor %xmm1, %xmm3
785 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
786 ; SSSE3-NEXT: pxor %xmm5, %xmm4
787 ; SSSE3-NEXT: pand %xmm1, %xmm5
788 ; SSSE3-NEXT: por %xmm4, %xmm5
789 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
790 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
791 ; SSSE3-NEXT: psubusw %xmm5, %xmm0
794 ; SSE41-LABEL: test13:
795 ; SSE41: # %bb.0: # %vector.ph
796 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
797 ; SSE41-NEXT: pminud %xmm3, %xmm2
798 ; SSE41-NEXT: pminud %xmm3, %xmm1
799 ; SSE41-NEXT: packusdw %xmm2, %xmm1
800 ; SSE41-NEXT: psubusw %xmm1, %xmm0
803 ; AVX1-LABEL: test13:
804 ; AVX1: # %bb.0: # %vector.ph
805 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
806 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
807 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
808 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
809 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
810 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
811 ; AVX1-NEXT: vzeroupper
814 ; AVX2-LABEL: test13:
815 ; AVX2: # %bb.0: # %vector.ph
816 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
817 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
818 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
819 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
820 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
821 ; AVX2-NEXT: vzeroupper
824 ; AVX512-LABEL: test13:
825 ; AVX512: # %bb.0: # %vector.ph
826 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
827 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
828 ; AVX512-NEXT: vzeroupper
831 %lhs = zext <8 x i16> %x to <8 x i32>
832 %cond = icmp ult <8 x i32> %lhs, %y
833 %sub = sub <8 x i32> %lhs, %y
834 %trunc = trunc <8 x i32> %sub to <8 x i16>
835 %res = select <8 x i1> %cond, <8 x i16> zeroinitializer, <8 x i16> %trunc
839 ; FIXME: match this to UMIN+TRUNC+PSUBUS
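; (test13 above already gets the UMIN+TRUNC+PSUBUS shape for i16 elements: the SSE41
; lowering clamps with pminud, narrows with packusdw, then uses psubusw. An analogous
; byte-element sequence is what this FIXME is asking for.)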
840 define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
841 ; SSE2OR3-LABEL: test14:
842 ; SSE2OR3: # %bb.0: # %vector.ph
843 ; SSE2OR3-NEXT: pxor %xmm6, %xmm6
844 ; SSE2OR3-NEXT: movdqa %xmm0, %xmm5
845 ; SSE2OR3-NEXT: movdqa %xmm4, %xmm7
846 ; SSE2OR3-NEXT: movdqa %xmm3, %xmm8
847 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm9
848 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
849 ; SSE2OR3-NEXT: pand %xmm10, %xmm4
850 ; SSE2OR3-NEXT: pand %xmm10, %xmm3
851 ; SSE2OR3-NEXT: packuswb %xmm4, %xmm3
852 ; SSE2OR3-NEXT: movdqa %xmm1, %xmm4
853 ; SSE2OR3-NEXT: pand %xmm10, %xmm2
854 ; SSE2OR3-NEXT: pand %xmm10, %xmm1
855 ; SSE2OR3-NEXT: packuswb %xmm2, %xmm1
856 ; SSE2OR3-NEXT: packuswb %xmm3, %xmm1
857 ; SSE2OR3-NEXT: psubb %xmm0, %xmm1
858 ; SSE2OR3-NEXT: movdqa %xmm0, %xmm2
859 ; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
860 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm0
861 ; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
862 ; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
863 ; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
864 ; SSE2OR3-NEXT: movdqa %xmm5, %xmm3
865 ; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
866 ; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
867 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
868 ; SSE2OR3-NEXT: pxor %xmm6, %xmm7
869 ; SSE2OR3-NEXT: por %xmm6, %xmm5
870 ; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm5
871 ; SSE2OR3-NEXT: pxor %xmm6, %xmm8
872 ; SSE2OR3-NEXT: por %xmm6, %xmm3
873 ; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm3
874 ; SSE2OR3-NEXT: packssdw %xmm5, %xmm3
875 ; SSE2OR3-NEXT: pxor %xmm6, %xmm9
876 ; SSE2OR3-NEXT: por %xmm6, %xmm2
877 ; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm2
878 ; SSE2OR3-NEXT: pxor %xmm6, %xmm4
879 ; SSE2OR3-NEXT: por %xmm6, %xmm0
880 ; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm0
881 ; SSE2OR3-NEXT: packssdw %xmm2, %xmm0
882 ; SSE2OR3-NEXT: packsswb %xmm3, %xmm0
883 ; SSE2OR3-NEXT: pandn %xmm1, %xmm0
886 ; SSE41-LABEL: test14:
887 ; SSE41: # %bb.0: # %vector.ph
888 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
889 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
890 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
891 ; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
892 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
893 ; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3]
894 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
895 ; SSE41-NEXT: pmaxud %xmm4, %xmm8
896 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm8
897 ; SSE41-NEXT: pmaxud %xmm3, %xmm7
898 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
899 ; SSE41-NEXT: packssdw %xmm8, %xmm7
900 ; SSE41-NEXT: pmaxud %xmm1, %xmm6
901 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm6
902 ; SSE41-NEXT: pmaxud %xmm2, %xmm5
903 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
904 ; SSE41-NEXT: packssdw %xmm5, %xmm6
905 ; SSE41-NEXT: packsswb %xmm7, %xmm6
906 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
907 ; SSE41-NEXT: pand %xmm5, %xmm4
908 ; SSE41-NEXT: pand %xmm5, %xmm3
909 ; SSE41-NEXT: packusdw %xmm4, %xmm3
910 ; SSE41-NEXT: pand %xmm5, %xmm2
911 ; SSE41-NEXT: pand %xmm1, %xmm5
912 ; SSE41-NEXT: packusdw %xmm2, %xmm5
913 ; SSE41-NEXT: packuswb %xmm3, %xmm5
914 ; SSE41-NEXT: psubb %xmm0, %xmm5
915 ; SSE41-NEXT: pand %xmm6, %xmm5
916 ; SSE41-NEXT: movdqa %xmm5, %xmm0
919 ; AVX1-LABEL: test14:
920 ; AVX1: # %bb.0: # %vector.ph
921 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
922 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
923 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
924 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
925 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
926 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
927 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
928 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
929 ; AVX1-NEXT: vpmaxud %xmm6, %xmm7, %xmm6
930 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6
931 ; AVX1-NEXT: vpmaxud %xmm5, %xmm2, %xmm5
932 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm5
933 ; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
934 ; AVX1-NEXT: vpmaxud %xmm4, %xmm1, %xmm4
935 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm4
936 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
937 ; AVX1-NEXT: vpmaxud %xmm3, %xmm6, %xmm3
938 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm6, %xmm3
939 ; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
940 ; AVX1-NEXT: vpacksswb %xmm5, %xmm3, %xmm3
941 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
942 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
943 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
944 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
945 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
946 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
947 ; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
948 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
949 ; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0
950 ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
951 ; AVX1-NEXT: vzeroupper
954 ; AVX2-LABEL: test14:
955 ; AVX2: # %bb.0: # %vector.ph
956 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
957 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
958 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
959 ; AVX2-NEXT: vpmaxud %ymm4, %ymm1, %ymm4
960 ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm4
961 ; AVX2-NEXT: vpmaxud %ymm3, %ymm2, %ymm3
962 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm3
963 ; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3
964 ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
965 ; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
966 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
967 ; AVX2-NEXT: vpacksswb %xmm4, %xmm3, %xmm3
968 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
969 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
970 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
971 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
972 ; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
973 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
974 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
975 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
976 ; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0
977 ; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
978 ; AVX2-NEXT: vzeroupper
981 ; AVX512-LABEL: test14:
982 ; AVX512: # %bb.0: # %vector.ph
983 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
984 ; AVX512-NEXT: vpcmpnltud %zmm2, %zmm1, %k1
985 ; AVX512-NEXT: vpmovdb %zmm1, %xmm1
986 ; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1} {z}
987 ; AVX512-NEXT: vzeroupper
990 %rhs = zext <16 x i8> %x to <16 x i32>
991 %cond = icmp ult <16 x i32> %y, %rhs
992 %sub = sub <16 x i32> %y, %rhs
993 %truncsub = trunc <16 x i32> %sub to <16 x i8>
994 %res = select <16 x i1> %cond, <16 x i8> zeroinitializer, <16 x i8> %truncsub
998 define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
999 ; SSE2-LABEL: test15:
1000 ; SSE2: # %bb.0: # %vector.ph
1001 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
1002 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1003 ; SSE2-NEXT: pxor %xmm3, %xmm4
1004 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
1005 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1006 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
1007 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
1008 ; SSE2-NEXT: pand %xmm6, %xmm2
1009 ; SSE2-NEXT: pxor %xmm4, %xmm6
1010 ; SSE2-NEXT: por %xmm2, %xmm6
1011 ; SSE2-NEXT: pslld $16, %xmm6
1012 ; SSE2-NEXT: psrad $16, %xmm6
1013 ; SSE2-NEXT: pxor %xmm1, %xmm3
1014 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
1015 ; SSE2-NEXT: pxor %xmm5, %xmm4
1016 ; SSE2-NEXT: pand %xmm1, %xmm5
1017 ; SSE2-NEXT: por %xmm4, %xmm5
1018 ; SSE2-NEXT: pslld $16, %xmm5
1019 ; SSE2-NEXT: psrad $16, %xmm5
1020 ; SSE2-NEXT: packssdw %xmm6, %xmm5
1021 ; SSE2-NEXT: psubusw %xmm5, %xmm0
1024 ; SSSE3-LABEL: test15:
1025 ; SSSE3: # %bb.0: # %vector.ph
1026 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
1027 ; SSSE3-NEXT: movdqa %xmm2, %xmm4
1028 ; SSSE3-NEXT: pxor %xmm3, %xmm4
1029 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
1030 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
1031 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
1032 ; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
1033 ; SSSE3-NEXT: pand %xmm6, %xmm2
1034 ; SSSE3-NEXT: pxor %xmm4, %xmm6
1035 ; SSSE3-NEXT: por %xmm2, %xmm6
1036 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1037 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
1038 ; SSSE3-NEXT: pxor %xmm1, %xmm3
1039 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
1040 ; SSSE3-NEXT: pxor %xmm5, %xmm4
1041 ; SSSE3-NEXT: pand %xmm1, %xmm5
1042 ; SSSE3-NEXT: por %xmm4, %xmm5
1043 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
1044 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
1045 ; SSSE3-NEXT: psubusw %xmm5, %xmm0
1048 ; SSE41-LABEL: test15:
1049 ; SSE41: # %bb.0: # %vector.ph
1050 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
1051 ; SSE41-NEXT: pminud %xmm3, %xmm2
1052 ; SSE41-NEXT: pminud %xmm3, %xmm1
1053 ; SSE41-NEXT: packusdw %xmm2, %xmm1
1054 ; SSE41-NEXT: psubusw %xmm1, %xmm0
1057 ; AVX1-LABEL: test15:
1058 ; AVX1: # %bb.0: # %vector.ph
1059 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1060 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
1061 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
1062 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
1063 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1064 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1065 ; AVX1-NEXT: vzeroupper
1068 ; AVX2-LABEL: test15:
1069 ; AVX2: # %bb.0: # %vector.ph
1070 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
1071 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
1072 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1073 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1074 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1075 ; AVX2-NEXT: vzeroupper
1078 ; AVX512-LABEL: test15:
1079 ; AVX512: # %bb.0: # %vector.ph
1080 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
1081 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1082 ; AVX512-NEXT: vzeroupper
1085 %lhs = zext <8 x i16> %x to <8 x i32>
1086 %cond = icmp ugt <8 x i32> %lhs, %y
1087 %sub = sub <8 x i32> %lhs, %y
1088 %truncsub = trunc <8 x i32> %sub to <8 x i16>
1089 %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
1093 ; FIXME: match this to UMIN+TRUNC+PSUBUS
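; (Same goal as the test14 FIXME above; test15 shows the desired UMIN+TRUNC+PSUBUS
; shape for this element type.)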
1094 define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
1095 ; SSE2-LABEL: test16:
1096 ; SSE2: # %bb.0: # %vector.ph
1097 ; SSE2-NEXT: pxor %xmm3, %xmm3
1098 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1099 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1100 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1101 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
1102 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
1103 ; SSE2-NEXT: movdqa %xmm2, %xmm6
1104 ; SSE2-NEXT: pxor %xmm3, %xmm6
1105 ; SSE2-NEXT: por %xmm3, %xmm5
1106 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
1107 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1108 ; SSE2-NEXT: pxor %xmm3, %xmm6
1109 ; SSE2-NEXT: por %xmm3, %xmm4
1110 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
1111 ; SSE2-NEXT: packssdw %xmm5, %xmm4
1112 ; SSE2-NEXT: pslld $16, %xmm2
1113 ; SSE2-NEXT: psrad $16, %xmm2
1114 ; SSE2-NEXT: pslld $16, %xmm1
1115 ; SSE2-NEXT: psrad $16, %xmm1
1116 ; SSE2-NEXT: packssdw %xmm2, %xmm1
1117 ; SSE2-NEXT: psubw %xmm1, %xmm0
1118 ; SSE2-NEXT: pand %xmm4, %xmm0
1121 ; SSSE3-LABEL: test16:
1122 ; SSSE3: # %bb.0: # %vector.ph
1123 ; SSSE3-NEXT: pxor %xmm3, %xmm3
1124 ; SSSE3-NEXT: movdqa %xmm0, %xmm4
1125 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1126 ; SSSE3-NEXT: movdqa %xmm0, %xmm5
1127 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
1128 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
1129 ; SSSE3-NEXT: movdqa %xmm2, %xmm6
1130 ; SSSE3-NEXT: pxor %xmm3, %xmm6
1131 ; SSSE3-NEXT: por %xmm3, %xmm5
1132 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
1133 ; SSSE3-NEXT: movdqa %xmm1, %xmm6
1134 ; SSSE3-NEXT: pxor %xmm3, %xmm6
1135 ; SSSE3-NEXT: por %xmm3, %xmm4
1136 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
1137 ; SSSE3-NEXT: packssdw %xmm5, %xmm4
1138 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1139 ; SSSE3-NEXT: pshufb %xmm3, %xmm2
1140 ; SSSE3-NEXT: pshufb %xmm3, %xmm1
1141 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1142 ; SSSE3-NEXT: psubw %xmm1, %xmm0
1143 ; SSSE3-NEXT: pand %xmm4, %xmm0
1146 ; SSE41-LABEL: test16:
1147 ; SSE41: # %bb.0: # %vector.ph
1148 ; SSE41-NEXT: pxor %xmm4, %xmm4
1149 ; SSE41-NEXT: movdqa %xmm0, %xmm5
1150 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
1151 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1152 ; SSE41-NEXT: pmaxud %xmm2, %xmm5
1153 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
1154 ; SSE41-NEXT: pmaxud %xmm1, %xmm3
1155 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm3
1156 ; SSE41-NEXT: packssdw %xmm5, %xmm3
1157 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
1158 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
1159 ; SSE41-NEXT: packusdw %xmm2, %xmm1
1160 ; SSE41-NEXT: psubw %xmm1, %xmm0
1161 ; SSE41-NEXT: pandn %xmm0, %xmm3
1162 ; SSE41-NEXT: movdqa %xmm3, %xmm0
1165 ; AVX1-LABEL: test16:
1166 ; AVX1: # %bb.0: # %vector.ph
1167 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1168 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1169 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1170 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1171 ; AVX1-NEXT: vpmaxud %xmm2, %xmm4, %xmm2
1172 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm4, %xmm2
1173 ; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm3
1174 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3
1175 ; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2
1176 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1177 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1178 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1179 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
1180 ; AVX1-NEXT: vpandn %xmm0, %xmm2, %xmm0
1181 ; AVX1-NEXT: vzeroupper
1184 ; AVX2-LABEL: test16:
1185 ; AVX2: # %bb.0: # %vector.ph
1186 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1187 ; AVX2-NEXT: vpmaxud %ymm2, %ymm1, %ymm2
1188 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm2
1189 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
1190 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
1191 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
1192 ; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
1193 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1194 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1195 ; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
1196 ; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
1197 ; AVX2-NEXT: vzeroupper
1200 ; AVX512-LABEL: test16:
1201 ; AVX512: # %bb.0: # %vector.ph
1202 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1203 ; AVX512-NEXT: vpcmpltud %ymm2, %ymm1, %k1
1204 ; AVX512-NEXT: vpmovdw %ymm1, %xmm1
1205 ; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z}
1206 ; AVX512-NEXT: vzeroupper
1209 %lhs = zext <8 x i16> %x to <8 x i32>
1210 %cond = icmp ult <8 x i32> %y, %lhs
1211 %sub = sub <8 x i32> %lhs, %y
1212 %truncsub = trunc <8 x i32> %sub to <8 x i16>
1213 %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
1217 define <64 x i8> @test17(<64 x i8> %x, i8 zeroext %w) nounwind {
1218 ; SSE2-LABEL: test17:
1219 ; SSE2: # %bb.0: # %vector.ph
1220 ; SSE2-NEXT: movd %edi, %xmm4
1221 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1222 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
1223 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
1224 ; SSE2-NEXT: psubusb %xmm4, %xmm0
1225 ; SSE2-NEXT: psubusb %xmm4, %xmm1
1226 ; SSE2-NEXT: psubusb %xmm4, %xmm2
1227 ; SSE2-NEXT: psubusb %xmm4, %xmm3
1230 ; SSSE3-LABEL: test17:
1231 ; SSSE3: # %bb.0: # %vector.ph
1232 ; SSSE3-NEXT: movd %edi, %xmm4
1233 ; SSSE3-NEXT: pxor %xmm5, %xmm5
1234 ; SSSE3-NEXT: pshufb %xmm5, %xmm4
1235 ; SSSE3-NEXT: psubusb %xmm4, %xmm0
1236 ; SSSE3-NEXT: psubusb %xmm4, %xmm1
1237 ; SSSE3-NEXT: psubusb %xmm4, %xmm2
1238 ; SSSE3-NEXT: psubusb %xmm4, %xmm3
1241 ; SSE41-LABEL: test17:
1242 ; SSE41: # %bb.0: # %vector.ph
1243 ; SSE41-NEXT: movd %edi, %xmm4
1244 ; SSE41-NEXT: pxor %xmm5, %xmm5
1245 ; SSE41-NEXT: pshufb %xmm5, %xmm4
1246 ; SSE41-NEXT: psubusb %xmm4, %xmm0
1247 ; SSE41-NEXT: psubusb %xmm4, %xmm1
1248 ; SSE41-NEXT: psubusb %xmm4, %xmm2
1249 ; SSE41-NEXT: psubusb %xmm4, %xmm3
1252 ; AVX1-LABEL: test17:
1253 ; AVX1: # %bb.0: # %vector.ph
1254 ; AVX1-NEXT: vmovd %edi, %xmm2
1255 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1256 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1257 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1258 ; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm3
1259 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
1260 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
1261 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1262 ; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm3
1263 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
1264 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1267 ; AVX2-LABEL: test17:
1268 ; AVX2: # %bb.0: # %vector.ph
1269 ; AVX2-NEXT: vmovd %edi, %xmm2
1270 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
1271 ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
1272 ; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1
1275 ; AVX512-LABEL: test17:
1276 ; AVX512: # %bb.0: # %vector.ph
1277 ; AVX512-NEXT: vpbroadcastb %edi, %zmm1
1278 ; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0
1281 %0 = insertelement <64 x i8> undef, i8 %w, i32 0
1282 %broadcast15 = shufflevector <64 x i8> %0, <64 x i8> undef, <64 x i32> zeroinitializer
1283 %1 = icmp ult <64 x i8> %x, %broadcast15
1284 %2 = sub <64 x i8> %x, %broadcast15
1285 %res = select <64 x i1> %1, <64 x i8> zeroinitializer, <64 x i8> %2
1289 define <32 x i16> @test18(<32 x i16> %x, i16 zeroext %w) nounwind {
1290 ; SSE-LABEL: test18:
1291 ; SSE: # %bb.0: # %vector.ph
1292 ; SSE-NEXT: movd %edi, %xmm4
1293 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
1294 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
1295 ; SSE-NEXT: psubusw %xmm4, %xmm0
1296 ; SSE-NEXT: psubusw %xmm4, %xmm1
1297 ; SSE-NEXT: psubusw %xmm4, %xmm2
1298 ; SSE-NEXT: psubusw %xmm4, %xmm3
1301 ; AVX1-LABEL: test18:
1302 ; AVX1: # %bb.0: # %vector.ph
1303 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1304 ; AVX1-NEXT: vmovd %edi, %xmm3
1305 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
1306 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
1307 ; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2
1308 ; AVX1-NEXT: vpsubusw %xmm3, %xmm0, %xmm0
1309 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1310 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1311 ; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2
1312 ; AVX1-NEXT: vpsubusw %xmm3, %xmm1, %xmm1
1313 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1316 ; AVX2-LABEL: test18:
1317 ; AVX2: # %bb.0: # %vector.ph
1318 ; AVX2-NEXT: vmovd %edi, %xmm2
1319 ; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
1320 ; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
1321 ; AVX2-NEXT: vpsubusw %ymm2, %ymm1, %ymm1
1324 ; AVX512-LABEL: test18:
1325 ; AVX512: # %bb.0: # %vector.ph
1326 ; AVX512-NEXT: vpbroadcastw %edi, %zmm1
1327 ; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
1330 %0 = insertelement <32 x i16> undef, i16 %w, i32 0
1331 %broadcast15 = shufflevector <32 x i16> %0, <32 x i16> undef, <32 x i32> zeroinitializer
1332 %1 = icmp ult <32 x i16> %x, %broadcast15
1333 %2 = sub <32 x i16> %x, %broadcast15
1334 %res = select <32 x i1> %1, <32 x i16> zeroinitializer, <32 x i16> %2
1338 define <8 x i16> @psubus_8i16_max(<8 x i16> %x, <8 x i16> %y) nounwind {
1339 ; SSE-LABEL: psubus_8i16_max:
1340 ; SSE: # %bb.0: # %vector.ph
1341 ; SSE-NEXT: psubusw %xmm1, %xmm0
1344 ; AVX-LABEL: psubus_8i16_max:
1345 ; AVX: # %bb.0: # %vector.ph
1346 ; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1349 %cmp = icmp ult <8 x i16> %x, %y
1350 %max = select <8 x i1> %cmp, <8 x i16> %y, <8 x i16> %x
1351 %res = sub <8 x i16> %max, %y
1355 define <16 x i8> @psubus_16i8_max(<16 x i8> %x, <16 x i8> %y) nounwind {
1356 ; SSE-LABEL: psubus_16i8_max:
1357 ; SSE: # %bb.0: # %vector.ph
1358 ; SSE-NEXT: psubusb %xmm1, %xmm0
1361 ; AVX-LABEL: psubus_16i8_max:
1362 ; AVX: # %bb.0: # %vector.ph
1363 ; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
1366 %cmp = icmp ult <16 x i8> %x, %y
1367 %max = select <16 x i1> %cmp, <16 x i8> %y, <16 x i8> %x
1368 %res = sub <16 x i8> %max, %y
1372 define <16 x i16> @psubus_16i16_max(<16 x i16> %x, <16 x i16> %y) nounwind {
1373 ; SSE-LABEL: psubus_16i16_max:
1374 ; SSE: # %bb.0: # %vector.ph
1375 ; SSE-NEXT: psubusw %xmm2, %xmm0
1376 ; SSE-NEXT: psubusw %xmm3, %xmm1
1379 ; AVX1-LABEL: psubus_16i16_max:
1380 ; AVX1: # %bb.0: # %vector.ph
1381 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1382 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1383 ; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2
1384 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1385 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1388 ; AVX2-LABEL: psubus_16i16_max:
1389 ; AVX2: # %bb.0: # %vector.ph
1390 ; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
1393 ; AVX512-LABEL: psubus_16i16_max:
1394 ; AVX512: # %bb.0: # %vector.ph
1395 ; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
1398 %cmp = icmp ult <16 x i16> %x, %y
1399 %max = select <16 x i1> %cmp, <16 x i16> %y, <16 x i16> %x
1400 %res = sub <16 x i16> %max, %y
1404 define <32 x i16> @psubus_32i16_max(<32 x i16> %x, <32 x i16> %y) nounwind {
1405 ; SSE-LABEL: psubus_32i16_max:
1406 ; SSE: # %bb.0: # %vector.ph
1407 ; SSE-NEXT: psubusw %xmm4, %xmm0
1408 ; SSE-NEXT: psubusw %xmm5, %xmm1
1409 ; SSE-NEXT: psubusw %xmm6, %xmm2
1410 ; SSE-NEXT: psubusw %xmm7, %xmm3
1413 ; AVX1-LABEL: psubus_32i16_max:
1414 ; AVX1: # %bb.0: # %vector.ph
1415 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
1416 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
1417 ; AVX1-NEXT: vpsubusw %xmm4, %xmm5, %xmm4
1418 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
1419 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1420 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
1421 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1422 ; AVX1-NEXT: vpsubusw %xmm2, %xmm4, %xmm2
1423 ; AVX1-NEXT: vpsubusw %xmm3, %xmm1, %xmm1
1424 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1427 ; AVX2-LABEL: psubus_32i16_max:
1428 ; AVX2: # %bb.0: # %vector.ph
1429 ; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
1430 ; AVX2-NEXT: vpsubusw %ymm3, %ymm1, %ymm1
1433 ; AVX512-LABEL: psubus_32i16_max:
1434 ; AVX512: # %bb.0: # %vector.ph
1435 ; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
1438 %cmp = icmp ult <32 x i16> %x, %y
1439 %max = select <32 x i1> %cmp, <32 x i16> %y, <32 x i16> %x
1440 %res = sub <32 x i16> %max, %y
1444 define <64 x i8> @psubus_64i8_max(<64 x i8> %x, <64 x i8> %y) nounwind {
1445 ; SSE-LABEL: psubus_64i8_max:
1446 ; SSE: # %bb.0: # %vector.ph
1447 ; SSE-NEXT: psubusb %xmm4, %xmm0
1448 ; SSE-NEXT: psubusb %xmm5, %xmm1
1449 ; SSE-NEXT: psubusb %xmm6, %xmm2
1450 ; SSE-NEXT: psubusb %xmm7, %xmm3
1453 ; AVX1-LABEL: psubus_64i8_max:
1454 ; AVX1: # %bb.0: # %vector.ph
1455 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
1456 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
1457 ; AVX1-NEXT: vpsubusb %xmm4, %xmm5, %xmm4
1458 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
1459 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1460 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
1461 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1462 ; AVX1-NEXT: vpsubusb %xmm2, %xmm4, %xmm2
1463 ; AVX1-NEXT: vpsubusb %xmm3, %xmm1, %xmm1
1464 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1467 ; AVX2-LABEL: psubus_64i8_max:
1468 ; AVX2: # %bb.0: # %vector.ph
1469 ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
1470 ; AVX2-NEXT: vpsubusb %ymm3, %ymm1, %ymm1
1473 ; AVX512-LABEL: psubus_64i8_max:
1474 ; AVX512: # %bb.0: # %vector.ph
1475 ; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0
1478 %cmp = icmp ult <64 x i8> %x, %y
1479 %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
1480 %res = sub <64 x i8> %max, %y
1484 define <32 x i8> @psubus_32i8_max(<32 x i8> %x, <32 x i8> %y) nounwind {
1485 ; SSE-LABEL: psubus_32i8_max:
1486 ; SSE: # %bb.0: # %vector.ph
1487 ; SSE-NEXT: psubusb %xmm2, %xmm0
1488 ; SSE-NEXT: psubusb %xmm3, %xmm1
1491 ; AVX1-LABEL: psubus_32i8_max:
1492 ; AVX1: # %bb.0: # %vector.ph
1493 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1494 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1495 ; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm2
1496 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
1497 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1500 ; AVX2-LABEL: psubus_32i8_max:
1501 ; AVX2: # %bb.0: # %vector.ph
1502 ; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
1505 ; AVX512-LABEL: psubus_32i8_max:
1506 ; AVX512: # %bb.0: # %vector.ph
1507 ; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
1510 %cmp = icmp ult <32 x i8> %x, %y
1511 %max = select <32 x i1> %cmp, <32 x i8> %y, <32 x i8> %x
1512 %res = sub <32 x i8> %max, %y
1513 ret <32 x i8> %res
1514 }
1516 define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
1517 ; SSE2-LABEL: psubus_8i32_max:
1518 ; SSE2: # %bb.0: # %vector.ph
1519 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
1520 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1521 ; SSE2-NEXT: pxor %xmm3, %xmm4
1522 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
1523 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1524 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
1525 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
1526 ; SSE2-NEXT: pand %xmm6, %xmm2
1527 ; SSE2-NEXT: pxor %xmm4, %xmm6
1528 ; SSE2-NEXT: por %xmm2, %xmm6
1529 ; SSE2-NEXT: pslld $16, %xmm6
1530 ; SSE2-NEXT: psrad $16, %xmm6
1531 ; SSE2-NEXT: pxor %xmm1, %xmm3
1532 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
1533 ; SSE2-NEXT: pxor %xmm5, %xmm4
1534 ; SSE2-NEXT: pand %xmm1, %xmm5
1535 ; SSE2-NEXT: por %xmm4, %xmm5
1536 ; SSE2-NEXT: pslld $16, %xmm5
1537 ; SSE2-NEXT: psrad $16, %xmm5
1538 ; SSE2-NEXT: packssdw %xmm6, %xmm5
1539 ; SSE2-NEXT: psubusw %xmm5, %xmm0
1542 ; SSSE3-LABEL: psubus_8i32_max:
1543 ; SSSE3: # %bb.0: # %vector.ph
1544 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
1545 ; SSSE3-NEXT: movdqa %xmm2, %xmm4
1546 ; SSSE3-NEXT: pxor %xmm3, %xmm4
1547 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
1548 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
1549 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
1550 ; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
1551 ; SSSE3-NEXT: pand %xmm6, %xmm2
1552 ; SSSE3-NEXT: pxor %xmm4, %xmm6
1553 ; SSSE3-NEXT: por %xmm2, %xmm6
1554 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1555 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
1556 ; SSSE3-NEXT: pxor %xmm1, %xmm3
1557 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
1558 ; SSSE3-NEXT: pxor %xmm5, %xmm4
1559 ; SSSE3-NEXT: pand %xmm1, %xmm5
1560 ; SSSE3-NEXT: por %xmm4, %xmm5
1561 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
1562 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
1563 ; SSSE3-NEXT: psubusw %xmm5, %xmm0
1566 ; SSE41-LABEL: psubus_8i32_max:
1567 ; SSE41: # %bb.0: # %vector.ph
1568 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
1569 ; SSE41-NEXT: pminud %xmm3, %xmm2
1570 ; SSE41-NEXT: pminud %xmm3, %xmm1
1571 ; SSE41-NEXT: packusdw %xmm2, %xmm1
1572 ; SSE41-NEXT: psubusw %xmm1, %xmm0
1575 ; AVX1-LABEL: psubus_8i32_max:
1576 ; AVX1: # %bb.0: # %vector.ph
1577 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1578 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
1579 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
1580 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
1581 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1582 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1583 ; AVX1-NEXT: vzeroupper
1586 ; AVX2-LABEL: psubus_8i32_max:
1587 ; AVX2: # %bb.0: # %vector.ph
1588 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
1589 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
1590 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1591 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1592 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1593 ; AVX2-NEXT: vzeroupper
1596 ; AVX512-LABEL: psubus_8i32_max:
1597 ; AVX512: # %bb.0: # %vector.ph
1598 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
1599 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1600 ; AVX512-NEXT: vzeroupper
1603 %lhs = zext <8 x i16> %x to <8 x i32>
1604 %cond = icmp ult <8 x i32> %lhs, %y
1605 %max = select <8 x i1> %cond, <8 x i32> %y, <8 x i32> %lhs
1606 %sub = sub <8 x i32> %max, %y
1607 %res = trunc <8 x i32> %sub to <8 x i16>
1608 ret <8 x i16> %res
1609 }
1611 define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
1612 ; SSE2OR3-LABEL: psubus_8i64_max:
1613 ; SSE2OR3: # %bb.0: # %vector.ph
1614 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
1615 ; SSE2OR3-NEXT: movdqa %xmm4, %xmm7
1616 ; SSE2OR3-NEXT: pxor %xmm5, %xmm7
1617 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
1618 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
1619 ; SSE2OR3-NEXT: movdqa %xmm6, %xmm9
1620 ; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9
1621 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
1622 ; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8
1623 ; SSE2OR3-NEXT: pand %xmm9, %xmm8
1624 ; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm7
1625 ; SSE2OR3-NEXT: pand %xmm8, %xmm4
1626 ; SSE2OR3-NEXT: pxor %xmm7, %xmm8
1627 ; SSE2OR3-NEXT: por %xmm4, %xmm8
1628 ; SSE2OR3-NEXT: movdqa %xmm3, %xmm4
1629 ; SSE2OR3-NEXT: pxor %xmm5, %xmm4
1630 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2]
1631 ; SSE2OR3-NEXT: movdqa %xmm6, %xmm10
1632 ; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10
1633 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1634 ; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm4
1635 ; SSE2OR3-NEXT: pand %xmm10, %xmm4
1636 ; SSE2OR3-NEXT: pand %xmm4, %xmm3
1637 ; SSE2OR3-NEXT: pxor %xmm7, %xmm4
1638 ; SSE2OR3-NEXT: por %xmm3, %xmm4
1639 ; SSE2OR3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm8[0,2]
1640 ; SSE2OR3-NEXT: pslld $16, %xmm4
1641 ; SSE2OR3-NEXT: psrad $16, %xmm4
1642 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm3
1643 ; SSE2OR3-NEXT: pxor %xmm5, %xmm3
1644 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
1645 ; SSE2OR3-NEXT: movdqa %xmm6, %xmm9
1646 ; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9
1647 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1648 ; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm3
1649 ; SSE2OR3-NEXT: pand %xmm9, %xmm3
1650 ; SSE2OR3-NEXT: pand %xmm3, %xmm2
1651 ; SSE2OR3-NEXT: pxor %xmm7, %xmm3
1652 ; SSE2OR3-NEXT: por %xmm2, %xmm3
1653 ; SSE2OR3-NEXT: movdqa %xmm1, %xmm2
1654 ; SSE2OR3-NEXT: pxor %xmm5, %xmm2
1655 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2]
1656 ; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm6
1657 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1658 ; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm2
1659 ; SSE2OR3-NEXT: pand %xmm6, %xmm2
1660 ; SSE2OR3-NEXT: pxor %xmm2, %xmm7
1661 ; SSE2OR3-NEXT: pand %xmm1, %xmm2
1662 ; SSE2OR3-NEXT: por %xmm7, %xmm2
1663 ; SSE2OR3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
1664 ; SSE2OR3-NEXT: pslld $16, %xmm2
1665 ; SSE2OR3-NEXT: psrad $16, %xmm2
1666 ; SSE2OR3-NEXT: packssdw %xmm4, %xmm2
1667 ; SSE2OR3-NEXT: psubusw %xmm2, %xmm0
1668 ; SSE2OR3-NEXT: retq
1670 ; SSE41-LABEL: psubus_8i64_max:
1671 ; SSE41: # %bb.0: # %vector.ph
1672 ; SSE41-NEXT: movdqa %xmm0, %xmm5
1673 ; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
1674 ; SSE41-NEXT: movdqa %xmm4, %xmm0
1675 ; SSE41-NEXT: pxor %xmm9, %xmm0
1676 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991]
1677 ; SSE41-NEXT: movdqa %xmm7, %xmm8
1678 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
1679 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
1680 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
1681 ; SSE41-NEXT: movdqa %xmm6, %xmm0
1682 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
1683 ; SSE41-NEXT: pand %xmm8, %xmm0
1684 ; SSE41-NEXT: movapd {{.*#+}} xmm8 = [65535,65535]
1685 ; SSE41-NEXT: movapd %xmm8, %xmm10
1686 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm10
1687 ; SSE41-NEXT: movdqa %xmm3, %xmm0
1688 ; SSE41-NEXT: pxor %xmm9, %xmm0
1689 ; SSE41-NEXT: movdqa %xmm7, %xmm4
1690 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
1691 ; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
1692 ; SSE41-NEXT: movdqa %xmm6, %xmm0
1693 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
1694 ; SSE41-NEXT: pand %xmm4, %xmm0
1695 ; SSE41-NEXT: movapd %xmm8, %xmm4
1696 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
1697 ; SSE41-NEXT: packusdw %xmm10, %xmm4
1698 ; SSE41-NEXT: movdqa %xmm2, %xmm0
1699 ; SSE41-NEXT: pxor %xmm9, %xmm0
1700 ; SSE41-NEXT: movdqa %xmm7, %xmm3
1701 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
1702 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
1703 ; SSE41-NEXT: movdqa %xmm6, %xmm0
1704 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0
1705 ; SSE41-NEXT: pand %xmm3, %xmm0
1706 ; SSE41-NEXT: movapd %xmm8, %xmm3
1707 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
1708 ; SSE41-NEXT: pxor %xmm1, %xmm9
1709 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm7
1710 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
1711 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
1712 ; SSE41-NEXT: pand %xmm7, %xmm6
1713 ; SSE41-NEXT: movdqa %xmm6, %xmm0
1714 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8
1715 ; SSE41-NEXT: packusdw %xmm3, %xmm8
1716 ; SSE41-NEXT: packusdw %xmm4, %xmm8
1717 ; SSE41-NEXT: psubusw %xmm8, %xmm5
1718 ; SSE41-NEXT: movdqa %xmm5, %xmm0
1721 ; AVX1-LABEL: psubus_8i64_max:
1722 ; AVX1: # %bb.0: # %vector.ph
1723 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1724 ; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
1725 ; AVX1-NEXT: # xmm4 = mem[0,0]
1726 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
1727 ; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343]
1728 ; AVX1-NEXT: # xmm6 = mem[0,0]
1729 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
1730 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535]
1731 ; AVX1-NEXT: # xmm7 = mem[0,0]
1732 ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
1733 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
1734 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
1735 ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2
1736 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1737 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1738 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
1739 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
1740 ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
1741 ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4
1742 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
1743 ; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm7, %xmm1
1744 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1745 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1746 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1747 ; AVX1-NEXT: vzeroupper
1750 ; AVX2-LABEL: psubus_8i64_max:
1751 ; AVX2: # %bb.0: # %vector.ph
1752 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
1753 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm4
1754 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343]
1755 ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
1756 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [65535,65535,65535,65535]
1757 ; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
1758 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3
1759 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
1760 ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
1761 ; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
1762 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1763 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1764 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
1765 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1766 ; AVX2-NEXT: vzeroupper
1769 ; AVX512-LABEL: psubus_8i64_max:
1770 ; AVX512: # %bb.0: # %vector.ph
1771 ; AVX512-NEXT: vpmovusqw %zmm1, %xmm1
1772 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1773 ; AVX512-NEXT: vzeroupper
1776 %lhs = zext <8 x i16> %x to <8 x i64>
1777 %cond = icmp ult <8 x i64> %lhs, %y
1778 %max = select <8 x i1> %cond, <8 x i64> %y, <8 x i64> %lhs
1779 %sub = sub <8 x i64> %max, %y
1780 %res = trunc <8 x i64> %sub to <8 x i16>
1781 ret <8 x i16> %res
1782 }
1784 define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
1785 ; SSE2-LABEL: psubus_16i32_max:
1786 ; SSE2: # %bb.0: # %vector.ph
1787 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
1788 ; SSE2-NEXT: movdqa %xmm3, %xmm8
1789 ; SSE2-NEXT: pxor %xmm7, %xmm8
1790 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
1791 ; SSE2-NEXT: movdqa %xmm6, %xmm9
1792 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
1793 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm8
1794 ; SSE2-NEXT: pand %xmm9, %xmm3
1795 ; SSE2-NEXT: pxor %xmm8, %xmm9
1796 ; SSE2-NEXT: por %xmm3, %xmm9
1797 ; SSE2-NEXT: pslld $16, %xmm9
1798 ; SSE2-NEXT: psrad $16, %xmm9
1799 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1800 ; SSE2-NEXT: pxor %xmm7, %xmm3
1801 ; SSE2-NEXT: movdqa %xmm6, %xmm10
1802 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm10
1803 ; SSE2-NEXT: pand %xmm10, %xmm2
1804 ; SSE2-NEXT: pxor %xmm8, %xmm10
1805 ; SSE2-NEXT: por %xmm2, %xmm10
1806 ; SSE2-NEXT: pslld $16, %xmm10
1807 ; SSE2-NEXT: psrad $16, %xmm10
1808 ; SSE2-NEXT: packssdw %xmm9, %xmm10
1809 ; SSE2-NEXT: psubusw %xmm10, %xmm0
1810 ; SSE2-NEXT: movdqa %xmm5, %xmm2
1811 ; SSE2-NEXT: pxor %xmm7, %xmm2
1812 ; SSE2-NEXT: movdqa %xmm6, %xmm3
1813 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
1814 ; SSE2-NEXT: pand %xmm3, %xmm5
1815 ; SSE2-NEXT: pxor %xmm8, %xmm3
1816 ; SSE2-NEXT: por %xmm5, %xmm3
1817 ; SSE2-NEXT: pslld $16, %xmm3
1818 ; SSE2-NEXT: psrad $16, %xmm3
1819 ; SSE2-NEXT: pxor %xmm4, %xmm7
1820 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
1821 ; SSE2-NEXT: pxor %xmm6, %xmm8
1822 ; SSE2-NEXT: pand %xmm4, %xmm6
1823 ; SSE2-NEXT: por %xmm8, %xmm6
1824 ; SSE2-NEXT: pslld $16, %xmm6
1825 ; SSE2-NEXT: psrad $16, %xmm6
1826 ; SSE2-NEXT: packssdw %xmm3, %xmm6
1827 ; SSE2-NEXT: psubusw %xmm6, %xmm1
1830 ; SSSE3-LABEL: psubus_16i32_max:
1831 ; SSSE3: # %bb.0: # %vector.ph
1832 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
1833 ; SSSE3-NEXT: movdqa %xmm3, %xmm8
1834 ; SSSE3-NEXT: pxor %xmm7, %xmm8
1835 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
1836 ; SSSE3-NEXT: movdqa %xmm6, %xmm9
1837 ; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
1838 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8
1839 ; SSSE3-NEXT: pand %xmm9, %xmm3
1840 ; SSSE3-NEXT: pxor %xmm8, %xmm9
1841 ; SSSE3-NEXT: por %xmm3, %xmm9
1842 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1843 ; SSSE3-NEXT: pshufb %xmm3, %xmm9
1844 ; SSSE3-NEXT: movdqa %xmm2, %xmm10
1845 ; SSSE3-NEXT: pxor %xmm7, %xmm10
1846 ; SSSE3-NEXT: movdqa %xmm6, %xmm11
1847 ; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11
1848 ; SSSE3-NEXT: pand %xmm11, %xmm2
1849 ; SSSE3-NEXT: pxor %xmm8, %xmm11
1850 ; SSSE3-NEXT: por %xmm2, %xmm11
1851 ; SSSE3-NEXT: pshufb %xmm3, %xmm11
1852 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm9[0]
1853 ; SSSE3-NEXT: psubusw %xmm11, %xmm0
1854 ; SSSE3-NEXT: movdqa %xmm5, %xmm2
1855 ; SSSE3-NEXT: pxor %xmm7, %xmm2
1856 ; SSSE3-NEXT: movdqa %xmm6, %xmm9
1857 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9
1858 ; SSSE3-NEXT: pand %xmm9, %xmm5
1859 ; SSSE3-NEXT: pxor %xmm8, %xmm9
1860 ; SSSE3-NEXT: por %xmm5, %xmm9
1861 ; SSSE3-NEXT: pshufb %xmm3, %xmm9
1862 ; SSSE3-NEXT: pxor %xmm4, %xmm7
1863 ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6
1864 ; SSSE3-NEXT: pxor %xmm6, %xmm8
1865 ; SSSE3-NEXT: pand %xmm4, %xmm6
1866 ; SSSE3-NEXT: por %xmm8, %xmm6
1867 ; SSSE3-NEXT: pshufb %xmm3, %xmm6
1868 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm9[0]
1869 ; SSSE3-NEXT: psubusw %xmm6, %xmm1
1872 ; SSE41-LABEL: psubus_16i32_max:
1873 ; SSE41: # %bb.0: # %vector.ph
1874 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535]
1875 ; SSE41-NEXT: pminud %xmm6, %xmm3
1876 ; SSE41-NEXT: pminud %xmm6, %xmm2
1877 ; SSE41-NEXT: packusdw %xmm3, %xmm2
1878 ; SSE41-NEXT: psubusw %xmm2, %xmm0
1879 ; SSE41-NEXT: pminud %xmm6, %xmm5
1880 ; SSE41-NEXT: pminud %xmm6, %xmm4
1881 ; SSE41-NEXT: packusdw %xmm5, %xmm4
1882 ; SSE41-NEXT: psubusw %xmm4, %xmm1
1885 ; AVX1-LABEL: psubus_16i32_max:
1886 ; AVX1: # %bb.0: # %vector.ph
1887 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1888 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [65535,65535,65535,65535]
1889 ; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
1890 ; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2
1891 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1892 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1893 ; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2
1894 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1895 ; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
1896 ; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1
1897 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1898 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1899 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1902 ; AVX2-LABEL: psubus_16i32_max:
1903 ; AVX2: # %bb.0: # %vector.ph
1904 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535]
1905 ; AVX2-NEXT: vpminud %ymm3, %ymm2, %ymm2
1906 ; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
1907 ; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
1908 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
1909 ; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
1912 ; AVX512-LABEL: psubus_16i32_max:
1913 ; AVX512: # %bb.0: # %vector.ph
1914 ; AVX512-NEXT: vpmovusdw %zmm1, %ymm1
1915 ; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
1918 %lhs = zext <16 x i16> %x to <16 x i32>
1919 %cond = icmp ult <16 x i32> %lhs, %y
1920 %max = select <16 x i1> %cond, <16 x i32> %y, <16 x i32> %lhs
1921 %sub = sub <16 x i32> %max, %y
1922 %res = trunc <16 x i32> %sub to <16 x i16>
1923 ret <16 x i16> %res
1924 }
1926 define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwind {
1927 ; SSE2-LABEL: psubus_i16_i32_max_swapped:
1928 ; SSE2: # %bb.0: # %vector.ph
1929 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
1930 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1931 ; SSE2-NEXT: pxor %xmm3, %xmm4
1932 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
1933 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1934 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
1935 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
1936 ; SSE2-NEXT: pand %xmm6, %xmm2
1937 ; SSE2-NEXT: pxor %xmm4, %xmm6
1938 ; SSE2-NEXT: por %xmm2, %xmm6
1939 ; SSE2-NEXT: pslld $16, %xmm6
1940 ; SSE2-NEXT: psrad $16, %xmm6
1941 ; SSE2-NEXT: pxor %xmm1, %xmm3
1942 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
1943 ; SSE2-NEXT: pxor %xmm5, %xmm4
1944 ; SSE2-NEXT: pand %xmm1, %xmm5
1945 ; SSE2-NEXT: por %xmm4, %xmm5
1946 ; SSE2-NEXT: pslld $16, %xmm5
1947 ; SSE2-NEXT: psrad $16, %xmm5
1948 ; SSE2-NEXT: packssdw %xmm6, %xmm5
1949 ; SSE2-NEXT: psubusw %xmm5, %xmm0
1952 ; SSSE3-LABEL: psubus_i16_i32_max_swapped:
1953 ; SSSE3: # %bb.0: # %vector.ph
1954 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
1955 ; SSSE3-NEXT: movdqa %xmm2, %xmm4
1956 ; SSSE3-NEXT: pxor %xmm3, %xmm4
1957 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
1958 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
1959 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
1960 ; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
1961 ; SSSE3-NEXT: pand %xmm6, %xmm2
1962 ; SSSE3-NEXT: pxor %xmm4, %xmm6
1963 ; SSSE3-NEXT: por %xmm2, %xmm6
1964 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1965 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
1966 ; SSSE3-NEXT: pxor %xmm1, %xmm3
1967 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
1968 ; SSSE3-NEXT: pxor %xmm5, %xmm4
1969 ; SSSE3-NEXT: pand %xmm1, %xmm5
1970 ; SSSE3-NEXT: por %xmm4, %xmm5
1971 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
1972 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
1973 ; SSSE3-NEXT: psubusw %xmm5, %xmm0
1976 ; SSE41-LABEL: psubus_i16_i32_max_swapped:
1977 ; SSE41: # %bb.0: # %vector.ph
1978 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
1979 ; SSE41-NEXT: pminud %xmm3, %xmm2
1980 ; SSE41-NEXT: pminud %xmm3, %xmm1
1981 ; SSE41-NEXT: packusdw %xmm2, %xmm1
1982 ; SSE41-NEXT: psubusw %xmm1, %xmm0
1985 ; AVX1-LABEL: psubus_i16_i32_max_swapped:
1986 ; AVX1: # %bb.0: # %vector.ph
1987 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1988 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
1989 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
1990 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
1991 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1992 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1993 ; AVX1-NEXT: vzeroupper
1996 ; AVX2-LABEL: psubus_i16_i32_max_swapped:
1997 ; AVX2: # %bb.0: # %vector.ph
1998 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
1999 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
2000 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2001 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2002 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2003 ; AVX2-NEXT: vzeroupper
2006 ; AVX512-LABEL: psubus_i16_i32_max_swapped:
2007 ; AVX512: # %bb.0: # %vector.ph
2008 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
2009 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2010 ; AVX512-NEXT: vzeroupper
2013 %lhs = zext <8 x i16> %x to <8 x i32>
2014 %cond = icmp ult <8 x i32> %y, %lhs
2015 %max = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y
2016 %sub = sub <8 x i32> %max, %y
2017 %res = trunc <8 x i32> %sub to <8 x i16>
2018 ret <8 x i16> %res
2019 }
2021 define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind {
2022 ; SSE2-LABEL: psubus_i16_i32_min:
2023 ; SSE2: # %bb.0: # %vector.ph
2024 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
2025 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2026 ; SSE2-NEXT: pxor %xmm3, %xmm4
2027 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
2028 ; SSE2-NEXT: movdqa %xmm5, %xmm6
2029 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
2030 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
2031 ; SSE2-NEXT: pand %xmm6, %xmm2
2032 ; SSE2-NEXT: pxor %xmm4, %xmm6
2033 ; SSE2-NEXT: por %xmm2, %xmm6
2034 ; SSE2-NEXT: pslld $16, %xmm6
2035 ; SSE2-NEXT: psrad $16, %xmm6
2036 ; SSE2-NEXT: pxor %xmm1, %xmm3
2037 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
2038 ; SSE2-NEXT: pxor %xmm5, %xmm4
2039 ; SSE2-NEXT: pand %xmm1, %xmm5
2040 ; SSE2-NEXT: por %xmm4, %xmm5
2041 ; SSE2-NEXT: pslld $16, %xmm5
2042 ; SSE2-NEXT: psrad $16, %xmm5
2043 ; SSE2-NEXT: packssdw %xmm6, %xmm5
2044 ; SSE2-NEXT: psubusw %xmm5, %xmm0
2047 ; SSSE3-LABEL: psubus_i16_i32_min:
2048 ; SSSE3: # %bb.0: # %vector.ph
2049 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
2050 ; SSSE3-NEXT: movdqa %xmm2, %xmm4
2051 ; SSSE3-NEXT: pxor %xmm3, %xmm4
2052 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
2053 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
2054 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
2055 ; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
2056 ; SSSE3-NEXT: pand %xmm6, %xmm2
2057 ; SSSE3-NEXT: pxor %xmm4, %xmm6
2058 ; SSSE3-NEXT: por %xmm2, %xmm6
2059 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2060 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
2061 ; SSSE3-NEXT: pxor %xmm1, %xmm3
2062 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
2063 ; SSSE3-NEXT: pxor %xmm5, %xmm4
2064 ; SSSE3-NEXT: pand %xmm1, %xmm5
2065 ; SSSE3-NEXT: por %xmm4, %xmm5
2066 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
2067 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
2068 ; SSSE3-NEXT: psubusw %xmm5, %xmm0
2071 ; SSE41-LABEL: psubus_i16_i32_min:
2072 ; SSE41: # %bb.0: # %vector.ph
2073 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
2074 ; SSE41-NEXT: pminud %xmm3, %xmm2
2075 ; SSE41-NEXT: pminud %xmm3, %xmm1
2076 ; SSE41-NEXT: packusdw %xmm2, %xmm1
2077 ; SSE41-NEXT: psubusw %xmm1, %xmm0
2080 ; AVX1-LABEL: psubus_i16_i32_min:
2081 ; AVX1: # %bb.0: # %vector.ph
2082 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2083 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
2084 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
2085 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
2086 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2087 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2088 ; AVX1-NEXT: vzeroupper
2091 ; AVX2-LABEL: psubus_i16_i32_min:
2092 ; AVX2: # %bb.0: # %vector.ph
2093 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
2094 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
2095 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2096 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2097 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2098 ; AVX2-NEXT: vzeroupper
2101 ; AVX512-LABEL: psubus_i16_i32_min:
2102 ; AVX512: # %bb.0: # %vector.ph
2103 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
2104 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2105 ; AVX512-NEXT: vzeroupper
2108 %lhs = zext <8 x i16> %x to <8 x i32>
2109 %cond = icmp ult <8 x i32> %lhs, %y
2110 %min = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y
2111 %sub = sub <8 x i32> %lhs, %min
2112 %res = trunc <8 x i32> %sub to <8 x i16>
2113 ret <8 x i16> %res
2114 }
2116 define void @subus_v8i8(ptr %p1, ptr %p2) {
2117 ; SSE-LABEL: subus_v8i8:
2119 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2120 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
2121 ; SSE-NEXT: psubusb %xmm1, %xmm0
2122 ; SSE-NEXT: movq %xmm0, (%rdi)
2125 ; AVX-LABEL: subus_v8i8:
2127 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2128 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2129 ; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2130 ; AVX-NEXT: vmovq %xmm0, (%rdi)
2132 %ld1 = load <8 x i8>, ptr %p1, align 8
2133 %ld2 = load <8 x i8>, ptr %p2, align 8
2134 %1 = sub <8 x i8> %ld1, %ld2
2135 %2 = icmp ugt <8 x i8> %ld1, %ld2
2136 %sh3 = select <8 x i1> %2, <8 x i8> %1, <8 x i8> zeroinitializer
2137 store <8 x i8> %sh3, ptr %p1, align 8
2138 ret void
2139 }
2141 define void @subus_v4i8(ptr %p1, ptr %p2) {
2142 ; SSE-LABEL: subus_v4i8:
2144 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2145 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
2146 ; SSE-NEXT: psubusb %xmm1, %xmm0
2147 ; SSE-NEXT: movd %xmm0, (%rdi)
2150 ; AVX-LABEL: subus_v4i8:
2152 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2153 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2154 ; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2155 ; AVX-NEXT: vmovd %xmm0, (%rdi)
2157 %ld1 = load <4 x i8>, ptr %p1, align 8
2158 %ld2 = load <4 x i8>, ptr %p2, align 8
2159 %1 = sub <4 x i8> %ld1, %ld2
2160 %2 = icmp ugt <4 x i8> %ld1, %ld2
2161 %sh3 = select <4 x i1> %2, <4 x i8> %1, <4 x i8> zeroinitializer
2162 store <4 x i8> %sh3, ptr %p1, align 8
2163 ret void
2164 }
2166 define void @subus_v2i8(ptr %p1, ptr %p2) {
2167 ; SSE2OR3-LABEL: subus_v2i8:
2169 ; SSE2OR3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2170 ; SSE2OR3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
2171 ; SSE2OR3-NEXT: psubusb %xmm1, %xmm0
2172 ; SSE2OR3-NEXT: movd %xmm0, %eax
2173 ; SSE2OR3-NEXT: movw %ax, (%rdi)
2174 ; SSE2OR3-NEXT: retq
2176 ; SSE41-LABEL: subus_v2i8:
2178 ; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2179 ; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
2180 ; SSE41-NEXT: psubusb %xmm1, %xmm0
2181 ; SSE41-NEXT: pextrw $0, %xmm0, (%rdi)
2184 ; AVX-LABEL: subus_v2i8:
2186 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2187 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2188 ; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2189 ; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
2191 %ld1 = load <2 x i8>, ptr %p1, align 8
2192 %ld2 = load <2 x i8>, ptr %p2, align 8
2193 %1 = sub <2 x i8> %ld1, %ld2
2194 %2 = icmp ugt <2 x i8> %ld1, %ld2
2195 %sh3 = select <2 x i1> %2, <2 x i8> %1, <2 x i8> zeroinitializer
2196 store <2 x i8> %sh3, ptr %p1, align 8
2197 ret void
2198 }
2200 define void @subus_v4i16(ptr %p1, ptr %p2) {
2201 ; SSE-LABEL: subus_v4i16:
2203 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2204 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
2205 ; SSE-NEXT: psubusw %xmm1, %xmm0
2206 ; SSE-NEXT: movq %xmm0, (%rdi)
2209 ; AVX-LABEL: subus_v4i16:
2211 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2212 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2213 ; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2214 ; AVX-NEXT: vmovq %xmm0, (%rdi)
2216 %ld1 = load <4 x i16>, ptr %p1, align 8
2217 %ld2 = load <4 x i16>, ptr %p2, align 8
2218 %1 = sub <4 x i16> %ld1, %ld2
2219 %2 = icmp ugt <4 x i16> %ld1, %ld2
2220 %sh3 = select <4 x i1> %2, <4 x i16> %1, <4 x i16> zeroinitializer
2221 store <4 x i16> %sh3, ptr %p1, align 8
2222 ret void
2223 }
2225 define void @subus_v2i16(ptr %p1, ptr %p2) {
2226 ; SSE-LABEL: subus_v2i16:
2228 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2229 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
2230 ; SSE-NEXT: psubusw %xmm1, %xmm0
2231 ; SSE-NEXT: movd %xmm0, (%rdi)
2234 ; AVX-LABEL: subus_v2i16:
2236 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2237 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2238 ; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2239 ; AVX-NEXT: vmovd %xmm0, (%rdi)
2241 %ld1 = load <2 x i16>, ptr %p1, align 8
2242 %ld2 = load <2 x i16>, ptr %p2, align 8
2243 %1 = sub <2 x i16> %ld1, %ld2
2244 %2 = icmp ugt <2 x i16> %ld1, %ld2
2245 %sh3 = select <2 x i1> %2, <2 x i16> %1, <2 x i16> zeroinitializer
2246 store <2 x i16> %sh3, ptr %p1, align 8
2247 ret void
2248 }
2250 define <16 x i8> @test19(<16 x i8> %x) {
2251 ; SSE-LABEL: test19:
2252 ; SSE: # %bb.0: # %entry
2253 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2256 ; AVX-LABEL: test19:
2257 ; AVX: # %bb.0: # %entry
2258 ; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2260 entry:
2261 %0 = icmp ugt <16 x i8> %x, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
2262 %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
2263 %2 = add <16 x i8> %1, <i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70>
2264 ret <16 x i8> %2
2265 }
2267 define <16 x i8> @test20(<16 x i8> %x) {
2268 ; SSE-LABEL: test20:
2269 ; SSE: # %bb.0: # %entry
2270 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2273 ; AVX-LABEL: test20:
2274 ; AVX: # %bb.0: # %entry
2275 ; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2277 entry:
2278 %0 = icmp ugt <16 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70>
2279 %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70>
2280 %2 = add <16 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70>
2281 ret <16 x i8> %2
2282 }
2284 define <8 x i16> @test21(<8 x i16> %x) {
2285 ; SSE-LABEL: test21:
2286 ; SSE: # %bb.0: # %entry
2287 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2290 ; AVX-LABEL: test21:
2291 ; AVX: # %bb.0: # %entry
2292 ; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2294 entry:
2295 %0 = icmp ugt <8 x i16> %x, <i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700>
2296 %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700>
2297 %2 = add <8 x i16> %1, <i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700>
2298 ret <8 x i16> %2
2299 }
2301 define <8 x i16> @test22(<8 x i16> %x) {
2302 ; SSE-LABEL: test22:
2303 ; SSE: # %bb.0: # %entry
2304 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2307 ; AVX-LABEL: test22:
2308 ; AVX: # %bb.0: # %entry
2309 ; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2311 entry:
2312 %0 = icmp ugt <8 x i16> %x, <i16 1, i16 -22000, i16 -770, i16 98, i16 19, i16 1000, i16 3456, i16 70>
2313 %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 1, i16 -22000, i16 -770, i16 98, i16 19, i16 1000, i16 3456, i16 70>
2314 %2 = add <8 x i16> %1, <i16 -1, i16 22000, i16 770, i16 -98, i16 -19, i16 -1000, i16 -3456, i16 -70>
2315 ret <8 x i16> %2
2316 }
2318 define <32 x i8> @test23(<32 x i8> %x) {
2319 ; SSE-LABEL: test23:
2320 ; SSE: # %bb.0: # %entry
2321 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70]
2322 ; SSE-NEXT: psubusb %xmm2, %xmm0
2323 ; SSE-NEXT: psubusb %xmm2, %xmm1
2326 ; AVX1-LABEL: test23:
2327 ; AVX1: # %bb.0: # %entry
2328 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2329 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70]
2330 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
2331 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
2332 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2335 ; AVX2-LABEL: test23:
2336 ; AVX2: # %bb.0: # %entry
2337 ; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2340 ; AVX512-LABEL: test23:
2341 ; AVX512: # %bb.0: # %entry
2342 ; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2344 entry:
2345 %0 = icmp ugt <32 x i8> %x, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
2346 %1 = select <32 x i1> %0, <32 x i8> %x, <32 x i8> <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
2347 %2 = add <32 x i8> %1, <i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70>
2348 ret <32 x i8> %2
2349 }
2351 define <32 x i8> @test24(<32 x i8> %x) {
2352 ; SSE-LABEL: test24:
2353 ; SSE: # %bb.0: # %entry
2354 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2355 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2358 ; AVX1-LABEL: test24:
2359 ; AVX1: # %bb.0: # %entry
2360 ; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2361 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2362 ; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2363 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2366 ; AVX2-LABEL: test24:
2367 ; AVX2: # %bb.0: # %entry
2368 ; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2371 ; AVX512-LABEL: test24:
2372 ; AVX512: # %bb.0: # %entry
2373 ; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2375 entry:
2376 %0 = icmp ugt <32 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2377 %1 = select <32 x i1> %0, <32 x i8> %x, <32 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2378 %2 = add <32 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70>
2379 ret <32 x i8> %2
2380 }
2382 define <16 x i16> @test25(<16 x i16> %x) {
2383 ; SSE-LABEL: test25:
2384 ; SSE: # %bb.0: # %entry
2385 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000]
2386 ; SSE-NEXT: psubusw %xmm2, %xmm0
2387 ; SSE-NEXT: psubusw %xmm2, %xmm1
2390 ; AVX1-LABEL: test25:
2391 ; AVX1: # %bb.0: # %entry
2392 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2393 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000]
2394 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
2395 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
2396 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2399 ; AVX2-LABEL: test25:
2400 ; AVX2: # %bb.0: # %entry
2401 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2404 ; AVX512-LABEL: test25:
2405 ; AVX512: # %bb.0: # %entry
2406 ; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2408 entry:
2409 %0 = icmp ugt <16 x i16> %x, <i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000>
2410 %1 = select <16 x i1> %0, <16 x i16> %x, <16 x i16> <i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000>
2411 %2 = add <16 x i16> %1, <i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000>
2412 ret <16 x i16> %2
2413 }
2415 define <16 x i16> @test26(<16 x i16> %x) {
2416 ; SSE-LABEL: test26:
2417 ; SSE: # %bb.0: # %entry
2418 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2419 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2422 ; AVX1-LABEL: test26:
2423 ; AVX1: # %bb.0: # %entry
2424 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2425 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2426 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2427 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2430 ; AVX2-LABEL: test26:
2431 ; AVX2: # %bb.0: # %entry
2432 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2435 ; AVX512-LABEL: test26:
2436 ; AVX512: # %bb.0: # %entry
2437 ; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2439 entry:
2440 %0 = icmp ugt <16 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70>
2441 %1 = select <16 x i1> %0, <16 x i16> %x, <16 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70>
2442 %2 = add <16 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9800, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -34, i16 -55, i16 -70>
2443 ret <16 x i16> %2
2444 }
2446 define <64 x i8> @test27(<64 x i8> %x) {
2447 ; SSE-LABEL: test27:
2448 ; SSE: # %bb.0: # %entry
2449 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
2450 ; SSE-NEXT: psubusb %xmm4, %xmm0
2451 ; SSE-NEXT: psubusb %xmm4, %xmm1
2452 ; SSE-NEXT: psubusb %xmm4, %xmm2
2453 ; SSE-NEXT: psubusb %xmm4, %xmm3
2456 ; AVX1-LABEL: test27:
2457 ; AVX1: # %bb.0: # %entry
2458 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2459 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
2460 ; AVX1-NEXT: vpsubusb %xmm3, %xmm2, %xmm2
2461 ; AVX1-NEXT: vpsubusb %xmm3, %xmm0, %xmm0
2462 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2463 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2464 ; AVX1-NEXT: vpsubusb %xmm3, %xmm2, %xmm2
2465 ; AVX1-NEXT: vpsubusb %xmm3, %xmm1, %xmm1
2466 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2469 ; AVX2-LABEL: test27:
2470 ; AVX2: # %bb.0: # %entry
2471 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
2472 ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
2473 ; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1
2476 ; AVX512-LABEL: test27:
2477 ; AVX512: # %bb.0: # %entry
2478 ; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2480 entry:
2481 %0 = icmp ugt <64 x i8> %x, <i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154>
2482 %1 = select <64 x i1> %0, <64 x i8> %x, <64 x i8> <i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154>
2483 %2 = add <64 x i8> %1, <i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154>
2484 ret <64 x i8> %2
2485 }
2487 define <64 x i8> @test28(<64 x i8> %x) {
2488 ; SSE-LABEL: test28:
2489 ; SSE: # %bb.0: # %entry
2490 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1,234,206,142,179,186,123,98,63,19,234,100,25,34,55,70]
2491 ; SSE-NEXT: psubusb %xmm4, %xmm0
2492 ; SSE-NEXT: psubusb %xmm4, %xmm2
2493 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2494 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2497 ; AVX1-LABEL: test28:
2498 ; AVX1: # %bb.0: # %entry
2499 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,234,206,142,179,186,123,98,63,19,234,100,25,34,55,70]
2500 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm3
2501 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2502 ; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2503 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
2504 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm2
2505 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2506 ; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2507 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2510 ; AVX2-LABEL: test28:
2511 ; AVX2: # %bb.0: # %entry
2512 ; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2513 ; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2516 ; AVX512-LABEL: test28:
2517 ; AVX512: # %bb.0: # %entry
2518 ; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2520 entry:
2521 %0 = icmp ugt <64 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70, i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -116, i8 -77, i8 -70, i8 123, i8 98, i8 67, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2522 %1 = select <64 x i1> %0, <64 x i8> %x, <64 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70, i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -116, i8 -77, i8 -70, i8 123, i8 98, i8 67, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2523 %2 = add <64 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70, i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 116, i8 77, i8 70, i8 -123, i8 -98, i8 -67, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70>
2524 ret <64 x i8> %2
2525 }
2527 define <32 x i16> @test29(<32 x i16> %x) {
2528 ; SSE-LABEL: test29:
2529 ; SSE: # %bb.0: # %entry
2530 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2531 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2532 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2533 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2536 ; AVX1-LABEL: test29:
2537 ; AVX1: # %bb.0: # %entry
2538 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
2539 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2540 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2541 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2542 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
2543 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2544 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2545 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2548 ; AVX2-LABEL: test29:
2549 ; AVX2: # %bb.0: # %entry
2550 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2551 ; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2554 ; AVX512-LABEL: test29:
2555 ; AVX512: # %bb.0: # %entry
2556 ; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2558 entry:
2559 %0 = icmp ugt <32 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70, i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9805, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 346, i16 55, i16 70>
2560 %1 = select <32 x i1> %0, <32 x i16> %x, <32 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70, i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9805, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 346, i16 55, i16 70>
2561 %2 = add <32 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9800, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -34, i16 -55, i16 -70, i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9805, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -346, i16 -55, i16 -70>
2562 ret <32 x i16> %2
2563 }
2566 define i64 @test30(<8 x i16> %x) {
2567 ; SSE-LABEL: test30:
2568 ; SSE: # %bb.0: # %entry
2569 ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2570 ; SSE-NEXT: movq %xmm0, %rax
2573 ; AVX-LABEL: test30:
2574 ; AVX: # %bb.0: # %entry
2575 ; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2576 ; AVX-NEXT: vmovq %xmm0, %rax
2578 entry:
2579 %0 = icmp ugt <8 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 undef, i16 undef, i16 undef, i16 undef>
2580 %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 undef, i16 undef, i16 undef, i16 undef>
2581 %2 = add <8 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 undef, i16 undef, i16 undef, i16 undef>
2582 %3 = bitcast <8 x i16> %2 to <2 x i64>
2583 %4 = extractelement <2 x i64> %3, i32 0
2584 ret i64 %4
2585 }
2588 define i64 @test31(<2 x i64> %x) {
2589 ; SSE-LABEL: test31:
2591 ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2592 ; SSE-NEXT: movq %xmm0, %rax
2595 ; AVX-LABEL: test31:
2597 ; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2598 ; AVX-NEXT: vmovq %xmm0, %rax
2600 %t0 = bitcast <2 x i64> %x to <16 x i8>
2601 %cmp = icmp ugt <16 x i8> %t0, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
2602 %bop = add <16 x i8> %t0, <i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
2603 %sel = select <16 x i1> %cmp, <16 x i8> %bop, <16 x i8> zeroinitializer
2604 %bc = bitcast <16 x i8> %sel to <2 x i64>
2605 %ext = extractelement <2 x i64> %bc, i32 0
2606 ret i64 %ext
2607 }
2609 ; v8i16/v8i32 - sub(x,trunc(umin(zext(x),y)))
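; A commented sketch only (kept out of the checked IR): it is written with the llvm.umin
; intrinsic for brevity, while test32 below uses the equivalent icmp/select spelling; the
; value names are illustrative and @llvm.umin.v8i32 is not declared in this file. Since
; umin(zext(x), y) can never exceed zext(x), the truncated subtrahend is <= x, the sub
; cannot wrap, and the whole pattern acts as an unsigned saturating subtract:
;   %wide = zext <8 x i16> %a0 to <8 x i32>
;   %m = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %wide, <8 x i32> %a1)
;   %t = trunc <8 x i32> %m to <8 x i16>
;   %r = sub <8 x i16> %a0, %t
; which is expected to select to psubusw/vpsubusw, as the checks in test32 show.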
2610 define <8 x i16> @test32(<8 x i16> %a0, <8 x i32> %a1) {
2611 ; SSE2-LABEL: test32:
2613 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
2614 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2615 ; SSE2-NEXT: pxor %xmm3, %xmm4
2616 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
2617 ; SSE2-NEXT: movdqa %xmm5, %xmm6
2618 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
2619 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
2620 ; SSE2-NEXT: pand %xmm6, %xmm2
2621 ; SSE2-NEXT: pxor %xmm4, %xmm6
2622 ; SSE2-NEXT: por %xmm2, %xmm6
2623 ; SSE2-NEXT: pslld $16, %xmm6
2624 ; SSE2-NEXT: psrad $16, %xmm6
2625 ; SSE2-NEXT: pxor %xmm1, %xmm3
2626 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
2627 ; SSE2-NEXT: pxor %xmm5, %xmm4
2628 ; SSE2-NEXT: pand %xmm1, %xmm5
2629 ; SSE2-NEXT: por %xmm4, %xmm5
2630 ; SSE2-NEXT: pslld $16, %xmm5
2631 ; SSE2-NEXT: psrad $16, %xmm5
2632 ; SSE2-NEXT: packssdw %xmm6, %xmm5
2633 ; SSE2-NEXT: psubusw %xmm5, %xmm0
2636 ; SSSE3-LABEL: test32:
2638 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
2639 ; SSSE3-NEXT: movdqa %xmm2, %xmm4
2640 ; SSSE3-NEXT: pxor %xmm3, %xmm4
2641 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
2642 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
2643 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
2644 ; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
2645 ; SSSE3-NEXT: pand %xmm6, %xmm2
2646 ; SSSE3-NEXT: pxor %xmm4, %xmm6
2647 ; SSSE3-NEXT: por %xmm2, %xmm6
2648 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2649 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
2650 ; SSSE3-NEXT: pxor %xmm1, %xmm3
2651 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
2652 ; SSSE3-NEXT: pxor %xmm5, %xmm4
2653 ; SSSE3-NEXT: pand %xmm1, %xmm5
2654 ; SSSE3-NEXT: por %xmm4, %xmm5
2655 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
2656 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
2657 ; SSSE3-NEXT: psubusw %xmm5, %xmm0
2660 ; SSE41-LABEL: test32:
2662 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
2663 ; SSE41-NEXT: pminud %xmm3, %xmm2
2664 ; SSE41-NEXT: pminud %xmm3, %xmm1
2665 ; SSE41-NEXT: packusdw %xmm2, %xmm1
2666 ; SSE41-NEXT: psubusw %xmm1, %xmm0
2669 ; AVX1-LABEL: test32:
2671 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2672 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
2673 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
2674 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
2675 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2676 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2677 ; AVX1-NEXT: vzeroupper
2680 ; AVX2-LABEL: test32:
2682 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
2683 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
2684 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2685 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2686 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2687 ; AVX2-NEXT: vzeroupper
2690 ; AVX512-LABEL: test32:
2692 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
2693 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2694 ; AVX512-NEXT: vzeroupper
2696 %zext = zext <8 x i16> %a0 to <8 x i32>
2697 %icmp = icmp ult <8 x i32> %zext, %a1
2698 %umin = select <8 x i1> %icmp, <8 x i32> %zext, <8 x i32> %a1
2699 %trunc = trunc <8 x i32> %umin to <8 x i16>
2700 %sub = sub <8 x i16> %a0, %trunc
2701 ret <8 x i16> %sub
2702 }
2704 ; v8i32/v8i64 - sub(x,trunc(umin(y,zext(x))))
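; The same idea one element width up, again only as a commented sketch (illustrative value
; names; @llvm.umin.v8i64 is not declared in this file), while test33 below uses the
; icmp/select spelling. The truncated umin never exceeds %a0, so the sub cannot wrap;
; with no 32-bit psubus instruction the lowering in the checks below uses max(x, t) - t,
; i.e. the pmaxud/psubd (vpmaxud/vpsubd) sequences:
;   %wide = zext <8 x i32> %a0 to <8 x i64>
;   %m = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %a1, <8 x i64> %wide)
;   %t = trunc <8 x i64> %m to <8 x i32>
;   %r = sub <8 x i32> %a0, %t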
2705 define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) {
2706 ; SSE2OR3-LABEL: test33:
2708 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
2709 ; SSE2OR3-NEXT: movdqa %xmm3, %xmm8
2710 ; SSE2OR3-NEXT: pxor %xmm6, %xmm8
2711 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
2712 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647]
2713 ; SSE2OR3-NEXT: movdqa %xmm7, %xmm10
2714 ; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10
2715 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
2716 ; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm9
2717 ; SSE2OR3-NEXT: pand %xmm10, %xmm9
2718 ; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8
2719 ; SSE2OR3-NEXT: pand %xmm9, %xmm3
2720 ; SSE2OR3-NEXT: pxor %xmm8, %xmm9
2721 ; SSE2OR3-NEXT: por %xmm3, %xmm9
2722 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm3
2723 ; SSE2OR3-NEXT: pxor %xmm6, %xmm3
2724 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
2725 ; SSE2OR3-NEXT: movdqa %xmm7, %xmm11
2726 ; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm11
2727 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2728 ; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3
2729 ; SSE2OR3-NEXT: pand %xmm11, %xmm3
2730 ; SSE2OR3-NEXT: pand %xmm3, %xmm2
2731 ; SSE2OR3-NEXT: pxor %xmm8, %xmm3
2732 ; SSE2OR3-NEXT: por %xmm2, %xmm3
2733 ; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm9[0,2]
2734 ; SSE2OR3-NEXT: movdqa %xmm0, %xmm2
2735 ; SSE2OR3-NEXT: psubd %xmm3, %xmm2
2736 ; SSE2OR3-NEXT: pxor %xmm6, %xmm3
2737 ; SSE2OR3-NEXT: pxor %xmm6, %xmm0
2738 ; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm0
2739 ; SSE2OR3-NEXT: pand %xmm2, %xmm0
2740 ; SSE2OR3-NEXT: movdqa %xmm5, %xmm2
2741 ; SSE2OR3-NEXT: pxor %xmm6, %xmm2
2742 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
2743 ; SSE2OR3-NEXT: movdqa %xmm7, %xmm9
2744 ; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm9
2745 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2746 ; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm2
2747 ; SSE2OR3-NEXT: pand %xmm9, %xmm2
2748 ; SSE2OR3-NEXT: pand %xmm2, %xmm5
2749 ; SSE2OR3-NEXT: pxor %xmm8, %xmm2
2750 ; SSE2OR3-NEXT: por %xmm5, %xmm2
2751 ; SSE2OR3-NEXT: movdqa %xmm4, %xmm3
2752 ; SSE2OR3-NEXT: pxor %xmm6, %xmm3
2753 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
2754 ; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm7
2755 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2756 ; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3
2757 ; SSE2OR3-NEXT: pand %xmm7, %xmm3
2758 ; SSE2OR3-NEXT: pxor %xmm3, %xmm8
2759 ; SSE2OR3-NEXT: pand %xmm4, %xmm3
2760 ; SSE2OR3-NEXT: por %xmm8, %xmm3
2761 ; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
2762 ; SSE2OR3-NEXT: movdqa %xmm1, %xmm2
2763 ; SSE2OR3-NEXT: psubd %xmm3, %xmm2
2764 ; SSE2OR3-NEXT: pxor %xmm6, %xmm3
2765 ; SSE2OR3-NEXT: pxor %xmm6, %xmm1
2766 ; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm1
2767 ; SSE2OR3-NEXT: pand %xmm2, %xmm1
2768 ; SSE2OR3-NEXT: retq
2770 ; SSE41-LABEL: test33:
2772 ; SSE41-NEXT: movdqa %xmm0, %xmm7
2773 ; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
2774 ; SSE41-NEXT: movdqa %xmm3, %xmm0
2775 ; SSE41-NEXT: pxor %xmm10, %xmm0
2776 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455]
2777 ; SSE41-NEXT: movdqa %xmm8, %xmm9
2778 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
2779 ; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
2780 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647]
2781 ; SSE41-NEXT: movdqa %xmm6, %xmm0
2782 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
2783 ; SSE41-NEXT: pand %xmm9, %xmm0
2784 ; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295]
2785 ; SSE41-NEXT: movapd %xmm9, %xmm11
2786 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11
2787 ; SSE41-NEXT: movdqa %xmm2, %xmm0
2788 ; SSE41-NEXT: pxor %xmm10, %xmm0
2789 ; SSE41-NEXT: movdqa %xmm8, %xmm3
2790 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
2791 ; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
2792 ; SSE41-NEXT: movdqa %xmm6, %xmm0
2793 ; SSE41-NEXT: pcmpgtd %xmm12, %xmm0
2794 ; SSE41-NEXT: pand %xmm3, %xmm0
2795 ; SSE41-NEXT: movapd %xmm9, %xmm3
2796 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
2797 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2]
2798 ; SSE41-NEXT: pmaxud %xmm3, %xmm7
2799 ; SSE41-NEXT: psubd %xmm3, %xmm7
2800 ; SSE41-NEXT: movdqa %xmm5, %xmm0
2801 ; SSE41-NEXT: pxor %xmm10, %xmm0
2802 ; SSE41-NEXT: movdqa %xmm8, %xmm2
2803 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
2804 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
2805 ; SSE41-NEXT: movdqa %xmm6, %xmm0
2806 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
2807 ; SSE41-NEXT: pand %xmm2, %xmm0
2808 ; SSE41-NEXT: movapd %xmm9, %xmm2
2809 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
2810 ; SSE41-NEXT: pxor %xmm4, %xmm10
2811 ; SSE41-NEXT: pcmpeqd %xmm10, %xmm8
2812 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
2813 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
2814 ; SSE41-NEXT: pand %xmm8, %xmm6
2815 ; SSE41-NEXT: movdqa %xmm6, %xmm0
2816 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9
2817 ; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2]
2818 ; SSE41-NEXT: pmaxud %xmm9, %xmm1
2819 ; SSE41-NEXT: psubd %xmm9, %xmm1
2820 ; SSE41-NEXT: movdqa %xmm7, %xmm0
;
; AVX1-LABEL: test33:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295]
; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: test33:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm2, %ymm4
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3
; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6]
; AVX2-SLOW-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: test33:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4
; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1
; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm2, %ymm3
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2
; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm2
; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX2-FAST-ALL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: test33:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm2, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm3
; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6]
; AVX2-FAST-PERLANE-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: test33:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovusqd %zmm1, %ymm1
; AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%zext = zext <8 x i32> %a0 to <8 x i64>
%icmp = icmp ult <8 x i64> %a1, %zext
%umin = select <8 x i1> %icmp, <8 x i64> %a1, <8 x i64> %zext
%trunc = trunc <8 x i64> %umin to <8 x i32>
%sub = sub <8 x i32> %a0, %trunc
ret <8 x i32> %sub
}

; v8i32/v8i64 - sub(x,trunc(umin(zext(and(x,1)),y)))
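; (Since x is masked with 1 first, umin(zext(and(x,1)),y) can never exceed
; and(x,1), so the sub below cannot wrap below zero.)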
define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
; SSE2OR3-LABEL: test34:
; SSE2OR3: # %bb.0:
; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1]
; SSE2OR3-NEXT: pand %xmm6, %xmm1
; SSE2OR3-NEXT: pand %xmm6, %xmm0
; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
; SSE2OR3-NEXT: movdqa %xmm3, %xmm8
; SSE2OR3-NEXT: pxor %xmm6, %xmm8
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647]
; SSE2OR3-NEXT: movdqa %xmm7, %xmm10
; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm9
; SSE2OR3-NEXT: pand %xmm10, %xmm9
; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8
; SSE2OR3-NEXT: pand %xmm9, %xmm3
; SSE2OR3-NEXT: pxor %xmm8, %xmm9
; SSE2OR3-NEXT: por %xmm3, %xmm9
; SSE2OR3-NEXT: movdqa %xmm2, %xmm3
; SSE2OR3-NEXT: pxor %xmm6, %xmm3
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
; SSE2OR3-NEXT: movdqa %xmm7, %xmm11
; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm11
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3
; SSE2OR3-NEXT: pand %xmm11, %xmm3
; SSE2OR3-NEXT: pand %xmm3, %xmm2
; SSE2OR3-NEXT: pxor %xmm8, %xmm3
; SSE2OR3-NEXT: por %xmm2, %xmm3
; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm9[0,2]
; SSE2OR3-NEXT: movdqa %xmm0, %xmm2
; SSE2OR3-NEXT: psubd %xmm3, %xmm2
; SSE2OR3-NEXT: pxor %xmm6, %xmm3
; SSE2OR3-NEXT: por %xmm6, %xmm0
; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm0
; SSE2OR3-NEXT: pand %xmm2, %xmm0
; SSE2OR3-NEXT: movdqa %xmm5, %xmm2
; SSE2OR3-NEXT: pxor %xmm6, %xmm2
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE2OR3-NEXT: movdqa %xmm7, %xmm9
; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm9
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm2
; SSE2OR3-NEXT: pand %xmm9, %xmm2
; SSE2OR3-NEXT: pand %xmm2, %xmm5
; SSE2OR3-NEXT: pxor %xmm8, %xmm2
; SSE2OR3-NEXT: por %xmm5, %xmm2
; SSE2OR3-NEXT: movdqa %xmm4, %xmm3
; SSE2OR3-NEXT: pxor %xmm6, %xmm3
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm7
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3
; SSE2OR3-NEXT: pand %xmm7, %xmm3
; SSE2OR3-NEXT: pxor %xmm3, %xmm8
; SSE2OR3-NEXT: pand %xmm4, %xmm3
; SSE2OR3-NEXT: por %xmm8, %xmm3
; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
; SSE2OR3-NEXT: movdqa %xmm1, %xmm2
; SSE2OR3-NEXT: psubd %xmm3, %xmm2
; SSE2OR3-NEXT: pxor %xmm6, %xmm3
; SSE2OR3-NEXT: por %xmm6, %xmm1
; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm1
; SSE2OR3-NEXT: pand %xmm2, %xmm1
; SSE2OR3-NEXT: retq
;
; SSE41-LABEL: test34:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455]
; SSE41-NEXT: movdqa %xmm8, %xmm9
; SSE41-NEXT: pcmpeqd %xmm0, %xmm9
; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2]
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647]
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pcmpgtd %xmm11, %xmm0
; SSE41-NEXT: pand %xmm9, %xmm0
; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295]
; SSE41-NEXT: movapd %xmm9, %xmm11
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm8, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pcmpgtd %xmm12, %xmm0
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: movapd %xmm9, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2]
; SSE41-NEXT: pmaxud %xmm3, %xmm6
; SSE41-NEXT: psubd %xmm3, %xmm6
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm8, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pcmpgtd %xmm3, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm9, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT: pxor %xmm4, %xmm10
; SSE41-NEXT: pcmpeqd %xmm10, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pand %xmm8, %xmm7
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9
; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2]
; SSE41-NEXT: pmaxud %xmm9, %xmm1
; SSE41-NEXT: psubd %xmm9, %xmm1
; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test34:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm4 = mem[0,0]
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103]
; AVX1-NEXT: # xmm6 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295]
; AVX1-NEXT: # xmm7 = mem[0,0]
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: test34:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
; AVX2-SLOW-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm2, %ymm4
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3
; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6]
; AVX2-SLOW-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: test34:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
; AVX2-FAST-ALL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4
; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1
; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm2, %ymm3
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2
; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm2
; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX2-FAST-ALL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: test34:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
; AVX2-FAST-PERLANE-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm2, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm3
; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6]
; AVX2-FAST-PERLANE-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: test34:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; AVX512-NEXT: vpmovusqd %zmm1, %ymm1
; AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%mask = and <8 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%zext = zext <8 x i32> %mask to <8 x i64>
%icmp = icmp ult <8 x i64> %zext, %a1
%umin = select <8 x i1> %icmp, <8 x i64> %zext, <8 x i64> %a1
%trunc = trunc <8 x i64> %umin to <8 x i32>
%sub = sub <8 x i32> %mask, %trunc