1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
; test1 - <8 x i16> select pattern written with a sign test and an xor.
; Lanes of %x that compare signed-less-than zero yield %x xor -32768; all
; other lanes yield zero. For a lane with the top bit set, flipping that bit
; is the same as subtracting 32768, so this is an unsigned saturating
; subtract of 32768. The expected output is a single psubusw (vpsubusw on the
; AVX targets) with a RIP-relative memory operand.
10 define <8 x i16> @test1(<8 x i16> %x) nounwind {
12 ; SSE: # %bb.0: # %vector.ph
13 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
17 ; AVX: # %bb.0: # %vector.ph
18 ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; Mask: true for lanes whose i16 value is negative when read as signed.
21 %0 = icmp slt <8 x i16> %x, zeroinitializer
; Flip the top bit of every lane (splat of -32768).
22 %1 = xor <8 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
; Keep the xor result where the mask is true, zero elsewhere.
23 %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
; test2 - <8 x i16> select pattern written with an unsigned compare and an add.
; Lanes of %x that are unsigned-greater-than 32766 yield %x + (-32767); all
; other lanes yield zero, i.e. an unsigned saturating subtract of 32767. The
; expected output is a single psubusw (vpsubusw on the AVX targets) with a
; RIP-relative memory operand.
27 define <8 x i16> @test2(<8 x i16> %x) nounwind {
29 ; SSE: # %bb.0: # %vector.ph
30 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
34 ; AVX: # %bb.0: # %vector.ph
35 ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; Mask: true for lanes with %x >= 32767 (unsigned).
38 %0 = icmp ugt <8 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
; Adding -32767 is a wrapping subtract of 32767.
39 %1 = add <8 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
; Keep the difference where the mask is true, zero elsewhere.
40 %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
; test3 - <8 x i16> saturating subtract of a scalar that is splatted at run time.
; The i16 argument %w is broadcast to all eight lanes; lanes where %x is
; unsigned-less-than the splat yield zero, the rest yield %x - splat.
; Expected output per target, as listed below:
;   SSE targets  - movd + pshuflw + pshufd to build the splat, then psubusw.
;   AVX1 targets - the same sequence in VEX form, then vpsubusw.
;   AVX2 targets - vmovd + vpbroadcastw from an xmm register, then vpsubusw.
;   AVX512 run   - vpbroadcastw directly from %edi, then vpsubusw.
44 define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
46 ; SSE: # %bb.0: # %vector.ph
47 ; SSE-NEXT: movd %edi, %xmm1
48 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
49 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
50 ; SSE-NEXT: psubusw %xmm1, %xmm0
54 ; AVX1: # %bb.0: # %vector.ph
55 ; AVX1-NEXT: vmovd %edi, %xmm1
56 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
57 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
58 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
62 ; AVX2: # %bb.0: # %vector.ph
63 ; AVX2-NEXT: vmovd %edi, %xmm1
64 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
65 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
68 ; AVX512-LABEL: test3:
69 ; AVX512: # %bb.0: # %vector.ph
70 ; AVX512-NEXT: vpbroadcastw %edi, %xmm1
71 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; Splat %w: insert into lane 0, then shuffle lane 0 into every lane.
74 %0 = insertelement <8 x i16> undef, i16 %w, i32 0
75 %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
; Mask: true where the subtraction would go below zero.
76 %1 = icmp ult <8 x i16> %x, %broadcast15
77 %2 = sub <8 x i16> %x, %broadcast15
; Zero where the mask is true, otherwise the wrapping difference.
78 %res = select <8 x i1> %1, <8 x i16> zeroinitializer, <8 x i16> %2
; test4 - <16 x i8> counterpart of the sign-test/xor pattern.
; Lanes of %x that compare signed-less-than zero yield %x xor -128; all other
; lanes yield zero, i.e. an unsigned saturating subtract of 128. The expected
; output is a single psubusb (vpsubusb on the AVX targets) with a RIP-relative
; memory operand.
82 define <16 x i8> @test4(<16 x i8> %x) nounwind {
84 ; SSE: # %bb.0: # %vector.ph
85 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
89 ; AVX: # %bb.0: # %vector.ph
90 ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; Mask: true for lanes whose i8 value is negative when read as signed.
93 %0 = icmp slt <16 x i8> %x, zeroinitializer
; Flip the top bit of every lane (splat of -128).
94 %1 = xor <16 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
; Keep the xor result where the mask is true, zero elsewhere.
95 %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
; test5 - <16 x i8> counterpart of the unsigned-compare/add pattern.
; Lanes of %x that are unsigned-greater-than 126 yield %x + (-127); all other
; lanes yield zero, i.e. an unsigned saturating subtract of 127. The expected
; output is a single psubusb (vpsubusb on the AVX targets) with a RIP-relative
; memory operand.
99 define <16 x i8> @test5(<16 x i8> %x) nounwind {
101 ; SSE: # %bb.0: # %vector.ph
102 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
106 ; AVX: # %bb.0: # %vector.ph
107 ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; Mask: true for lanes with %x >= 127 (unsigned).
110 %0 = icmp ugt <16 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
; Adding -127 is a wrapping subtract of 127.
111 %1 = add <16 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
; Keep the difference where the mask is true, zero elsewhere.
112 %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
; test6 - <16 x i8> saturating subtract of a scalar that is splatted at run time.
; The i8 argument %w is broadcast to all sixteen lanes; lanes where %x is
; unsigned-less-than the splat yield zero, the rest yield %x - splat.
; Every target ends with one psubusb/vpsubusb; the targets differ only in how
; the byte splat is built:
;   SSE2 run     - movd + punpcklbw + pshuflw + pshufd.
;   SSSE3, SSE41 - movd, then pshufb with an all-zero shuffle mask (pxor).
;   AVX1 targets - vmovd, then vpshufb with an all-zero shuffle mask (vpxor).
;   AVX2 targets - vmovd + vpbroadcastb from an xmm register.
;   AVX512 run   - vpbroadcastb directly from %edi.
116 define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
118 ; SSE2: # %bb.0: # %vector.ph
119 ; SSE2-NEXT: movd %edi, %xmm1
120 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
121 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
122 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
123 ; SSE2-NEXT: psubusb %xmm1, %xmm0
126 ; SSSE3-LABEL: test6:
127 ; SSSE3: # %bb.0: # %vector.ph
128 ; SSSE3-NEXT: movd %edi, %xmm1
129 ; SSSE3-NEXT: pxor %xmm2, %xmm2
130 ; SSSE3-NEXT: pshufb %xmm2, %xmm1
131 ; SSSE3-NEXT: psubusb %xmm1, %xmm0
134 ; SSE41-LABEL: test6:
135 ; SSE41: # %bb.0: # %vector.ph
136 ; SSE41-NEXT: movd %edi, %xmm1
137 ; SSE41-NEXT: pxor %xmm2, %xmm2
138 ; SSE41-NEXT: pshufb %xmm2, %xmm1
139 ; SSE41-NEXT: psubusb %xmm1, %xmm0
143 ; AVX1: # %bb.0: # %vector.ph
144 ; AVX1-NEXT: vmovd %edi, %xmm1
145 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
146 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
147 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
151 ; AVX2: # %bb.0: # %vector.ph
152 ; AVX2-NEXT: vmovd %edi, %xmm1
153 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
154 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
157 ; AVX512-LABEL: test6:
158 ; AVX512: # %bb.0: # %vector.ph
159 ; AVX512-NEXT: vpbroadcastb %edi, %xmm1
160 ; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; Splat %w: insert into lane 0, then shuffle lane 0 into every lane.
163 %0 = insertelement <16 x i8> undef, i8 %w, i32 0
164 %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
; Mask: true where the subtraction would go below zero.
165 %1 = icmp ult <16 x i8> %x, %broadcast15
166 %2 = sub <16 x i8> %x, %broadcast15
; Zero where the mask is true, otherwise the wrapping difference.
167 %res = select <16 x i1> %1, <16 x i8> zeroinitializer, <16 x i8> %2
; test7 - 256-bit (<16 x i16>) version of the sign-test/xor pattern.
; Lanes of %x that compare signed-less-than zero yield %x xor -32768; all
; other lanes yield zero, i.e. an unsigned saturating subtract of 32768.
; Expected output per target, as listed below:
;   SSE targets  - one 128-bit constant of 32768s loaded into xmm2 and used by
;                  two psubusw, one per input register (xmm0, xmm1).
;   AVX1 targets - vextractf128 of the upper half, two 128-bit vpsubusw
;                  against the same constant, then vinsertf128.
;   AVX2 targets and the AVX512 run - one 256-bit vpsubusw with a RIP-relative
;                  memory operand.
171 define <16 x i16> @test7(<16 x i16> %x) nounwind {
173 ; SSE: # %bb.0: # %vector.ph
174 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
175 ; SSE-NEXT: psubusw %xmm2, %xmm0
176 ; SSE-NEXT: psubusw %xmm2, %xmm1
180 ; AVX1: # %bb.0: # %vector.ph
181 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
182 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
183 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
184 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
185 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
189 ; AVX2: # %bb.0: # %vector.ph
190 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
193 ; AVX512-LABEL: test7:
194 ; AVX512: # %bb.0: # %vector.ph
195 ; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; Mask: true for lanes whose i16 value is negative when read as signed.
198 %0 = icmp slt <16 x i16> %x, zeroinitializer
; Flip the top bit of every lane (splat of -32768).
199 %1 = xor <16 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
; Keep the xor result where the mask is true, zero elsewhere.
200 %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
; test8 - 256-bit (<16 x i16>) version of the unsigned-compare/add pattern.
; Lanes of %x that are unsigned-greater-than 32766 yield %x + (-32767); all
; other lanes yield zero, i.e. an unsigned saturating subtract of 32767.
; Expected output per target, as listed below:
;   SSE targets  - one 128-bit constant of 32767s loaded into xmm2 and used by
;                  two psubusw, one per input register (xmm0, xmm1).
;   AVX1 targets - vextractf128 of the upper half, two 128-bit vpsubusw
;                  against the same constant, then vinsertf128.
;   AVX2 targets and the AVX512 run - one 256-bit vpsubusw with a RIP-relative
;                  memory operand.
204 define <16 x i16> @test8(<16 x i16> %x) nounwind {
206 ; SSE: # %bb.0: # %vector.ph
207 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
208 ; SSE-NEXT: psubusw %xmm2, %xmm0
209 ; SSE-NEXT: psubusw %xmm2, %xmm1
213 ; AVX1: # %bb.0: # %vector.ph
214 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
215 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
216 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
217 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
218 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
222 ; AVX2: # %bb.0: # %vector.ph
223 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
226 ; AVX512-LABEL: test8:
227 ; AVX512: # %bb.0: # %vector.ph
228 ; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; Mask: true for lanes with %x >= 32767 (unsigned).
231 %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
; Adding -32767 is a wrapping subtract of 32767.
232 %1 = add <16 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
; Keep the difference where the mask is true, zero elsewhere.
233 %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
; test8a - variant of test8 with a different constant in every lane.
; The compare constants run 32766 down to 32751 and the add constants run
; -32767 up to -32752, so lane i compares against c_i and adds -(c_i + 1):
; an unsigned saturating subtract of a non-splat constant vector.
; Expected output per target, as listed below:
;   SSE targets  - two psubusw, each with its own RIP-relative memory operand.
;   AVX1 targets - two 128-bit vpsubusw with memory operands around a
;                  vextractf128, recombined with vinsertf128.
;   AVX2 targets and the AVX512 run - one 256-bit vpsubusw with a memory operand.
237 define <16 x i16> @test8a(<16 x i16> %x) nounwind {
239 ; SSE: # %bb.0: # %vector.ph
240 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
241 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm1
244 ; AVX1-LABEL: test8a:
245 ; AVX1: # %bb.0: # %vector.ph
246 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm1
247 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
248 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
249 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
252 ; AVX2-LABEL: test8a:
253 ; AVX2: # %bb.0: # %vector.ph
254 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
257 ; AVX512-LABEL: test8a:
258 ; AVX512: # %bb.0: # %vector.ph
259 ; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; Per-lane thresholds, decreasing by one from lane 0 to lane 15.
262 %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32765, i16 32764, i16 32763, i16 32762, i16 32761, i16 32760, i16 32759, i16 32758, i16 32757, i16 32756, i16 32755, i16 32754, i16 32753, i16 32752, i16 32751>
; Per-lane addends; each is the negation of (threshold + 1) for that lane.
263 %1 = add <16 x i16> %x, <i16 -32767, i16 -32766, i16 -32765, i16 -32764, i16 -32763, i16 -32762, i16 -32761, i16 -32760, i16 -32759, i16 -32758, i16 -32757, i16 -32756, i16 -32755, i16 -32754, i16 -32753, i16 -32752>
; Keep the difference where the mask is true, zero elsewhere.
264 %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
; test9 - 256-bit (<16 x i16>) saturating subtract of a run-time splat.
; The i16 argument %w is broadcast to all sixteen lanes; lanes where %x is
; unsigned-less-than the splat yield zero, the rest yield %x - splat.
; Expected output per target, as listed below:
;   SSE targets  - movd + pshuflw + pshufd splat in xmm2, then two psubusw.
;   AVX1 targets - 128-bit splat in xmm2, vextractf128, two vpsubusw,
;                  vinsertf128.
;   AVX2 targets - vmovd + vpbroadcastw into ymm1, then one 256-bit vpsubusw.
;   AVX512 run   - vpbroadcastw from %edi into ymm1, then one 256-bit vpsubusw.
268 define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
270 ; SSE: # %bb.0: # %vector.ph
271 ; SSE-NEXT: movd %edi, %xmm2
272 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
273 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
274 ; SSE-NEXT: psubusw %xmm2, %xmm0
275 ; SSE-NEXT: psubusw %xmm2, %xmm1
279 ; AVX1: # %bb.0: # %vector.ph
280 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
281 ; AVX1-NEXT: vmovd %edi, %xmm2
282 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
283 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
284 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
285 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
286 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
290 ; AVX2: # %bb.0: # %vector.ph
291 ; AVX2-NEXT: vmovd %edi, %xmm1
292 ; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
293 ; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
296 ; AVX512-LABEL: test9:
297 ; AVX512: # %bb.0: # %vector.ph
298 ; AVX512-NEXT: vpbroadcastw %edi, %ymm1
299 ; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
; Splat %w: insert into lane 0, then shuffle lane 0 into every lane.
302 %0 = insertelement <16 x i16> undef, i16 %w, i32 0
303 %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
; Mask: true where the subtraction would go below zero.
304 %1 = icmp ult <16 x i16> %x, %broadcast15
305 %2 = sub <16 x i16> %x, %broadcast15
; Zero where the mask is true, otherwise the wrapping difference.
306 %res = select <16 x i1> %1, <16 x i16> zeroinitializer, <16 x i16> %2
; test10 - 256-bit (<32 x i8>) version of the sign-test/xor pattern.
; Lanes of %x that compare signed-less-than zero yield %x xor -128; all other
; lanes yield zero, i.e. an unsigned saturating subtract of 128.
; Expected output per target, as listed below:
;   SSE targets  - one 128-bit constant of 128s loaded into xmm2 and used by
;                  two psubusb, one per input register (xmm0, xmm1).
;   AVX1 targets - vextractf128 of the upper half, two 128-bit vpsubusb
;                  against the same constant, then vinsertf128.
;   AVX2 targets and the AVX512 run - one 256-bit vpsubusb with a RIP-relative
;                  memory operand.
310 define <32 x i8> @test10(<32 x i8> %x) nounwind {
312 ; SSE: # %bb.0: # %vector.ph
313 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
314 ; SSE-NEXT: psubusb %xmm2, %xmm0
315 ; SSE-NEXT: psubusb %xmm2, %xmm1
318 ; AVX1-LABEL: test10:
319 ; AVX1: # %bb.0: # %vector.ph
320 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
321 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
322 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
323 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
324 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
327 ; AVX2-LABEL: test10:
328 ; AVX2: # %bb.0: # %vector.ph
329 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
332 ; AVX512-LABEL: test10:
333 ; AVX512: # %bb.0: # %vector.ph
334 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; Mask: true for lanes whose i8 value is negative when read as signed.
337 %0 = icmp slt <32 x i8> %x, zeroinitializer
; Flip the top bit of every lane (splat of -128).
338 %1 = xor <32 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
; Keep the xor result where the mask is true, zero elsewhere.
339 %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
; test11 - 256-bit (<32 x i8>) version of the unsigned-compare/add pattern.
; Lanes of %x that are unsigned-greater-than 126 yield %x + (-127); all other
; lanes yield zero, i.e. an unsigned saturating subtract of 127.
; Expected output per target, as listed below:
;   SSE targets  - one 128-bit constant of 127s loaded into xmm2 and used by
;                  two psubusb, one per input register (xmm0, xmm1).
;   AVX1 targets - vextractf128 of the upper half, two 128-bit vpsubusb
;                  against the same constant, then vinsertf128.
;   AVX2 targets and the AVX512 run - one 256-bit vpsubusb with a RIP-relative
;                  memory operand.
343 define <32 x i8> @test11(<32 x i8> %x) nounwind {
345 ; SSE: # %bb.0: # %vector.ph
346 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
347 ; SSE-NEXT: psubusb %xmm2, %xmm0
348 ; SSE-NEXT: psubusb %xmm2, %xmm1
351 ; AVX1-LABEL: test11:
352 ; AVX1: # %bb.0: # %vector.ph
353 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
354 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
355 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
356 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
357 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
360 ; AVX2-LABEL: test11:
361 ; AVX2: # %bb.0: # %vector.ph
362 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
365 ; AVX512-LABEL: test11:
366 ; AVX512: # %bb.0: # %vector.ph
367 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; Mask: true for lanes with %x >= 127 (unsigned).
370 %0 = icmp ugt <32 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
; Adding -127 is a wrapping subtract of 127.
371 %1 = add <32 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
; Keep the difference where the mask is true, zero elsewhere.
372 %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
; test11a - variant of test11 with a different constant in every lane.
; The compare constants run 126 down to 95 and the add constants run -127 up
; to -96, so lane i compares against c_i and adds -(c_i + 1): an unsigned
; saturating subtract of a non-splat constant vector.
; Expected output per target, as listed below:
;   SSE targets  - two psubusb, each with its own RIP-relative memory operand.
;   AVX1 targets - two 128-bit vpsubusb with memory operands around a
;                  vextractf128, recombined with vinsertf128.
;   AVX2 targets and the AVX512 run - one 256-bit vpsubusb with a memory operand.
376 define <32 x i8> @test11a(<32 x i8> %x) nounwind {
377 ; SSE-LABEL: test11a:
378 ; SSE: # %bb.0: # %vector.ph
379 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
380 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm1
383 ; AVX1-LABEL: test11a:
384 ; AVX1: # %bb.0: # %vector.ph
385 ; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm1
386 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
387 ; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
388 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
391 ; AVX2-LABEL: test11a:
392 ; AVX2: # %bb.0: # %vector.ph
393 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
396 ; AVX512-LABEL: test11a:
397 ; AVX512: # %bb.0: # %vector.ph
398 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; Per-lane thresholds, decreasing by one from lane 0 to lane 31.
401 %0 = icmp ugt <32 x i8> %x, <i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 117, i8 116, i8 115, i8 114, i8 113, i8 112, i8 111, i8 110, i8 109, i8 108, i8 107, i8 106, i8 105, i8 104, i8 103, i8 102, i8 101, i8 100, i8 99, i8 98, i8 97, i8 96, i8 95>
; Per-lane addends; each is the negation of (threshold + 1) for that lane.
402 %1 = add <32 x i8> %x, <i8 -127, i8 -126, i8 -125, i8 -124, i8 -123, i8 -122, i8 -121, i8 -120, i8 -119, i8 -118, i8 -117, i8 -116, i8 -115, i8 -114, i8 -113, i8 -112, i8 -111, i8 -110, i8 -109, i8 -108, i8 -107, i8 -106, i8 -105, i8 -104, i8 -103, i8 -102, i8 -101, i8 -100, i8 -99, i8 -98, i8 -97, i8 -96>
; Keep the difference where the mask is true, zero elsewhere.
403 %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
; test12 - 256-bit (<32 x i8>) saturating subtract of a run-time splat.
; The i8 argument %w is broadcast to all thirty-two lanes; lanes where %x is
; unsigned-less-than the splat yield zero, the rest yield %x - splat.
; Expected output per target, as listed below:
;   SSE2 run     - movd + punpcklbw + pshuflw + pshufd splat, two psubusb.
;   SSSE3, SSE41 - movd, pshufb with an all-zero mask (pxor), two psubusb.
;   AVX1 targets - vmovd, vpshufb with an all-zero mask, vextractf128, two
;                  128-bit vpsubusb, vinsertf128.
;   AVX2 targets - vmovd + vpbroadcastb into ymm1, one 256-bit vpsubusb.
;   AVX512 run   - vpbroadcastb from %edi into ymm1, one 256-bit vpsubusb.
407 define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
408 ; SSE2-LABEL: test12:
409 ; SSE2: # %bb.0: # %vector.ph
410 ; SSE2-NEXT: movd %edi, %xmm2
411 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
412 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
413 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
414 ; SSE2-NEXT: psubusb %xmm2, %xmm0
415 ; SSE2-NEXT: psubusb %xmm2, %xmm1
418 ; SSSE3-LABEL: test12:
419 ; SSSE3: # %bb.0: # %vector.ph
420 ; SSSE3-NEXT: movd %edi, %xmm2
421 ; SSSE3-NEXT: pxor %xmm3, %xmm3
422 ; SSSE3-NEXT: pshufb %xmm3, %xmm2
423 ; SSSE3-NEXT: psubusb %xmm2, %xmm0
424 ; SSSE3-NEXT: psubusb %xmm2, %xmm1
427 ; SSE41-LABEL: test12:
428 ; SSE41: # %bb.0: # %vector.ph
429 ; SSE41-NEXT: movd %edi, %xmm2
430 ; SSE41-NEXT: pxor %xmm3, %xmm3
431 ; SSE41-NEXT: pshufb %xmm3, %xmm2
432 ; SSE41-NEXT: psubusb %xmm2, %xmm0
433 ; SSE41-NEXT: psubusb %xmm2, %xmm1
436 ; AVX1-LABEL: test12:
437 ; AVX1: # %bb.0: # %vector.ph
438 ; AVX1-NEXT: vmovd %edi, %xmm1
439 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
440 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
441 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
442 ; AVX1-NEXT: vpsubusb %xmm1, %xmm2, %xmm2
443 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
444 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
447 ; AVX2-LABEL: test12:
448 ; AVX2: # %bb.0: # %vector.ph
449 ; AVX2-NEXT: vmovd %edi, %xmm1
450 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
451 ; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
454 ; AVX512-LABEL: test12:
455 ; AVX512: # %bb.0: # %vector.ph
456 ; AVX512-NEXT: vpbroadcastb %edi, %ymm1
457 ; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
; Splat %w: insert into lane 0, then shuffle lane 0 into every lane.
460 %0 = insertelement <32 x i8> undef, i8 %w, i32 0
461 %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
; Mask: true where the subtraction would go below zero.
462 %1 = icmp ult <32 x i8> %x, %broadcast15
463 %2 = sub <32 x i8> %x, %broadcast15
; Zero where the mask is true, otherwise the wrapping difference.
464 %res = select <32 x i1> %1, <32 x i8> zeroinitializer, <32 x i8> %2
; test13 - mixed-width pattern: i16 lanes of %x against i32 lanes of %y.
; %x is zero-extended to <8 x i32>; lanes where the widened value is
; unsigned-less-than %y yield zero, the rest yield the low 16 bits of
; (zext(%x) - %y).
; None of the expected sequences listed below contains psubusw; each target
; widens, compares, subtracts in 32-bit lanes, narrows and masks:
;   SSE2 run     - punpck{l,h}wd widening, sign-bit-biased pcmpgtd compares,
;                  psubd, pslld/psrad + packssdw narrowing, pandn.
;   SSSE3 run    - same compare scheme, pshufb + punpcklqdq narrowing, pandn.
;   SSE41 run    - pmovzxwd widening, pmaxud + pcmpeqd + pxor compares,
;                  pshufb + punpcklqdq narrowing, pandn.
;   AVX1 targets - 128-bit halves with vpmaxud/vpcmpeqd/vpxor, vpshufb +
;                  vpunpcklqdq narrowing, vpandn.
;   AVX2 targets - 256-bit vpmovzxwd, vpmaxud/vpcmpeqd/vpxor, vpshufb + vpermq
;                  narrowing, vpandn.
;   AVX512 run   - vpcmpnltud into %k1, vpsubd, then a zero-masked vpmovdw.
468 define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
469 ; SSE2-LABEL: test13:
470 ; SSE2: # %bb.0: # %vector.ph
471 ; SSE2-NEXT: pxor %xmm4, %xmm4
472 ; SSE2-NEXT: movdqa %xmm0, %xmm3
473 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
474 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
475 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
476 ; SSE2-NEXT: movdqa %xmm0, %xmm5
477 ; SSE2-NEXT: psubd %xmm2, %xmm0
478 ; SSE2-NEXT: movdqa %xmm2, %xmm6
479 ; SSE2-NEXT: pxor %xmm4, %xmm6
480 ; SSE2-NEXT: por %xmm4, %xmm5
481 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
482 ; SSE2-NEXT: movdqa %xmm1, %xmm2
483 ; SSE2-NEXT: pxor %xmm4, %xmm2
484 ; SSE2-NEXT: por %xmm3, %xmm4
485 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
486 ; SSE2-NEXT: packssdw %xmm6, %xmm2
487 ; SSE2-NEXT: psubd %xmm1, %xmm3
488 ; SSE2-NEXT: pslld $16, %xmm0
489 ; SSE2-NEXT: psrad $16, %xmm0
490 ; SSE2-NEXT: pslld $16, %xmm3
491 ; SSE2-NEXT: psrad $16, %xmm3
492 ; SSE2-NEXT: packssdw %xmm0, %xmm3
493 ; SSE2-NEXT: pandn %xmm3, %xmm2
494 ; SSE2-NEXT: movdqa %xmm2, %xmm0
497 ; SSSE3-LABEL: test13:
498 ; SSSE3: # %bb.0: # %vector.ph
499 ; SSSE3-NEXT: pxor %xmm3, %xmm3
500 ; SSSE3-NEXT: movdqa %xmm0, %xmm4
501 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
502 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
503 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
504 ; SSSE3-NEXT: movdqa %xmm0, %xmm5
505 ; SSSE3-NEXT: psubd %xmm2, %xmm0
506 ; SSSE3-NEXT: movdqa %xmm2, %xmm6
507 ; SSSE3-NEXT: pxor %xmm3, %xmm6
508 ; SSSE3-NEXT: por %xmm3, %xmm5
509 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
510 ; SSSE3-NEXT: movdqa %xmm1, %xmm2
511 ; SSSE3-NEXT: pxor %xmm3, %xmm2
512 ; SSSE3-NEXT: por %xmm4, %xmm3
513 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
514 ; SSSE3-NEXT: packssdw %xmm6, %xmm2
515 ; SSSE3-NEXT: psubd %xmm1, %xmm4
516 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
517 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
518 ; SSSE3-NEXT: pshufb %xmm1, %xmm4
519 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
520 ; SSSE3-NEXT: pandn %xmm4, %xmm2
521 ; SSSE3-NEXT: movdqa %xmm2, %xmm0
524 ; SSE41-LABEL: test13:
525 ; SSE41: # %bb.0: # %vector.ph
526 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
527 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
528 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
529 ; SSE41-NEXT: movdqa %xmm4, %xmm0
530 ; SSE41-NEXT: pmaxud %xmm1, %xmm0
531 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
532 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
533 ; SSE41-NEXT: pxor %xmm5, %xmm0
534 ; SSE41-NEXT: movdqa %xmm3, %xmm6
535 ; SSE41-NEXT: pmaxud %xmm2, %xmm6
536 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
537 ; SSE41-NEXT: pxor %xmm5, %xmm6
538 ; SSE41-NEXT: packssdw %xmm6, %xmm0
539 ; SSE41-NEXT: psubd %xmm2, %xmm3
540 ; SSE41-NEXT: psubd %xmm1, %xmm4
541 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
542 ; SSE41-NEXT: pshufb %xmm1, %xmm4
543 ; SSE41-NEXT: pshufb %xmm1, %xmm3
544 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
545 ; SSE41-NEXT: pandn %xmm4, %xmm0
548 ; AVX1-LABEL: test13:
549 ; AVX1: # %bb.0: # %vector.ph
550 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
551 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
552 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
553 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
554 ; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm4
555 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4
556 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
557 ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
558 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm6
559 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm0, %xmm6
560 ; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm5
561 ; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4
562 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
563 ; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm1
564 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
565 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
566 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
567 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
568 ; AVX1-NEXT: vpandn %xmm0, %xmm4, %xmm0
569 ; AVX1-NEXT: vzeroupper
572 ; AVX2-LABEL: test13:
573 ; AVX2: # %bb.0: # %vector.ph
574 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
575 ; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm2
576 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2
577 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
578 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
579 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
580 ; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
581 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
582 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
583 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
584 ; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0
585 ; AVX2-NEXT: vzeroupper
588 ; AVX512-LABEL: test13:
589 ; AVX512: # %bb.0: # %vector.ph
590 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
591 ; AVX512-NEXT: vpcmpnltud %ymm1, %ymm0, %k1
592 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
593 ; AVX512-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
594 ; AVX512-NEXT: vzeroupper
; Widen %x so it can be compared with and subtracted from the i32 operand.
597 %lhs = zext <8 x i16> %x to <8 x i32>
; Mask: true where the 32-bit subtraction would go below zero.
598 %cond = icmp ult <8 x i32> %lhs, %y
599 %sub = sub <8 x i32> %lhs, %y
; Narrow the difference back to the i16 result width.
600 %trunc = trunc <8 x i32> %sub to <8 x i16>
; Zero where the mask is true, otherwise the truncated difference.
601 %res = select <8 x i1> %cond, <8 x i16> zeroinitializer, <8 x i16> %trunc
605 define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
606 ; SSE2-LABEL: test14:
607 ; SSE2: # %bb.0: # %vector.ph
608 ; SSE2-NEXT: movdqa %xmm0, %xmm5
609 ; SSE2-NEXT: pxor %xmm0, %xmm0
610 ; SSE2-NEXT: movdqa %xmm5, %xmm6
611 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
612 ; SSE2-NEXT: movdqa %xmm6, %xmm8
613 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
614 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
615 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
616 ; SSE2-NEXT: movdqa %xmm5, %xmm10
617 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
618 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
619 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
620 ; SSE2-NEXT: movdqa %xmm4, %xmm9
621 ; SSE2-NEXT: pxor %xmm0, %xmm9
622 ; SSE2-NEXT: psubd %xmm5, %xmm4
623 ; SSE2-NEXT: por %xmm0, %xmm5
624 ; SSE2-NEXT: pcmpgtd %xmm9, %xmm5
625 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255]
626 ; SSE2-NEXT: pand %xmm9, %xmm5
627 ; SSE2-NEXT: movdqa %xmm3, %xmm7
628 ; SSE2-NEXT: pxor %xmm0, %xmm7
629 ; SSE2-NEXT: psubd %xmm10, %xmm3
630 ; SSE2-NEXT: por %xmm0, %xmm10
631 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm10
632 ; SSE2-NEXT: pand %xmm9, %xmm10
633 ; SSE2-NEXT: packuswb %xmm5, %xmm10
634 ; SSE2-NEXT: movdqa %xmm2, %xmm5
635 ; SSE2-NEXT: pxor %xmm0, %xmm5
636 ; SSE2-NEXT: psubd %xmm6, %xmm2
637 ; SSE2-NEXT: por %xmm0, %xmm6
638 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
639 ; SSE2-NEXT: pand %xmm9, %xmm6
640 ; SSE2-NEXT: movdqa %xmm1, %xmm5
641 ; SSE2-NEXT: pxor %xmm0, %xmm5
642 ; SSE2-NEXT: por %xmm8, %xmm0
643 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
644 ; SSE2-NEXT: pand %xmm9, %xmm0
645 ; SSE2-NEXT: packuswb %xmm6, %xmm0
646 ; SSE2-NEXT: packuswb %xmm10, %xmm0
647 ; SSE2-NEXT: psubd %xmm8, %xmm1
648 ; SSE2-NEXT: pand %xmm9, %xmm4
649 ; SSE2-NEXT: pand %xmm9, %xmm3
650 ; SSE2-NEXT: packuswb %xmm4, %xmm3
651 ; SSE2-NEXT: pand %xmm9, %xmm2
652 ; SSE2-NEXT: pand %xmm9, %xmm1
653 ; SSE2-NEXT: packuswb %xmm2, %xmm1
654 ; SSE2-NEXT: packuswb %xmm3, %xmm1
655 ; SSE2-NEXT: pandn %xmm1, %xmm0
658 ; SSSE3-LABEL: test14:
659 ; SSSE3: # %bb.0: # %vector.ph
660 ; SSSE3-NEXT: movdqa %xmm0, %xmm5
661 ; SSSE3-NEXT: pxor %xmm0, %xmm0
662 ; SSSE3-NEXT: movdqa %xmm5, %xmm7
663 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
664 ; SSSE3-NEXT: movdqa %xmm7, %xmm8
665 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
666 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
667 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
668 ; SSSE3-NEXT: movdqa %xmm5, %xmm10
669 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
670 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
671 ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
672 ; SSSE3-NEXT: movdqa %xmm2, %xmm9
673 ; SSSE3-NEXT: pxor %xmm0, %xmm9
674 ; SSSE3-NEXT: psubd %xmm5, %xmm2
675 ; SSSE3-NEXT: por %xmm0, %xmm5
676 ; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5
677 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
678 ; SSSE3-NEXT: pshufb %xmm9, %xmm5
679 ; SSSE3-NEXT: movdqa %xmm1, %xmm6
680 ; SSSE3-NEXT: pxor %xmm0, %xmm6
681 ; SSSE3-NEXT: psubd %xmm10, %xmm1
682 ; SSSE3-NEXT: por %xmm0, %xmm10
683 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm10
684 ; SSSE3-NEXT: pshufb %xmm9, %xmm10
685 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
686 ; SSSE3-NEXT: movdqa %xmm4, %xmm5
687 ; SSSE3-NEXT: pxor %xmm0, %xmm5
688 ; SSSE3-NEXT: psubd %xmm7, %xmm4
689 ; SSSE3-NEXT: por %xmm0, %xmm7
690 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
691 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
692 ; SSSE3-NEXT: pshufb %xmm5, %xmm7
693 ; SSSE3-NEXT: movdqa %xmm3, %xmm6
694 ; SSSE3-NEXT: pxor %xmm0, %xmm6
695 ; SSSE3-NEXT: por %xmm8, %xmm0
696 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
697 ; SSSE3-NEXT: pshufb %xmm5, %xmm0
698 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
699 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
700 ; SSSE3-NEXT: psubd %xmm8, %xmm3
701 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
702 ; SSSE3-NEXT: pand %xmm5, %xmm4
703 ; SSSE3-NEXT: pand %xmm5, %xmm3
704 ; SSSE3-NEXT: packuswb %xmm4, %xmm3
705 ; SSSE3-NEXT: pand %xmm5, %xmm2
706 ; SSSE3-NEXT: pand %xmm5, %xmm1
707 ; SSSE3-NEXT: packuswb %xmm2, %xmm1
708 ; SSSE3-NEXT: packuswb %xmm3, %xmm1
709 ; SSSE3-NEXT: andnpd %xmm1, %xmm0
712 ; SSE41-LABEL: test14:
713 ; SSE41: # %bb.0: # %vector.ph
714 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
715 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
716 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
717 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
718 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
719 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
720 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
721 ; SSE41-NEXT: movdqa %xmm4, %xmm0
722 ; SSE41-NEXT: pmaxud %xmm10, %xmm0
723 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
724 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
725 ; SSE41-NEXT: pxor %xmm6, %xmm0
726 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
727 ; SSE41-NEXT: pshufb %xmm7, %xmm0
728 ; SSE41-NEXT: movdqa %xmm3, %xmm5
729 ; SSE41-NEXT: pmaxud %xmm9, %xmm5
730 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
731 ; SSE41-NEXT: pxor %xmm6, %xmm5
732 ; SSE41-NEXT: pshufb %xmm7, %xmm5
733 ; SSE41-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
734 ; SSE41-NEXT: movdqa %xmm1, %xmm0
735 ; SSE41-NEXT: pmaxud %xmm8, %xmm0
736 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
737 ; SSE41-NEXT: pxor %xmm6, %xmm0
738 ; SSE41-NEXT: movdqa {{.*#+}} xmm12 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
739 ; SSE41-NEXT: pshufb %xmm12, %xmm0
740 ; SSE41-NEXT: movdqa %xmm2, %xmm7
741 ; SSE41-NEXT: pmaxud %xmm11, %xmm7
742 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
743 ; SSE41-NEXT: pxor %xmm6, %xmm7
744 ; SSE41-NEXT: pshufb %xmm12, %xmm7
745 ; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
746 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
747 ; SSE41-NEXT: psubd %xmm11, %xmm2
748 ; SSE41-NEXT: psubd %xmm8, %xmm1
749 ; SSE41-NEXT: psubd %xmm9, %xmm3
750 ; SSE41-NEXT: psubd %xmm10, %xmm4
751 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
752 ; SSE41-NEXT: pand %xmm5, %xmm4
753 ; SSE41-NEXT: pand %xmm5, %xmm3
754 ; SSE41-NEXT: packusdw %xmm4, %xmm3
755 ; SSE41-NEXT: pand %xmm5, %xmm1
756 ; SSE41-NEXT: pand %xmm5, %xmm2
757 ; SSE41-NEXT: packusdw %xmm2, %xmm1
758 ; SSE41-NEXT: packuswb %xmm3, %xmm1
759 ; SSE41-NEXT: pandn %xmm1, %xmm0
762 ; AVX1-LABEL: test14:
763 ; AVX1: # %bb.0: # %vector.ph
764 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
765 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
766 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
767 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
768 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
769 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
770 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
771 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
772 ; AVX1-NEXT: vpmaxud %xmm0, %xmm6, %xmm7
773 ; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm7
774 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
775 ; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7
776 ; AVX1-NEXT: vpmaxud %xmm11, %xmm2, %xmm4
777 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4
778 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
779 ; AVX1-NEXT: vpackssdw %xmm7, %xmm4, %xmm10
780 ; AVX1-NEXT: vpmaxud %xmm9, %xmm1, %xmm7
781 ; AVX1-NEXT: vpcmpeqd %xmm7, %xmm1, %xmm7
782 ; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7
783 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
784 ; AVX1-NEXT: vpmaxud %xmm8, %xmm4, %xmm5
785 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm5
786 ; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm3
787 ; AVX1-NEXT: vpackssdw %xmm3, %xmm7, %xmm3
788 ; AVX1-NEXT: vpacksswb %xmm10, %xmm3, %xmm3
789 ; AVX1-NEXT: vpsubd %xmm8, %xmm4, %xmm4
790 ; AVX1-NEXT: vpsubd %xmm9, %xmm1, %xmm1
791 ; AVX1-NEXT: vpsubd %xmm11, %xmm2, %xmm2
792 ; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0
793 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
794 ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
795 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
796 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
797 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
798 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm2
799 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
800 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
801 ; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
802 ; AVX1-NEXT: vzeroupper
805 ; AVX2-LABEL: test14:
806 ; AVX2: # %bb.0: # %vector.ph
807 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
808 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
809 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
810 ; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm4
811 ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm4
812 ; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
813 ; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4
814 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
815 ; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
816 ; AVX2-NEXT: vpmaxud %ymm3, %ymm2, %ymm6
817 ; AVX2-NEXT: vpcmpeqd %ymm6, %ymm2, %ymm6
818 ; AVX2-NEXT: vpxor %ymm5, %ymm6, %ymm5
819 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
820 ; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
821 ; AVX2-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
822 ; AVX2-NEXT: vpsubd %ymm3, %ymm2, %ymm2
823 ; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
824 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
825 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
826 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
827 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
828 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
829 ; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1
830 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
831 ; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
832 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
833 ; AVX2-NEXT: vpandn %xmm0, %xmm4, %xmm0
834 ; AVX2-NEXT: vzeroupper
837 ; AVX512-LABEL: test14:
838 ; AVX512: # %bb.0: # %vector.ph
839 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
840 ; AVX512-NEXT: vpcmpnltud %zmm0, %zmm1, %k1
841 ; AVX512-NEXT: vpsubd %zmm0, %zmm1, %zmm0
842 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
843 ; AVX512-NEXT: vzeroupper
846 %rhs = zext <16 x i8> %x to <16 x i32>
847 %cond = icmp ult <16 x i32> %y, %rhs
848 %sub = sub <16 x i32> %y, %rhs
849 %truncsub = trunc <16 x i32> %sub to <16 x i8>
850 %res = select <16 x i1> %cond, <16 x i8> zeroinitializer, <16 x i8> %truncsub
; test15: unsigned saturating-subtract idiom with a widened left operand.
; %x (<8 x i16>) is zero-extended to <8 x i32> and compared `ugt` against %y;
; the difference (zext(%x) - %y) truncated to i16 is selected when the compare
; is true, zero otherwise.
; In the checks below no prefix contains a psubusw: each one shows an explicit
; unsigned compare, a 32-bit subtract and a truncate (AVX512 uses a
; zero-masked vpmovdw).
854 define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
855 ; SSE2-LABEL: test15:
856 ; SSE2: # %bb.0: # %vector.ph
857 ; SSE2-NEXT: movdqa %xmm0, %xmm3
858 ; SSE2-NEXT: pxor %xmm4, %xmm4
859 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
860 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
861 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
862 ; SSE2-NEXT: movdqa %xmm3, %xmm5
863 ; SSE2-NEXT: psubd %xmm2, %xmm3
864 ; SSE2-NEXT: pxor %xmm4, %xmm2
865 ; SSE2-NEXT: por %xmm4, %xmm5
866 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
867 ; SSE2-NEXT: movdqa %xmm1, %xmm2
868 ; SSE2-NEXT: pxor %xmm4, %xmm2
869 ; SSE2-NEXT: por %xmm0, %xmm4
870 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
871 ; SSE2-NEXT: packssdw %xmm5, %xmm4
872 ; SSE2-NEXT: psubd %xmm1, %xmm0
873 ; SSE2-NEXT: pslld $16, %xmm3
874 ; SSE2-NEXT: psrad $16, %xmm3
875 ; SSE2-NEXT: pslld $16, %xmm0
876 ; SSE2-NEXT: psrad $16, %xmm0
877 ; SSE2-NEXT: packssdw %xmm3, %xmm0
878 ; SSE2-NEXT: pand %xmm4, %xmm0
881 ; SSSE3-LABEL: test15:
882 ; SSSE3: # %bb.0: # %vector.ph
883 ; SSSE3-NEXT: pxor %xmm4, %xmm4
884 ; SSSE3-NEXT: movdqa %xmm0, %xmm3
885 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
886 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
887 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
888 ; SSSE3-NEXT: movdqa %xmm0, %xmm5
889 ; SSSE3-NEXT: psubd %xmm2, %xmm0
890 ; SSSE3-NEXT: pxor %xmm4, %xmm2
891 ; SSSE3-NEXT: por %xmm4, %xmm5
892 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
893 ; SSSE3-NEXT: movdqa %xmm1, %xmm2
894 ; SSSE3-NEXT: pxor %xmm4, %xmm2
895 ; SSSE3-NEXT: por %xmm3, %xmm4
896 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
897 ; SSSE3-NEXT: packssdw %xmm5, %xmm4
898 ; SSSE3-NEXT: psubd %xmm1, %xmm3
899 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
900 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
901 ; SSSE3-NEXT: pshufb %xmm1, %xmm3
902 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
903 ; SSSE3-NEXT: pand %xmm4, %xmm3
904 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
907 ; SSE41-LABEL: test15:
908 ; SSE41: # %bb.0: # %vector.ph
909 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
910 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
911 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
912 ; SSE41-NEXT: movdqa %xmm0, %xmm4
913 ; SSE41-NEXT: pminud %xmm1, %xmm4
914 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
915 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
916 ; SSE41-NEXT: pxor %xmm5, %xmm4
917 ; SSE41-NEXT: movdqa %xmm3, %xmm6
918 ; SSE41-NEXT: pminud %xmm2, %xmm6
919 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
920 ; SSE41-NEXT: pxor %xmm5, %xmm6
921 ; SSE41-NEXT: packssdw %xmm6, %xmm4
922 ; SSE41-NEXT: psubd %xmm2, %xmm3
923 ; SSE41-NEXT: psubd %xmm1, %xmm0
924 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
925 ; SSE41-NEXT: pshufb %xmm1, %xmm0
926 ; SSE41-NEXT: pshufb %xmm1, %xmm3
927 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
928 ; SSE41-NEXT: pand %xmm4, %xmm0
931 ; AVX1-LABEL: test15:
932 ; AVX1: # %bb.0: # %vector.ph
933 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
934 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
935 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
936 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
937 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm4
938 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4
939 ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm5
940 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm0, %xmm5
941 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
942 ; AVX1-NEXT: vpandn %xmm0, %xmm5, %xmm0
943 ; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm1
944 ; AVX1-NEXT: vpandn %xmm1, %xmm4, %xmm1
945 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
946 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
947 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
948 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
949 ; AVX1-NEXT: vzeroupper
952 ; AVX2-LABEL: test15:
953 ; AVX2: # %bb.0: # %vector.ph
954 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
955 ; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm2
956 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2
957 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
958 ; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0
959 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
960 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
961 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
962 ; AVX2-NEXT: vzeroupper
965 ; AVX512-LABEL: test15:
966 ; AVX512: # %bb.0: # %vector.ph
967 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
968 ; AVX512-NEXT: vpcmpnleud %ymm1, %ymm0, %k1
969 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
970 ; AVX512-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
971 ; AVX512-NEXT: vzeroupper
974 %lhs = zext <8 x i16> %x to <8 x i32>
975 %cond = icmp ugt <8 x i32> %lhs, %y
976 %sub = sub <8 x i32> %lhs, %y
977 %truncsub = trunc <8 x i32> %sub to <8 x i16>
978 %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
; test16: same widened saturating-subtract idiom as test15, but the compare
; is written with swapped operands (`icmp ult %y, zext(%x)`).
; The select still yields trunc(zext(%x) - %y) when the compare is true and
; zero otherwise.
; In the checks below no prefix contains a psubusw: each one shows an explicit
; unsigned compare, a 32-bit subtract and a truncate (AVX512 uses a
; zero-masked vpmovdw).
982 define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
983 ; SSE2-LABEL: test16:
984 ; SSE2: # %bb.0: # %vector.ph
985 ; SSE2-NEXT: movdqa %xmm0, %xmm3
986 ; SSE2-NEXT: pxor %xmm4, %xmm4
987 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
988 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
989 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
990 ; SSE2-NEXT: movdqa %xmm3, %xmm5
991 ; SSE2-NEXT: psubd %xmm2, %xmm3
992 ; SSE2-NEXT: pxor %xmm4, %xmm2
993 ; SSE2-NEXT: por %xmm4, %xmm5
994 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
995 ; SSE2-NEXT: movdqa %xmm1, %xmm2
996 ; SSE2-NEXT: pxor %xmm4, %xmm2
997 ; SSE2-NEXT: por %xmm0, %xmm4
998 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
999 ; SSE2-NEXT: packssdw %xmm5, %xmm4
1000 ; SSE2-NEXT: psubd %xmm1, %xmm0
1001 ; SSE2-NEXT: pslld $16, %xmm3
1002 ; SSE2-NEXT: psrad $16, %xmm3
1003 ; SSE2-NEXT: pslld $16, %xmm0
1004 ; SSE2-NEXT: psrad $16, %xmm0
1005 ; SSE2-NEXT: packssdw %xmm3, %xmm0
1006 ; SSE2-NEXT: pand %xmm4, %xmm0
1009 ; SSSE3-LABEL: test16:
1010 ; SSSE3: # %bb.0: # %vector.ph
1011 ; SSSE3-NEXT: pxor %xmm4, %xmm4
1012 ; SSSE3-NEXT: movdqa %xmm0, %xmm3
1013 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1014 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1015 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
1016 ; SSSE3-NEXT: movdqa %xmm0, %xmm5
1017 ; SSSE3-NEXT: psubd %xmm2, %xmm0
1018 ; SSSE3-NEXT: pxor %xmm4, %xmm2
1019 ; SSSE3-NEXT: por %xmm4, %xmm5
1020 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
1021 ; SSSE3-NEXT: movdqa %xmm1, %xmm2
1022 ; SSSE3-NEXT: pxor %xmm4, %xmm2
1023 ; SSSE3-NEXT: por %xmm3, %xmm4
1024 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
1025 ; SSSE3-NEXT: packssdw %xmm5, %xmm4
1026 ; SSSE3-NEXT: psubd %xmm1, %xmm3
1027 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1028 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
1029 ; SSSE3-NEXT: pshufb %xmm1, %xmm3
1030 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
1031 ; SSSE3-NEXT: pand %xmm4, %xmm3
1032 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
1035 ; SSE41-LABEL: test16:
1036 ; SSE41: # %bb.0: # %vector.ph
1037 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
1038 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
1039 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1040 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1041 ; SSE41-NEXT: pmaxud %xmm0, %xmm4
1042 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm4
1043 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
1044 ; SSE41-NEXT: pxor %xmm5, %xmm4
1045 ; SSE41-NEXT: movdqa %xmm2, %xmm6
1046 ; SSE41-NEXT: pmaxud %xmm3, %xmm6
1047 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm6
1048 ; SSE41-NEXT: pxor %xmm5, %xmm6
1049 ; SSE41-NEXT: packssdw %xmm6, %xmm4
1050 ; SSE41-NEXT: psubd %xmm2, %xmm3
1051 ; SSE41-NEXT: psubd %xmm1, %xmm0
1052 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1053 ; SSE41-NEXT: pshufb %xmm1, %xmm0
1054 ; SSE41-NEXT: pshufb %xmm1, %xmm3
1055 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
1056 ; SSE41-NEXT: pand %xmm4, %xmm0
1059 ; AVX1-LABEL: test16:
1060 ; AVX1: # %bb.0: # %vector.ph
1061 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1062 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1063 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1064 ; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm3
1065 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3
1066 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1067 ; AVX1-NEXT: vpmaxud %xmm2, %xmm4, %xmm5
1068 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm5
1069 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
1070 ; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
1071 ; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm1
1072 ; AVX1-NEXT: vpandn %xmm1, %xmm5, %xmm1
1073 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1074 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1075 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1076 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1077 ; AVX1-NEXT: vzeroupper
1080 ; AVX2-LABEL: test16:
1081 ; AVX2: # %bb.0: # %vector.ph
1082 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1083 ; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm2
1084 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm2
1085 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
1086 ; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0
1087 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1088 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1089 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1090 ; AVX2-NEXT: vzeroupper
1093 ; AVX512-LABEL: test16:
1094 ; AVX512: # %bb.0: # %vector.ph
1095 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1096 ; AVX512-NEXT: vpcmpltud %ymm0, %ymm1, %k1
1097 ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
1098 ; AVX512-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
1099 ; AVX512-NEXT: vzeroupper
1102 %lhs = zext <8 x i16> %x to <8 x i32>
1103 %cond = icmp ult <8 x i32> %y, %lhs
1104 %sub = sub <8 x i32> %lhs, %y
1105 %truncsub = trunc <8 x i32> %sub to <8 x i16>
1106 %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
; test17: 64 x i8 unsigned saturating subtract of a splatted scalar.
; %w is inserted into lane 0 and broadcast with a zero shuffle mask; the
; result is select(%x <u splat, 0, %x - splat).
; Every check prefix below lowers the compare/sub/select to psubusb/vpsubusb,
; split across as many registers as the vector width requires.
1110 define <64 x i8> @test17(<64 x i8> %x, i8 zeroext %w) nounwind {
1111 ; SSE2-LABEL: test17:
1112 ; SSE2: # %bb.0: # %vector.ph
1113 ; SSE2-NEXT: movd %edi, %xmm4
1114 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1115 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7]
1116 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
1117 ; SSE2-NEXT: psubusb %xmm4, %xmm0
1118 ; SSE2-NEXT: psubusb %xmm4, %xmm1
1119 ; SSE2-NEXT: psubusb %xmm4, %xmm2
1120 ; SSE2-NEXT: psubusb %xmm4, %xmm3
1123 ; SSSE3-LABEL: test17:
1124 ; SSSE3: # %bb.0: # %vector.ph
1125 ; SSSE3-NEXT: movd %edi, %xmm4
1126 ; SSSE3-NEXT: pxor %xmm5, %xmm5
1127 ; SSSE3-NEXT: pshufb %xmm5, %xmm4
1128 ; SSSE3-NEXT: psubusb %xmm4, %xmm0
1129 ; SSSE3-NEXT: psubusb %xmm4, %xmm1
1130 ; SSSE3-NEXT: psubusb %xmm4, %xmm2
1131 ; SSSE3-NEXT: psubusb %xmm4, %xmm3
1134 ; SSE41-LABEL: test17:
1135 ; SSE41: # %bb.0: # %vector.ph
1136 ; SSE41-NEXT: movd %edi, %xmm4
1137 ; SSE41-NEXT: pxor %xmm5, %xmm5
1138 ; SSE41-NEXT: pshufb %xmm5, %xmm4
1139 ; SSE41-NEXT: psubusb %xmm4, %xmm0
1140 ; SSE41-NEXT: psubusb %xmm4, %xmm1
1141 ; SSE41-NEXT: psubusb %xmm4, %xmm2
1142 ; SSE41-NEXT: psubusb %xmm4, %xmm3
1145 ; AVX1-LABEL: test17:
1146 ; AVX1: # %bb.0: # %vector.ph
1147 ; AVX1-NEXT: vmovd %edi, %xmm2
1148 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1149 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1150 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1151 ; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm3
1152 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
1153 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
1154 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1155 ; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm3
1156 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
1157 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1160 ; AVX2-LABEL: test17:
1161 ; AVX2: # %bb.0: # %vector.ph
1162 ; AVX2-NEXT: vmovd %edi, %xmm2
1163 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
1164 ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
1165 ; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1
1168 ; AVX512-LABEL: test17:
1169 ; AVX512: # %bb.0: # %vector.ph
1170 ; AVX512-NEXT: vpbroadcastb %edi, %zmm1
1171 ; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0
1174 %0 = insertelement <64 x i8> undef, i8 %w, i32 0
1175 %broadcast15 = shufflevector <64 x i8> %0, <64 x i8> undef, <64 x i32> zeroinitializer
1176 %1 = icmp ult <64 x i8> %x, %broadcast15
1177 %2 = sub <64 x i8> %x, %broadcast15
1178 %res = select <64 x i1> %1, <64 x i8> zeroinitializer, <64 x i8> %2
; test18: 32 x i16 unsigned saturating subtract of a splatted scalar.
; %w is inserted into lane 0 and broadcast with a zero shuffle mask; the
; result is select(%x <u splat, 0, %x - splat).
; Every check prefix below lowers the compare/sub/select to psubusw/vpsubusw,
; split across as many registers as the vector width requires.
1182 define <32 x i16> @test18(<32 x i16> %x, i16 zeroext %w) nounwind {
1183 ; SSE-LABEL: test18:
1184 ; SSE: # %bb.0: # %vector.ph
1185 ; SSE-NEXT: movd %edi, %xmm4
1186 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7]
1187 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
1188 ; SSE-NEXT: psubusw %xmm4, %xmm0
1189 ; SSE-NEXT: psubusw %xmm4, %xmm1
1190 ; SSE-NEXT: psubusw %xmm4, %xmm2
1191 ; SSE-NEXT: psubusw %xmm4, %xmm3
1194 ; AVX1-LABEL: test18:
1195 ; AVX1: # %bb.0: # %vector.ph
1196 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1197 ; AVX1-NEXT: vmovd %edi, %xmm3
1198 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
1199 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
1200 ; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2
1201 ; AVX1-NEXT: vpsubusw %xmm3, %xmm0, %xmm0
1202 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1203 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1204 ; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2
1205 ; AVX1-NEXT: vpsubusw %xmm3, %xmm1, %xmm1
1206 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1209 ; AVX2-LABEL: test18:
1210 ; AVX2: # %bb.0: # %vector.ph
1211 ; AVX2-NEXT: vmovd %edi, %xmm2
1212 ; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
1213 ; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
1214 ; AVX2-NEXT: vpsubusw %ymm2, %ymm1, %ymm1
1217 ; AVX512-LABEL: test18:
1218 ; AVX512: # %bb.0: # %vector.ph
1219 ; AVX512-NEXT: vpbroadcastw %edi, %zmm1
1220 ; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
1223 %0 = insertelement <32 x i16> undef, i16 %w, i32 0
1224 %broadcast15 = shufflevector <32 x i16> %0, <32 x i16> undef, <32 x i32> zeroinitializer
1225 %1 = icmp ult <32 x i16> %x, %broadcast15
1226 %2 = sub <32 x i16> %x, %broadcast15
1227 %res = select <32 x i1> %1, <32 x i16> zeroinitializer, <32 x i16> %2
; psubus_8i16_max: saturating subtract written as "unsigned max, then sub".
; %max = (%x <u %y) ? %y : %x, and the result is %max - %y.
; Both the SSE and AVX prefixes below check for a single psubusw/vpsubusw.
1231 define <8 x i16> @psubus_8i16_max(<8 x i16> %x, <8 x i16> %y) nounwind {
1232 ; SSE-LABEL: psubus_8i16_max:
1233 ; SSE: # %bb.0: # %vector.ph
1234 ; SSE-NEXT: psubusw %xmm1, %xmm0
1237 ; AVX-LABEL: psubus_8i16_max:
1238 ; AVX: # %bb.0: # %vector.ph
1239 ; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1242 %cmp = icmp ult <8 x i16> %x, %y
1243 %max = select <8 x i1> %cmp, <8 x i16> %y, <8 x i16> %x
1244 %res = sub <8 x i16> %max, %y
; psubus_16i8_max: saturating subtract written as "unsigned max, then sub".
; %max = (%x <u %y) ? %y : %x, and the result is %max - %y.
; Both the SSE and AVX prefixes below check for a single psubusb/vpsubusb.
1248 define <16 x i8> @psubus_16i8_max(<16 x i8> %x, <16 x i8> %y) nounwind {
1249 ; SSE-LABEL: psubus_16i8_max:
1250 ; SSE: # %bb.0: # %vector.ph
1251 ; SSE-NEXT: psubusb %xmm1, %xmm0
1254 ; AVX-LABEL: psubus_16i8_max:
1255 ; AVX: # %bb.0: # %vector.ph
1256 ; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
1259 %cmp = icmp ult <16 x i8> %x, %y
1260 %max = select <16 x i1> %cmp, <16 x i8> %y, <16 x i8> %x
1261 %res = sub <16 x i8> %max, %y
; psubus_16i16_max: 256-bit variant of the "unsigned max, then sub" idiom.
; %max = (%x <u %y) ? %y : %x, and the result is %max - %y.
; The checks below show two xmm psubusw ops for SSE, AVX1 working on the
; extracted 128-bit halves, and a single ymm vpsubusw for AVX2 and AVX512.
1265 define <16 x i16> @psubus_16i16_max(<16 x i16> %x, <16 x i16> %y) nounwind {
1266 ; SSE-LABEL: psubus_16i16_max:
1267 ; SSE: # %bb.0: # %vector.ph
1268 ; SSE-NEXT: psubusw %xmm2, %xmm0
1269 ; SSE-NEXT: psubusw %xmm3, %xmm1
1272 ; AVX1-LABEL: psubus_16i16_max:
1273 ; AVX1: # %bb.0: # %vector.ph
1274 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1275 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1276 ; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2
1277 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1278 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1281 ; AVX2-LABEL: psubus_16i16_max:
1282 ; AVX2: # %bb.0: # %vector.ph
1283 ; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
1286 ; AVX512-LABEL: psubus_16i16_max:
1287 ; AVX512: # %bb.0: # %vector.ph
1288 ; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
1291 %cmp = icmp ult <16 x i16> %x, %y
1292 %max = select <16 x i1> %cmp, <16 x i16> %y, <16 x i16> %x
1293 %res = sub <16 x i16> %max, %y
; psubus_32i16_max: 512-bit variant of the "unsigned max, then sub" idiom.
; %max = (%x <u %y) ? %y : %x, and the result is %max - %y.
; The checks below show four xmm psubusw ops for SSE and AVX1 (AVX1 via
; extract/insert of 128-bit halves), two ymm ops for AVX2, and a single zmm
; vpsubusw for AVX512.
1297 define <32 x i16> @psubus_32i16_max(<32 x i16> %x, <32 x i16> %y) nounwind {
1298 ; SSE-LABEL: psubus_32i16_max:
1299 ; SSE: # %bb.0: # %vector.ph
1300 ; SSE-NEXT: psubusw %xmm4, %xmm0
1301 ; SSE-NEXT: psubusw %xmm5, %xmm1
1302 ; SSE-NEXT: psubusw %xmm6, %xmm2
1303 ; SSE-NEXT: psubusw %xmm7, %xmm3
1306 ; AVX1-LABEL: psubus_32i16_max:
1307 ; AVX1: # %bb.0: # %vector.ph
1308 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
1309 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
1310 ; AVX1-NEXT: vpsubusw %xmm4, %xmm5, %xmm4
1311 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
1312 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1313 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
1314 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1315 ; AVX1-NEXT: vpsubusw %xmm2, %xmm4, %xmm2
1316 ; AVX1-NEXT: vpsubusw %xmm3, %xmm1, %xmm1
1317 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1320 ; AVX2-LABEL: psubus_32i16_max:
1321 ; AVX2: # %bb.0: # %vector.ph
1322 ; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
1323 ; AVX2-NEXT: vpsubusw %ymm3, %ymm1, %ymm1
1326 ; AVX512-LABEL: psubus_32i16_max:
1327 ; AVX512: # %bb.0: # %vector.ph
1328 ; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
1331 %cmp = icmp ult <32 x i16> %x, %y
1332 %max = select <32 x i1> %cmp, <32 x i16> %y, <32 x i16> %x
1333 %res = sub <32 x i16> %max, %y
; psubus_64i8_max: 512-bit byte variant of the "unsigned max, then sub" idiom.
; %max = (%x <u %y) ? %y : %x, and the result is %max - %y.
; The checks below show four xmm psubusb ops for SSE and AVX1 (AVX1 via
; extract/insert of 128-bit halves), two ymm ops for AVX2, and a single zmm
; vpsubusb for AVX512.
1337 define <64 x i8> @psubus_64i8_max(<64 x i8> %x, <64 x i8> %y) nounwind {
1338 ; SSE-LABEL: psubus_64i8_max:
1339 ; SSE: # %bb.0: # %vector.ph
1340 ; SSE-NEXT: psubusb %xmm4, %xmm0
1341 ; SSE-NEXT: psubusb %xmm5, %xmm1
1342 ; SSE-NEXT: psubusb %xmm6, %xmm2
1343 ; SSE-NEXT: psubusb %xmm7, %xmm3
1346 ; AVX1-LABEL: psubus_64i8_max:
1347 ; AVX1: # %bb.0: # %vector.ph
1348 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
1349 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
1350 ; AVX1-NEXT: vpsubusb %xmm4, %xmm5, %xmm4
1351 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
1352 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1353 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
1354 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1355 ; AVX1-NEXT: vpsubusb %xmm2, %xmm4, %xmm2
1356 ; AVX1-NEXT: vpsubusb %xmm3, %xmm1, %xmm1
1357 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1360 ; AVX2-LABEL: psubus_64i8_max:
1361 ; AVX2: # %bb.0: # %vector.ph
1362 ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
1363 ; AVX2-NEXT: vpsubusb %ymm3, %ymm1, %ymm1
1366 ; AVX512-LABEL: psubus_64i8_max:
1367 ; AVX512: # %bb.0: # %vector.ph
1368 ; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0
1371 %cmp = icmp ult <64 x i8> %x, %y
1372 %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
1373 %res = sub <64 x i8> %max, %y
; psubus_32i8_max: 256-bit byte variant of the "unsigned max, then sub" idiom.
; %max = (%x <u %y) ? %y : %x, and the result is %max - %y.
; The checks below show two xmm psubusb ops for SSE, AVX1 working on the
; extracted 128-bit halves, and a single ymm vpsubusb for AVX2 and AVX512.
1377 define <32 x i8> @psubus_32i8_max(<32 x i8> %x, <32 x i8> %y) nounwind {
1378 ; SSE-LABEL: psubus_32i8_max:
1379 ; SSE: # %bb.0: # %vector.ph
1380 ; SSE-NEXT: psubusb %xmm2, %xmm0
1381 ; SSE-NEXT: psubusb %xmm3, %xmm1
1384 ; AVX1-LABEL: psubus_32i8_max:
1385 ; AVX1: # %bb.0: # %vector.ph
1386 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1387 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1388 ; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm2
1389 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
1390 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1393 ; AVX2-LABEL: psubus_32i8_max:
1394 ; AVX2: # %bb.0: # %vector.ph
1395 ; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
1398 ; AVX512-LABEL: psubus_32i8_max:
1399 ; AVX512: # %bb.0: # %vector.ph
1400 ; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
1403 %cmp = icmp ult <32 x i8> %x, %y
1404 %max = select <32 x i1> %cmp, <32 x i8> %y, <32 x i8> %x
1405 %res = sub <32 x i8> %max, %y
; psubus_8i32_max: "unsigned max, then sub" idiom performed in a wider type.
; %x (<8 x i16>) is zero-extended to <8 x i32>; %max = (zext(%x) <u %y) ? %y
; : zext(%x); the result is trunc(%max - %y) back to <8 x i16>.
; In the checks below, SSE2 expands the compare/select/sub in 32-bit lanes.
; SSSE3, SSE41, AVX1 and AVX2 first clamp %y against the 65535 constant, pack
; it to 16-bit lanes and then use psubusw/vpsubusw. AVX512 narrows %y with
; vpmovusdw before the vpsubusw.
1409 define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
1410 ; SSE2-LABEL: psubus_8i32_max:
1411 ; SSE2: # %bb.0: # %vector.ph
1412 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1413 ; SSE2-NEXT: pxor %xmm4, %xmm4
1414 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1415 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1416 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
1417 ; SSE2-NEXT: movdqa %xmm2, %xmm6
1418 ; SSE2-NEXT: pxor %xmm5, %xmm6
1419 ; SSE2-NEXT: movdqa %xmm3, %xmm4
1420 ; SSE2-NEXT: por %xmm5, %xmm4
1421 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
1422 ; SSE2-NEXT: pand %xmm4, %xmm3
1423 ; SSE2-NEXT: pandn %xmm2, %xmm4
1424 ; SSE2-NEXT: por %xmm3, %xmm4
1425 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1426 ; SSE2-NEXT: pxor %xmm5, %xmm3
1427 ; SSE2-NEXT: por %xmm0, %xmm5
1428 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
1429 ; SSE2-NEXT: pand %xmm5, %xmm0
1430 ; SSE2-NEXT: pandn %xmm1, %xmm5
1431 ; SSE2-NEXT: por %xmm5, %xmm0
1432 ; SSE2-NEXT: psubd %xmm1, %xmm0
1433 ; SSE2-NEXT: psubd %xmm2, %xmm4
1434 ; SSE2-NEXT: pslld $16, %xmm4
1435 ; SSE2-NEXT: psrad $16, %xmm4
1436 ; SSE2-NEXT: pslld $16, %xmm0
1437 ; SSE2-NEXT: psrad $16, %xmm0
1438 ; SSE2-NEXT: packssdw %xmm4, %xmm0
1441 ; SSSE3-LABEL: psubus_8i32_max:
1442 ; SSSE3: # %bb.0: # %vector.ph
1443 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1444 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
1445 ; SSSE3-NEXT: movdqa %xmm2, %xmm5
1446 ; SSSE3-NEXT: pxor %xmm4, %xmm5
1447 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
1448 ; SSSE3-NEXT: movdqa %xmm6, %xmm7
1449 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
1450 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
1451 ; SSSE3-NEXT: pand %xmm7, %xmm2
1452 ; SSSE3-NEXT: pandn %xmm5, %xmm7
1453 ; SSSE3-NEXT: por %xmm2, %xmm7
1454 ; SSSE3-NEXT: pshufb %xmm3, %xmm7
1455 ; SSSE3-NEXT: pxor %xmm1, %xmm4
1456 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
1457 ; SSSE3-NEXT: pand %xmm6, %xmm1
1458 ; SSSE3-NEXT: pandn %xmm5, %xmm6
1459 ; SSSE3-NEXT: por %xmm1, %xmm6
1460 ; SSSE3-NEXT: pshufb %xmm3, %xmm6
1461 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1462 ; SSSE3-NEXT: psubusw %xmm6, %xmm0
1465 ; SSE41-LABEL: psubus_8i32_max:
1466 ; SSE41: # %bb.0: # %vector.ph
1467 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
1468 ; SSE41-NEXT: pminud %xmm3, %xmm2
1469 ; SSE41-NEXT: pminud %xmm3, %xmm1
1470 ; SSE41-NEXT: packusdw %xmm2, %xmm1
1471 ; SSE41-NEXT: psubusw %xmm1, %xmm0
1474 ; AVX1-LABEL: psubus_8i32_max:
1475 ; AVX1: # %bb.0: # %vector.ph
1476 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1477 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
1478 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
1479 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
1480 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1481 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1482 ; AVX1-NEXT: vzeroupper
1485 ; AVX2-LABEL: psubus_8i32_max:
1486 ; AVX2: # %bb.0: # %vector.ph
1487 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
1488 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
1489 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1490 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1491 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1492 ; AVX2-NEXT: vzeroupper
1495 ; AVX512-LABEL: psubus_8i32_max:
1496 ; AVX512: # %bb.0: # %vector.ph
1497 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
1498 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1499 ; AVX512-NEXT: vzeroupper
1502 %lhs = zext <8 x i16> %x to <8 x i32>
1503 %cond = icmp ult <8 x i32> %lhs, %y
1504 %max = select <8 x i1> %cond, <8 x i32> %y, <8 x i32> %lhs
1505 %sub = sub <8 x i32> %max, %y
1506 %res = trunc <8 x i32> %sub to <8 x i16>
1510 define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
1511 ; SSE2-LABEL: psubus_8i64_max:
1512 ; SSE2: # %bb.0: # %vector.ph
1513 ; SSE2-NEXT: pxor %xmm5, %xmm5
1514 ; SSE2-NEXT: movdqa %xmm0, %xmm10
1515 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
1516 ; SSE2-NEXT: movdqa %xmm10, %xmm8
1517 ; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
1518 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3]
1519 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
1520 ; SSE2-NEXT: movdqa %xmm0, %xmm9
1521 ; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
1522 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
1523 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456]
1524 ; SSE2-NEXT: movdqa %xmm2, %xmm6
1525 ; SSE2-NEXT: pxor %xmm11, %xmm6
1526 ; SSE2-NEXT: movdqa %xmm0, %xmm7
1527 ; SSE2-NEXT: por %xmm11, %xmm7
1528 ; SSE2-NEXT: movdqa %xmm7, %xmm5
1529 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
1530 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
1531 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm7
1532 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
1533 ; SSE2-NEXT: pand %xmm12, %xmm7
1534 ; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3]
1535 ; SSE2-NEXT: por %xmm7, %xmm13
1536 ; SSE2-NEXT: pand %xmm13, %xmm0
1537 ; SSE2-NEXT: pandn %xmm2, %xmm13
1538 ; SSE2-NEXT: por %xmm0, %xmm13
1539 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1540 ; SSE2-NEXT: pxor %xmm11, %xmm0
1541 ; SSE2-NEXT: movdqa %xmm9, %xmm5
1542 ; SSE2-NEXT: por %xmm11, %xmm5
1543 ; SSE2-NEXT: movdqa %xmm5, %xmm7
1544 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
1545 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,2,2]
1546 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm5
1547 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1548 ; SSE2-NEXT: pand %xmm12, %xmm5
1549 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
1550 ; SSE2-NEXT: por %xmm5, %xmm0
1551 ; SSE2-NEXT: pand %xmm0, %xmm9
1552 ; SSE2-NEXT: pandn %xmm1, %xmm0
1553 ; SSE2-NEXT: por %xmm9, %xmm0
1554 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1555 ; SSE2-NEXT: pxor %xmm11, %xmm5
1556 ; SSE2-NEXT: movdqa %xmm10, %xmm7
1557 ; SSE2-NEXT: por %xmm11, %xmm7
1558 ; SSE2-NEXT: movdqa %xmm7, %xmm6
1559 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
1560 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
1561 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm7
1562 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
1563 ; SSE2-NEXT: pand %xmm9, %xmm5
1564 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
1565 ; SSE2-NEXT: por %xmm5, %xmm7
1566 ; SSE2-NEXT: pand %xmm7, %xmm10
1567 ; SSE2-NEXT: pandn %xmm4, %xmm7
1568 ; SSE2-NEXT: por %xmm10, %xmm7
1569 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1570 ; SSE2-NEXT: pxor %xmm11, %xmm5
1571 ; SSE2-NEXT: por %xmm8, %xmm11
1572 ; SSE2-NEXT: movdqa %xmm11, %xmm6
1573 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
1574 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
1575 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm11
1576 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,3,3]
1577 ; SSE2-NEXT: pand %xmm9, %xmm5
1578 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1579 ; SSE2-NEXT: por %xmm5, %xmm6
1580 ; SSE2-NEXT: pand %xmm6, %xmm8
1581 ; SSE2-NEXT: pandn %xmm3, %xmm6
1582 ; SSE2-NEXT: por %xmm8, %xmm6
1583 ; SSE2-NEXT: psubq %xmm3, %xmm6
1584 ; SSE2-NEXT: psubq %xmm4, %xmm7
1585 ; SSE2-NEXT: psubq %xmm1, %xmm0
1586 ; SSE2-NEXT: psubq %xmm2, %xmm13
1587 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,2,3]
1588 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1589 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1590 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
1591 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1592 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3]
1593 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1594 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
1595 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1596 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1597 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1600 ; SSSE3-LABEL: psubus_8i64_max:
1601 ; SSSE3: # %bb.0: # %vector.ph
1602 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
1603 ; SSSE3-NEXT: movdqa %xmm2, %xmm7
1604 ; SSSE3-NEXT: pxor %xmm5, %xmm7
1605 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991]
1606 ; SSSE3-NEXT: movdqa %xmm8, %xmm6
1607 ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6
1608 ; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
1609 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm7
1610 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
1611 ; SSSE3-NEXT: pand %xmm9, %xmm7
1612 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1613 ; SSSE3-NEXT: por %xmm7, %xmm6
1614 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535]
1615 ; SSSE3-NEXT: pand %xmm6, %xmm2
1616 ; SSSE3-NEXT: pandn %xmm9, %xmm6
1617 ; SSSE3-NEXT: por %xmm2, %xmm6
1618 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
1619 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7]
1620 ; SSSE3-NEXT: movdqa %xmm1, %xmm6
1621 ; SSSE3-NEXT: pxor %xmm5, %xmm6
1622 ; SSSE3-NEXT: movdqa %xmm8, %xmm7
1623 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
1624 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
1625 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6
1626 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1627 ; SSSE3-NEXT: pand %xmm2, %xmm6
1628 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
1629 ; SSSE3-NEXT: por %xmm6, %xmm2
1630 ; SSSE3-NEXT: pand %xmm2, %xmm1
1631 ; SSSE3-NEXT: pandn %xmm9, %xmm2
1632 ; SSSE3-NEXT: por %xmm1, %xmm2
1633 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1634 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1635 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
1636 ; SSSE3-NEXT: movdqa %xmm4, %xmm2
1637 ; SSSE3-NEXT: pxor %xmm5, %xmm2
1638 ; SSSE3-NEXT: movdqa %xmm8, %xmm6
1639 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
1640 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
1641 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2
1642 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1643 ; SSSE3-NEXT: pand %xmm7, %xmm2
1644 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
1645 ; SSSE3-NEXT: por %xmm2, %xmm6
1646 ; SSSE3-NEXT: pand %xmm6, %xmm4
1647 ; SSSE3-NEXT: pandn %xmm9, %xmm6
1648 ; SSSE3-NEXT: por %xmm4, %xmm6
1649 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
1650 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
1651 ; SSSE3-NEXT: pxor %xmm3, %xmm5
1652 ; SSSE3-NEXT: movdqa %xmm8, %xmm4
1653 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
1654 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
1655 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm5
1656 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1657 ; SSSE3-NEXT: pand %xmm6, %xmm5
1658 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1659 ; SSSE3-NEXT: por %xmm5, %xmm4
1660 ; SSSE3-NEXT: pand %xmm4, %xmm3
1661 ; SSSE3-NEXT: pandn %xmm9, %xmm4
1662 ; SSSE3-NEXT: por %xmm3, %xmm4
1663 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
1664 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
1665 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
1666 ; SSSE3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
1667 ; SSSE3-NEXT: psubusw %xmm3, %xmm0
1670 ; SSE41-LABEL: psubus_8i64_max:
1671 ; SSE41: # %bb.0: # %vector.ph
1672 ; SSE41-NEXT: movdqa %xmm0, %xmm8
1673 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
1674 ; SSE41-NEXT: movdqa %xmm4, %xmm0
1675 ; SSE41-NEXT: pxor %xmm6, %xmm0
1676 ; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
1677 ; SSE41-NEXT: movdqa %xmm9, %xmm7
1678 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
1679 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
1680 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
1681 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1682 ; SSE41-NEXT: pand %xmm5, %xmm0
1683 ; SSE41-NEXT: por %xmm7, %xmm0
1684 ; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535]
1685 ; SSE41-NEXT: movapd %xmm7, %xmm5
1686 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5
1687 ; SSE41-NEXT: movdqa %xmm3, %xmm0
1688 ; SSE41-NEXT: pxor %xmm6, %xmm0
1689 ; SSE41-NEXT: movdqa %xmm9, %xmm4
1690 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
1691 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
1692 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
1693 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1694 ; SSE41-NEXT: pand %xmm10, %xmm0
1695 ; SSE41-NEXT: por %xmm4, %xmm0
1696 ; SSE41-NEXT: movapd %xmm7, %xmm4
1697 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
1698 ; SSE41-NEXT: packusdw %xmm5, %xmm4
1699 ; SSE41-NEXT: movdqa %xmm2, %xmm0
1700 ; SSE41-NEXT: pxor %xmm6, %xmm0
1701 ; SSE41-NEXT: movdqa %xmm9, %xmm3
1702 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
1703 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
1704 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm0
1705 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1706 ; SSE41-NEXT: pand %xmm5, %xmm0
1707 ; SSE41-NEXT: por %xmm3, %xmm0
1708 ; SSE41-NEXT: movapd %xmm7, %xmm3
1709 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
1710 ; SSE41-NEXT: pxor %xmm1, %xmm6
1711 ; SSE41-NEXT: movdqa %xmm9, %xmm2
1712 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm2
1713 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
1714 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
1715 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
1716 ; SSE41-NEXT: pand %xmm5, %xmm0
1717 ; SSE41-NEXT: por %xmm2, %xmm0
1718 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
1719 ; SSE41-NEXT: packusdw %xmm3, %xmm7
1720 ; SSE41-NEXT: packusdw %xmm4, %xmm7
1721 ; SSE41-NEXT: psubusw %xmm7, %xmm8
1722 ; SSE41-NEXT: movdqa %xmm8, %xmm0
1725 ; AVX1-LABEL: psubus_8i64_max:
1726 ; AVX1: # %bb.0: # %vector.ph
1727 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1728 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
1729 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
1730 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
1731 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
1732 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6
1733 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm6
1734 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3
1735 ; AVX1-NEXT: vmovapd {{.*#+}} ymm6 = [65535,65535,65535,65535]
1736 ; AVX1-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2
1737 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1738 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1739 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1740 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
1741 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
1742 ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4
1743 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
1744 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
1745 ; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
1746 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1747 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1748 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1749 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1750 ; AVX1-NEXT: vzeroupper
1753 ; AVX2-LABEL: psubus_8i64_max:
1754 ; AVX2: # %bb.0: # %vector.ph
1755 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
1756 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm4
1757 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343]
1758 ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
1759 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [65535,65535,65535,65535]
1760 ; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
1761 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3
1762 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
1763 ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
1764 ; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
1765 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
1766 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1767 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1768 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1769 ; AVX2-NEXT: vzeroupper
1772 ; AVX512-LABEL: psubus_8i64_max:
1773 ; AVX512: # %bb.0: # %vector.ph
1774 ; AVX512-NEXT: vpmovusqw %zmm1, %xmm1
1775 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1776 ; AVX512-NEXT: vzeroupper
1779 %lhs = zext <8 x i16> %x to <8 x i64>
1780 %cond = icmp ult <8 x i64> %lhs, %y
1781 %max = select <8 x i1> %cond, <8 x i64> %y, <8 x i64> %lhs
1782 %sub = sub <8 x i64> %max, %y
1783 %res = trunc <8 x i64> %sub to <8 x i16>
; psubus_16i32_max: unsigned-saturating subtract spelled as
;   trunc(umax(zext(%x), %y) - %y)
; on <16 x i32>, narrowed back to <16 x i16>.
; The AVX1/AVX2/AVX512 checks below pin a lowering through vpsubusw (after
; clamping/narrowing %y to 16 bits); the SSE2/SSSE3/SSE41 checks still show the
; expanded max/psubd/pack sequence.
; The CHECK lines are autogenerated (utils/update_llc_test_checks.py); regenerate
; them rather than editing by hand.
1787 define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
1788 ; SSE2-LABEL: psubus_16i32_max:
1789 ; SSE2: # %bb.0: # %vector.ph
1790 ; SSE2-NEXT: movdqa %xmm1, %xmm8
1791 ; SSE2-NEXT: pxor %xmm7, %xmm7
1792 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
1793 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
1794 ; SSE2-NEXT: movdqa %xmm0, %xmm10
1795 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
1796 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
1797 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
1798 ; SSE2-NEXT: movdqa %xmm3, %xmm6
1799 ; SSE2-NEXT: pxor %xmm7, %xmm6
1800 ; SSE2-NEXT: movdqa %xmm0, %xmm9
1801 ; SSE2-NEXT: por %xmm7, %xmm9
1802 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm9
1803 ; SSE2-NEXT: pand %xmm9, %xmm0
1804 ; SSE2-NEXT: pandn %xmm3, %xmm9
1805 ; SSE2-NEXT: por %xmm0, %xmm9
1806 ; SSE2-NEXT: movdqa %xmm2, %xmm6
1807 ; SSE2-NEXT: pxor %xmm7, %xmm6
1808 ; SSE2-NEXT: movdqa %xmm10, %xmm0
1809 ; SSE2-NEXT: por %xmm7, %xmm0
1810 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm0
1811 ; SSE2-NEXT: pand %xmm0, %xmm10
1812 ; SSE2-NEXT: pandn %xmm2, %xmm0
1813 ; SSE2-NEXT: por %xmm10, %xmm0
1814 ; SSE2-NEXT: movdqa %xmm5, %xmm10
1815 ; SSE2-NEXT: pxor %xmm7, %xmm10
1816 ; SSE2-NEXT: movdqa %xmm8, %xmm6
1817 ; SSE2-NEXT: por %xmm7, %xmm6
1818 ; SSE2-NEXT: pcmpgtd %xmm10, %xmm6
1819 ; SSE2-NEXT: pand %xmm6, %xmm8
1820 ; SSE2-NEXT: pandn %xmm5, %xmm6
1821 ; SSE2-NEXT: por %xmm8, %xmm6
1822 ; SSE2-NEXT: movdqa %xmm4, %xmm8
1823 ; SSE2-NEXT: pxor %xmm7, %xmm8
1824 ; SSE2-NEXT: por %xmm1, %xmm7
1825 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
1826 ; SSE2-NEXT: pand %xmm7, %xmm1
1827 ; SSE2-NEXT: pandn %xmm4, %xmm7
1828 ; SSE2-NEXT: por %xmm7, %xmm1
1829 ; SSE2-NEXT: psubd %xmm4, %xmm1
1830 ; SSE2-NEXT: psubd %xmm5, %xmm6
1831 ; SSE2-NEXT: psubd %xmm2, %xmm0
1832 ; SSE2-NEXT: psubd %xmm3, %xmm9
1833 ; SSE2-NEXT: pslld $16, %xmm9
1834 ; SSE2-NEXT: psrad $16, %xmm9
1835 ; SSE2-NEXT: pslld $16, %xmm0
1836 ; SSE2-NEXT: psrad $16, %xmm0
1837 ; SSE2-NEXT: packssdw %xmm9, %xmm0
1838 ; SSE2-NEXT: pslld $16, %xmm6
1839 ; SSE2-NEXT: psrad $16, %xmm6
1840 ; SSE2-NEXT: pslld $16, %xmm1
1841 ; SSE2-NEXT: psrad $16, %xmm1
1842 ; SSE2-NEXT: packssdw %xmm6, %xmm1
1845 ; SSSE3-LABEL: psubus_16i32_max:
1846 ; SSSE3: # %bb.0: # %vector.ph
1847 ; SSSE3-NEXT: movdqa %xmm1, %xmm8
1848 ; SSSE3-NEXT: pxor %xmm7, %xmm7
1849 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
1850 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
1851 ; SSSE3-NEXT: movdqa %xmm0, %xmm10
1852 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
1853 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
1854 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
1855 ; SSSE3-NEXT: movdqa %xmm3, %xmm6
1856 ; SSSE3-NEXT: pxor %xmm7, %xmm6
1857 ; SSSE3-NEXT: movdqa %xmm0, %xmm9
1858 ; SSSE3-NEXT: por %xmm7, %xmm9
1859 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm9
1860 ; SSSE3-NEXT: pand %xmm9, %xmm0
1861 ; SSSE3-NEXT: pandn %xmm3, %xmm9
1862 ; SSSE3-NEXT: por %xmm0, %xmm9
1863 ; SSSE3-NEXT: movdqa %xmm2, %xmm6
1864 ; SSSE3-NEXT: pxor %xmm7, %xmm6
1865 ; SSSE3-NEXT: movdqa %xmm10, %xmm0
1866 ; SSSE3-NEXT: por %xmm7, %xmm0
1867 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
1868 ; SSSE3-NEXT: pand %xmm0, %xmm10
1869 ; SSSE3-NEXT: pandn %xmm2, %xmm0
1870 ; SSSE3-NEXT: por %xmm10, %xmm0
1871 ; SSSE3-NEXT: movdqa %xmm5, %xmm10
1872 ; SSSE3-NEXT: pxor %xmm7, %xmm10
1873 ; SSSE3-NEXT: movdqa %xmm8, %xmm6
1874 ; SSSE3-NEXT: por %xmm7, %xmm6
1875 ; SSSE3-NEXT: pcmpgtd %xmm10, %xmm6
1876 ; SSSE3-NEXT: pand %xmm6, %xmm8
1877 ; SSSE3-NEXT: pandn %xmm5, %xmm6
1878 ; SSSE3-NEXT: por %xmm8, %xmm6
1879 ; SSSE3-NEXT: movdqa %xmm4, %xmm8
1880 ; SSSE3-NEXT: pxor %xmm7, %xmm8
1881 ; SSSE3-NEXT: por %xmm1, %xmm7
1882 ; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7
1883 ; SSSE3-NEXT: pand %xmm7, %xmm1
1884 ; SSSE3-NEXT: pandn %xmm4, %xmm7
1885 ; SSSE3-NEXT: por %xmm7, %xmm1
1886 ; SSSE3-NEXT: psubd %xmm4, %xmm1
1887 ; SSSE3-NEXT: psubd %xmm5, %xmm6
1888 ; SSSE3-NEXT: psubd %xmm2, %xmm0
1889 ; SSSE3-NEXT: psubd %xmm3, %xmm9
1890 ; SSSE3-NEXT: pslld $16, %xmm9
1891 ; SSSE3-NEXT: psrad $16, %xmm9
1892 ; SSSE3-NEXT: pslld $16, %xmm0
1893 ; SSSE3-NEXT: psrad $16, %xmm0
1894 ; SSSE3-NEXT: packssdw %xmm9, %xmm0
1895 ; SSSE3-NEXT: pslld $16, %xmm6
1896 ; SSSE3-NEXT: psrad $16, %xmm6
1897 ; SSSE3-NEXT: pslld $16, %xmm1
1898 ; SSSE3-NEXT: psrad $16, %xmm1
1899 ; SSSE3-NEXT: packssdw %xmm6, %xmm1
1902 ; SSE41-LABEL: psubus_16i32_max:
1903 ; SSE41: # %bb.0: # %vector.ph
1904 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
1905 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
1906 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1907 ; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
1908 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
1909 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1910 ; SSE41-NEXT: pmaxud %xmm2, %xmm0
1911 ; SSE41-NEXT: pmaxud %xmm3, %xmm7
1912 ; SSE41-NEXT: pmaxud %xmm4, %xmm1
1913 ; SSE41-NEXT: pmaxud %xmm5, %xmm6
1914 ; SSE41-NEXT: psubd %xmm5, %xmm6
1915 ; SSE41-NEXT: psubd %xmm4, %xmm1
1916 ; SSE41-NEXT: psubd %xmm3, %xmm7
1917 ; SSE41-NEXT: psubd %xmm2, %xmm0
1918 ; SSE41-NEXT: pxor %xmm2, %xmm2
1919 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
1920 ; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0],xmm2[1],xmm7[2],xmm2[3],xmm7[4],xmm2[5],xmm7[6],xmm2[7]
1921 ; SSE41-NEXT: packusdw %xmm7, %xmm0
1922 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
1923 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0],xmm2[1],xmm6[2],xmm2[3],xmm6[4],xmm2[5],xmm6[6],xmm2[7]
1924 ; SSE41-NEXT: packusdw %xmm6, %xmm1
1927 ; AVX1-LABEL: psubus_16i32_max:
1928 ; AVX1: # %bb.0: # %vector.ph
1929 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1930 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
1931 ; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
1932 ; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1
1933 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1934 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1935 ; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
1936 ; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2
1937 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1938 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1939 ; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2
1940 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
1941 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1944 ; AVX2-LABEL: psubus_16i32_max:
1945 ; AVX2: # %bb.0: # %vector.ph
1946 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535]
1947 ; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
1948 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
1949 ; AVX2-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
1950 ; AVX2-NEXT: vpminud %ymm3, %ymm2, %ymm2
1951 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
1952 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1953 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1954 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
1955 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
1956 ; AVX2-NEXT: vpsubusw %xmm1, %xmm3, %xmm1
1957 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1958 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1959 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
1960 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1961 ; AVX2-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
1962 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1965 ; AVX512-LABEL: psubus_16i32_max:
1966 ; AVX512: # %bb.0: # %vector.ph
1967 ; AVX512-NEXT: vpmovusdw %zmm1, %ymm1
1968 ; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
1971 %lhs = zext <16 x i16> %x to <16 x i32>
1972 %cond = icmp ult <16 x i32> %lhs, %y
1973 %max = select <16 x i1> %cond, <16 x i32> %y, <16 x i32> %lhs
1974 %sub = sub <16 x i32> %max, %y
1975 %res = trunc <16 x i32> %sub to <16 x i16>
; psubus_i16_i32_max_swapped: same umax-then-subtract idiom as the *_max tests,
; but the compare is written with its operands swapped (%y ult %lhs, selecting
; %lhs when true), i.e. trunc(umax(zext(%x), %y) - %y) on <8 x i32>.
; The SSSE3/SSE41/AVX*/AVX512 checks pin a psubusw/vpsubusw lowering; the SSE2
; checks still show the expanded compare/select/psubd/packssdw sequence.
; The CHECK lines are autogenerated (utils/update_llc_test_checks.py).
1979 define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwind {
1980 ; SSE2-LABEL: psubus_i16_i32_max_swapped:
1981 ; SSE2: # %bb.0: # %vector.ph
1982 ; SSE2-NEXT: pxor %xmm3, %xmm3
1983 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1984 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1985 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1986 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
1987 ; SSE2-NEXT: movdqa %xmm2, %xmm3
1988 ; SSE2-NEXT: pxor %xmm5, %xmm3
1989 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1990 ; SSE2-NEXT: por %xmm5, %xmm6
1991 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm3
1992 ; SSE2-NEXT: movdqa %xmm2, %xmm6
1993 ; SSE2-NEXT: pand %xmm3, %xmm6
1994 ; SSE2-NEXT: pandn %xmm0, %xmm3
1995 ; SSE2-NEXT: por %xmm6, %xmm3
1996 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1997 ; SSE2-NEXT: pxor %xmm5, %xmm0
1998 ; SSE2-NEXT: por %xmm4, %xmm5
1999 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
2000 ; SSE2-NEXT: movdqa %xmm1, %xmm5
2001 ; SSE2-NEXT: pand %xmm0, %xmm5
2002 ; SSE2-NEXT: pandn %xmm4, %xmm0
2003 ; SSE2-NEXT: por %xmm5, %xmm0
2004 ; SSE2-NEXT: psubd %xmm1, %xmm0
2005 ; SSE2-NEXT: psubd %xmm2, %xmm3
2006 ; SSE2-NEXT: pslld $16, %xmm3
2007 ; SSE2-NEXT: psrad $16, %xmm3
2008 ; SSE2-NEXT: pslld $16, %xmm0
2009 ; SSE2-NEXT: psrad $16, %xmm0
2010 ; SSE2-NEXT: packssdw %xmm3, %xmm0
2013 ; SSSE3-LABEL: psubus_i16_i32_max_swapped:
2014 ; SSSE3: # %bb.0: # %vector.ph
2015 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2016 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
2017 ; SSSE3-NEXT: movdqa %xmm2, %xmm5
2018 ; SSSE3-NEXT: pxor %xmm4, %xmm5
2019 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
2020 ; SSSE3-NEXT: movdqa %xmm6, %xmm7
2021 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
2022 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
2023 ; SSSE3-NEXT: pand %xmm7, %xmm2
2024 ; SSSE3-NEXT: pandn %xmm5, %xmm7
2025 ; SSSE3-NEXT: por %xmm2, %xmm7
2026 ; SSSE3-NEXT: pshufb %xmm3, %xmm7
2027 ; SSSE3-NEXT: pxor %xmm1, %xmm4
2028 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
2029 ; SSSE3-NEXT: pand %xmm6, %xmm1
2030 ; SSSE3-NEXT: pandn %xmm5, %xmm6
2031 ; SSSE3-NEXT: por %xmm1, %xmm6
2032 ; SSSE3-NEXT: pshufb %xmm3, %xmm6
2033 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2034 ; SSSE3-NEXT: psubusw %xmm6, %xmm0
2037 ; SSE41-LABEL: psubus_i16_i32_max_swapped:
2038 ; SSE41: # %bb.0: # %vector.ph
2039 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
2040 ; SSE41-NEXT: pminud %xmm3, %xmm2
2041 ; SSE41-NEXT: pminud %xmm3, %xmm1
2042 ; SSE41-NEXT: packusdw %xmm2, %xmm1
2043 ; SSE41-NEXT: psubusw %xmm1, %xmm0
2046 ; AVX1-LABEL: psubus_i16_i32_max_swapped:
2047 ; AVX1: # %bb.0: # %vector.ph
2048 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2049 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
2050 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
2051 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
2052 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2053 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2054 ; AVX1-NEXT: vzeroupper
2057 ; AVX2-LABEL: psubus_i16_i32_max_swapped:
2058 ; AVX2: # %bb.0: # %vector.ph
2059 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
2060 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
2061 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2062 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2063 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2064 ; AVX2-NEXT: vzeroupper
2067 ; AVX512-LABEL: psubus_i16_i32_max_swapped:
2068 ; AVX512: # %bb.0: # %vector.ph
2069 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
2070 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2071 ; AVX512-NEXT: vzeroupper
2074 %lhs = zext <8 x i16> %x to <8 x i32>
2075 %cond = icmp ult <8 x i32> %y, %lhs
2076 %max = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y
2077 %sub = sub <8 x i32> %max, %y
2078 %res = trunc <8 x i32> %sub to <8 x i16>
; psubus_i16_i32_min: unsigned-saturating subtract spelled with a minimum,
;   trunc(zext(%x) - umin(zext(%x), %y))
; on <8 x i32>, narrowed back to <8 x i16>.
; The SSSE3/SSE41/AVX*/AVX512 checks pin a psubusw/vpsubusw lowering; the SSE2
; checks still show the expanded compare/select/psubd/packssdw sequence.
; The CHECK lines are autogenerated (utils/update_llc_test_checks.py).
2082 define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind {
2083 ; SSE2-LABEL: psubus_i16_i32_min:
2084 ; SSE2: # %bb.0: # %vector.ph
2085 ; SSE2-NEXT: pxor %xmm4, %xmm4
2086 ; SSE2-NEXT: movdqa %xmm0, %xmm3
2087 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2088 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
2089 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
2090 ; SSE2-NEXT: movdqa %xmm2, %xmm5
2091 ; SSE2-NEXT: pxor %xmm4, %xmm5
2092 ; SSE2-NEXT: movdqa %xmm0, %xmm6
2093 ; SSE2-NEXT: por %xmm4, %xmm6
2094 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
2095 ; SSE2-NEXT: movdqa %xmm0, %xmm6
2096 ; SSE2-NEXT: pand %xmm5, %xmm6
2097 ; SSE2-NEXT: pandn %xmm2, %xmm5
2098 ; SSE2-NEXT: por %xmm6, %xmm5
2099 ; SSE2-NEXT: movdqa %xmm1, %xmm2
2100 ; SSE2-NEXT: pxor %xmm4, %xmm2
2101 ; SSE2-NEXT: por %xmm3, %xmm4
2102 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
2103 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2104 ; SSE2-NEXT: pand %xmm2, %xmm4
2105 ; SSE2-NEXT: pandn %xmm1, %xmm2
2106 ; SSE2-NEXT: por %xmm4, %xmm2
2107 ; SSE2-NEXT: psubd %xmm2, %xmm3
2108 ; SSE2-NEXT: psubd %xmm5, %xmm0
2109 ; SSE2-NEXT: pslld $16, %xmm0
2110 ; SSE2-NEXT: psrad $16, %xmm0
2111 ; SSE2-NEXT: pslld $16, %xmm3
2112 ; SSE2-NEXT: psrad $16, %xmm3
2113 ; SSE2-NEXT: packssdw %xmm0, %xmm3
2114 ; SSE2-NEXT: movdqa %xmm3, %xmm0
2117 ; SSSE3-LABEL: psubus_i16_i32_min:
2118 ; SSSE3: # %bb.0: # %vector.ph
2119 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2120 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
2121 ; SSSE3-NEXT: movdqa %xmm2, %xmm5
2122 ; SSSE3-NEXT: pxor %xmm4, %xmm5
2123 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
2124 ; SSSE3-NEXT: movdqa %xmm6, %xmm7
2125 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
2126 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
2127 ; SSSE3-NEXT: pand %xmm7, %xmm2
2128 ; SSSE3-NEXT: pandn %xmm5, %xmm7
2129 ; SSSE3-NEXT: por %xmm2, %xmm7
2130 ; SSSE3-NEXT: pshufb %xmm3, %xmm7
2131 ; SSSE3-NEXT: pxor %xmm1, %xmm4
2132 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
2133 ; SSSE3-NEXT: pand %xmm6, %xmm1
2134 ; SSSE3-NEXT: pandn %xmm5, %xmm6
2135 ; SSSE3-NEXT: por %xmm1, %xmm6
2136 ; SSSE3-NEXT: pshufb %xmm3, %xmm6
2137 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2138 ; SSSE3-NEXT: psubusw %xmm6, %xmm0
2141 ; SSE41-LABEL: psubus_i16_i32_min:
2142 ; SSE41: # %bb.0: # %vector.ph
2143 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
2144 ; SSE41-NEXT: pminud %xmm3, %xmm2
2145 ; SSE41-NEXT: pminud %xmm3, %xmm1
2146 ; SSE41-NEXT: packusdw %xmm2, %xmm1
2147 ; SSE41-NEXT: psubusw %xmm1, %xmm0
2150 ; AVX1-LABEL: psubus_i16_i32_min:
2151 ; AVX1: # %bb.0: # %vector.ph
2152 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2153 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
2154 ; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
2155 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
2156 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2157 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2158 ; AVX1-NEXT: vzeroupper
2161 ; AVX2-LABEL: psubus_i16_i32_min:
2162 ; AVX2: # %bb.0: # %vector.ph
2163 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
2164 ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
2165 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2166 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2167 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2168 ; AVX2-NEXT: vzeroupper
2171 ; AVX512-LABEL: psubus_i16_i32_min:
2172 ; AVX512: # %bb.0: # %vector.ph
2173 ; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
2174 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2175 ; AVX512-NEXT: vzeroupper
2178 %lhs = zext <8 x i16> %x to <8 x i32>
2179 %cond = icmp ult <8 x i32> %lhs, %y
2180 %min = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y
2181 %sub = sub <8 x i32> %lhs, %min
2182 %res = trunc <8 x i32> %sub to <8 x i16>
; subus_v8i8: narrower-than-128-bit case. Loads two <8 x i8> vectors, computes
; select(%ld1 ugt %ld2, %ld1 - %ld2, 0) and stores the result back through %p1.
; Every run line is checked to use psubusb/vpsubusb; the AVX512 checks
; additionally show a vpmovzxbw + vpmovwb pair for the store.
; The CHECK lines are autogenerated (utils/update_llc_test_checks.py).
2186 define void @subus_v8i8(<8 x i8>* %p1, <8 x i8>* %p2) {
2187 ; SSE-LABEL: subus_v8i8:
2189 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2190 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
2191 ; SSE-NEXT: psubusb %xmm1, %xmm0
2192 ; SSE-NEXT: movq %xmm0, (%rdi)
2195 ; AVX1-LABEL: subus_v8i8:
2197 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2198 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2199 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2200 ; AVX1-NEXT: vmovq %xmm0, (%rdi)
2203 ; AVX2-LABEL: subus_v8i8:
2205 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2206 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2207 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2208 ; AVX2-NEXT: vmovq %xmm0, (%rdi)
2211 ; AVX512-LABEL: subus_v8i8:
2213 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2214 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2215 ; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2216 ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2217 ; AVX512-NEXT: vpmovwb %xmm0, (%rdi)
2219 %ld1 = load <8 x i8>, <8 x i8>* %p1, align 8
2220 %ld2 = load <8 x i8>, <8 x i8>* %p2, align 8
2221 %1 = sub <8 x i8> %ld1, %ld2
2222 %2 = icmp ugt <8 x i8> %ld1, %ld2
2223 %sh3 = select <8 x i1> %2, <8 x i8> %1, <8 x i8> zeroinitializer
2224 store <8 x i8> %sh3, <8 x i8>* %p1, align 8
; subus_v4i8: <4 x i8> variant of the load / select(ugt, sub, 0) / store idiom.
; Every run line is checked to use psubusb/vpsubusb with 32-bit movd
; loads; the AVX512 checks additionally show a vpmovzxbd + vpmovdb pair for the
; store.
; The CHECK lines are autogenerated (utils/update_llc_test_checks.py).
2228 define void @subus_v4i8(<4 x i8>* %p1, <4 x i8>* %p2) {
2229 ; SSE-LABEL: subus_v4i8:
2231 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2232 ; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2233 ; SSE-NEXT: psubusb %xmm1, %xmm0
2234 ; SSE-NEXT: movd %xmm0, (%rdi)
2237 ; AVX1-LABEL: subus_v4i8:
2239 ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2240 ; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2241 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2242 ; AVX1-NEXT: vmovd %xmm0, (%rdi)
2245 ; AVX2-LABEL: subus_v4i8:
2247 ; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2248 ; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2249 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2250 ; AVX2-NEXT: vmovd %xmm0, (%rdi)
2253 ; AVX512-LABEL: subus_v4i8:
2255 ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2256 ; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2257 ; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2258 ; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2259 ; AVX512-NEXT: vpmovdb %xmm0, (%rdi)
2261 %ld1 = load <4 x i8>, <4 x i8>* %p1, align 8
2262 %ld2 = load <4 x i8>, <4 x i8>* %p2, align 8
2263 %1 = sub <4 x i8> %ld1, %ld2
2264 %2 = icmp ugt <4 x i8> %ld1, %ld2
2265 %sh3 = select <4 x i1> %2, <4 x i8> %1, <4 x i8> zeroinitializer
2266 store <4 x i8> %sh3, <4 x i8>* %p1, align 8
; subus_v2i8: <2 x i8> variant of the load / select(ugt, sub, 0) / store idiom.
; Every run line is checked to use psubusb/vpsubusb. The 16-bit store differs per
; target: movd + movw on SSE2/SSSE3, pextrw/vpextrw to memory on SSE41/AVX1/AVX2,
; and a vpmovzxbq + vpmovqb pair on AVX512 -- hence the per-level SSE prefixes
; here instead of the shared SSE prefix.
; The CHECK lines are autogenerated (utils/update_llc_test_checks.py).
2270 define void @subus_v2i8(<2 x i8>* %p1, <2 x i8>* %p2) {
2271 ; SSE2-LABEL: subus_v2i8:
2273 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2274 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2275 ; SSE2-NEXT: psubusb %xmm1, %xmm0
2276 ; SSE2-NEXT: movd %xmm0, %eax
2277 ; SSE2-NEXT: movw %ax, (%rdi)
2280 ; SSSE3-LABEL: subus_v2i8:
2282 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2283 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2284 ; SSSE3-NEXT: psubusb %xmm1, %xmm0
2285 ; SSSE3-NEXT: movd %xmm0, %eax
2286 ; SSSE3-NEXT: movw %ax, (%rdi)
2289 ; SSE41-LABEL: subus_v2i8:
2291 ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2292 ; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2293 ; SSE41-NEXT: psubusb %xmm1, %xmm0
2294 ; SSE41-NEXT: pextrw $0, %xmm0, (%rdi)
2297 ; AVX1-LABEL: subus_v2i8:
2299 ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2300 ; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2301 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2302 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi)
2305 ; AVX2-LABEL: subus_v2i8:
2307 ; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2308 ; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2309 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2310 ; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi)
2313 ; AVX512-LABEL: subus_v2i8:
2315 ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2316 ; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2317 ; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
2318 ; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
2319 ; AVX512-NEXT: vpmovqb %xmm0, (%rdi)
2321 %ld1 = load <2 x i8>, <2 x i8>* %p1, align 8
2322 %ld2 = load <2 x i8>, <2 x i8>* %p2, align 8
2323 %1 = sub <2 x i8> %ld1, %ld2
2324 %2 = icmp ugt <2 x i8> %ld1, %ld2
2325 %sh3 = select <2 x i1> %2, <2 x i8> %1, <2 x i8> zeroinitializer
2326 store <2 x i8> %sh3, <2 x i8>* %p1, align 8
; subus_v4i16: <4 x i16> variant of the load / select(ugt, sub, 0) / store idiom.
; Every run line is checked to use psubusw/vpsubusw with 64-bit movq
; loads; the AVX512 checks additionally show a vpmovzxwd + vpmovdw pair for the
; store.
; The CHECK lines are autogenerated (utils/update_llc_test_checks.py).
2330 define void @subus_v4i16(<4 x i16>* %p1, <4 x i16>* %p2) {
2331 ; SSE-LABEL: subus_v4i16:
2333 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2334 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
2335 ; SSE-NEXT: psubusw %xmm1, %xmm0
2336 ; SSE-NEXT: movq %xmm0, (%rdi)
2339 ; AVX1-LABEL: subus_v4i16:
2341 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2342 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2343 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2344 ; AVX1-NEXT: vmovq %xmm0, (%rdi)
2347 ; AVX2-LABEL: subus_v4i16:
2349 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2350 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2351 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2352 ; AVX2-NEXT: vmovq %xmm0, (%rdi)
2355 ; AVX512-LABEL: subus_v4i16:
2357 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2358 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2359 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2360 ; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2361 ; AVX512-NEXT: vpmovdw %xmm0, (%rdi)
2363 %ld1 = load <4 x i16>, <4 x i16>* %p1, align 8
2364 %ld2 = load <4 x i16>, <4 x i16>* %p2, align 8
2365 %1 = sub <4 x i16> %ld1, %ld2
2366 %2 = icmp ugt <4 x i16> %ld1, %ld2
2367 %sh3 = select <4 x i1> %2, <4 x i16> %1, <4 x i16> zeroinitializer
2368 store <4 x i16> %sh3, <4 x i16>* %p1, align 8
; subus_v2i16: <2 x i16> variant of the load / select(ugt, sub, 0) / store idiom.
; Every run line is checked to use psubusw/vpsubusw with 32-bit movd
; loads; the AVX512 checks additionally show a vpmovzxwq + vpmovqw pair for the
; store.
; The CHECK lines are autogenerated (utils/update_llc_test_checks.py).
2372 define void @subus_v2i16(<2 x i16>* %p1, <2 x i16>* %p2) {
2373 ; SSE-LABEL: subus_v2i16:
2375 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2376 ; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2377 ; SSE-NEXT: psubusw %xmm1, %xmm0
2378 ; SSE-NEXT: movd %xmm0, (%rdi)
2381 ; AVX1-LABEL: subus_v2i16:
2383 ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2384 ; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2385 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2386 ; AVX1-NEXT: vmovd %xmm0, (%rdi)
2389 ; AVX2-LABEL: subus_v2i16:
2391 ; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2392 ; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2393 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2394 ; AVX2-NEXT: vmovd %xmm0, (%rdi)
2397 ; AVX512-LABEL: subus_v2i16:
2399 ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2400 ; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2401 ; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
2402 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2403 ; AVX512-NEXT: vpmovqw %xmm0, (%rdi)
2405 %ld1 = load <2 x i16>, <2 x i16>* %p1, align 8
2406 %ld2 = load <2 x i16>, <2 x i16>* %p2, align 8
2407 %1 = sub <2 x i16> %ld1, %ld2
2408 %2 = icmp ugt <2 x i16> %ld1, %ld2
2409 %sh3 = select <2 x i1> %2, <2 x i16> %1, <2 x i16> zeroinitializer
2410 store <2 x i16> %sh3, <2 x i16>* %p1, align 8
; test19: <16 x i8> pattern umax(%x, 70) + (-70) with a splat constant, written
; as icmp ugt + select + add. This is an unsigned saturating subtract of 70, and
; all run lines are expected to emit a single psubusb/vpsubusb whose second
; operand is a RIP-relative constant-pool load.
2414 define <16 x i8> @test19(<16 x i8> %x) {
2415 ; SSE-LABEL: test19:
2416 ; SSE: # %bb.0: # %entry
2417 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
2420 ; AVX-LABEL: test19:
2421 ; AVX: # %bb.0: # %entry
2422 ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
2425 %0 = icmp ugt <16 x i8> %x, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
2426 %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
2427 %2 = add <16 x i8> %1, <i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70>
; test20: same umax-then-add shape as a splat test, but on <16 x i8> with a
; non-uniform constant vector. The add operand is the lane-wise negation of the
; compare/select constant, so each lane is an unsigned saturating subtract of
; its own constant. Expected lowering is one psubusb/vpsubusb with a
; constant-pool operand.
2431 define <16 x i8> @test20(<16 x i8> %x) {
2432 ; SSE-LABEL: test20:
2433 ; SSE: # %bb.0: # %entry
2434 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
2437 ; AVX-LABEL: test20:
2438 ; AVX: # %bb.0: # %entry
2439 ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
2442 %0 = icmp ugt <16 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70>
2443 %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70>
2444 %2 = add <16 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70>
; test21: <8 x i16> pattern umax(%x, 700) + (-700) with a splat constant, i.e.
; an unsigned saturating subtract of 700 per lane. All run lines are expected to
; emit a single psubusw/vpsubusw with a constant-pool operand.
2448 define <8 x i16> @test21(<8 x i16> %x) {
2449 ; SSE-LABEL: test21:
2450 ; SSE: # %bb.0: # %entry
2451 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
2454 ; AVX-LABEL: test21:
2455 ; AVX: # %bb.0: # %entry
2456 ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
2459 %0 = icmp ugt <8 x i16> %x, <i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700>
2460 %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700, i16 700>
2461 %2 = add <8 x i16> %1, <i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700, i16 -700>
; test22: <8 x i16> umax-then-add pattern with a non-uniform constant vector;
; the add operand is the lane-wise negation of the compare/select constant.
; Expected lowering is a single psubusw/vpsubusw with a constant-pool operand.
2465 define <8 x i16> @test22(<8 x i16> %x) {
2466 ; SSE-LABEL: test22:
2467 ; SSE: # %bb.0: # %entry
2468 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
2471 ; AVX-LABEL: test22:
2472 ; AVX: # %bb.0: # %entry
2473 ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
2476 %0 = icmp ugt <8 x i16> %x, <i16 1, i16 -22000, i16 -770, i16 98, i16 19, i16 1000, i16 3456, i16 70>
2477 %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 1, i16 -22000, i16 -770, i16 98, i16 19, i16 1000, i16 3456, i16 70>
2478 %2 = add <8 x i16> %1, <i16 -1, i16 22000, i16 770, i16 -98, i16 -19, i16 -1000, i16 -3456, i16 -70>
; test23: 256-bit (<32 x i8>) version of the splat umax(%x, 70) + (-70) pattern.
; Expected lowering per run line -
;   SSE runs    - the splat is loaded once into xmm2 and used by two psubusb.
;   AVX1 run    - the ymm input is split with vextractf128, two 128-bit
;                 vpsubusb share the splat register, then vinsertf128 rejoins.
;   AVX2/AVX512 - one 256-bit vpsubusb with a constant-pool operand.
2482 define <32 x i8> @test23(<32 x i8> %x) {
2483 ; SSE-LABEL: test23:
2484 ; SSE: # %bb.0: # %entry
2485 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70]
2486 ; SSE-NEXT: psubusb %xmm2, %xmm0
2487 ; SSE-NEXT: psubusb %xmm2, %xmm1
2490 ; AVX1-LABEL: test23:
2491 ; AVX1: # %bb.0: # %entry
2492 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2493 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70]
2494 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
2495 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
2496 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2499 ; AVX2-LABEL: test23:
2500 ; AVX2: # %bb.0: # %entry
2501 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
2504 ; AVX512-LABEL: test23:
2505 ; AVX512: # %bb.0: # %entry
2506 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
2509 %0 = icmp ugt <32 x i8> %x, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
2510 %1 = select <32 x i1> %0, <32 x i8> %x, <32 x i8> <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70>
2511 %2 = add <32 x i8> %1, <i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70, i8 -70>
; test24: <32 x i8> umax-then-add pattern with a non-uniform constant whose
; upper 16 lanes differ from the lower 16 (e.g. 2/-23/-49/110 versus
; 1/-22/-50/100). Because the two halves need different constants, the SSE and
; AVX1 expectations use a separate memory operand for each 128-bit psubusb;
; AVX2 and AVX512 are expected to use one 256-bit vpsubusb.
2515 define <32 x i8> @test24(<32 x i8> %x) {
2516 ; SSE-LABEL: test24:
2517 ; SSE: # %bb.0: # %entry
2518 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
2519 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm1
2522 ; AVX1-LABEL: test24:
2523 ; AVX1: # %bb.0: # %entry
2524 ; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm1
2525 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2526 ; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
2527 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2530 ; AVX2-LABEL: test24:
2531 ; AVX2: # %bb.0: # %entry
2532 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
2535 ; AVX512-LABEL: test24:
2536 ; AVX512: # %bb.0: # %entry
2537 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
2540 %0 = icmp ugt <32 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2541 %1 = select <32 x i1> %0, <32 x i8> %x, <32 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2542 %2 = add <32 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70>
; test25: 256-bit (<16 x i16>) version of the splat pattern,
; umax(%x, 5000) + (-5000). The SSE and AVX1 expectations load the splat into
; one xmm register and reuse it for both 128-bit psubusw; AVX2 and AVX512 are
; expected to emit one 256-bit vpsubusw with a constant-pool operand.
2546 define <16 x i16> @test25(<16 x i16> %x) {
2547 ; SSE-LABEL: test25:
2548 ; SSE: # %bb.0: # %entry
2549 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000]
2550 ; SSE-NEXT: psubusw %xmm2, %xmm0
2551 ; SSE-NEXT: psubusw %xmm2, %xmm1
2554 ; AVX1-LABEL: test25:
2555 ; AVX1: # %bb.0: # %entry
2556 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2557 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000]
2558 ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
2559 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
2560 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2563 ; AVX2-LABEL: test25:
2564 ; AVX2: # %bb.0: # %entry
2565 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
2568 ; AVX512-LABEL: test25:
2569 ; AVX512: # %bb.0: # %entry
2570 ; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
2573 %0 = icmp ugt <16 x i16> %x, <i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000>
2574 %1 = select <16 x i1> %0, <16 x i16> %x, <16 x i16> <i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000, i16 5000>
2575 %2 = add <16 x i16> %1, <i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000, i16 -5000>
; test26: <16 x i16> umax-then-add pattern with a non-uniform constant vector;
; the add operand is the lane-wise negation of the compare/select constant.
; The SSE and AVX1 expectations use a separate memory operand for each 128-bit
; psubusw; AVX2 and AVX512 are expected to emit one 256-bit vpsubusw.
2579 define <16 x i16> @test26(<16 x i16> %x) {
2580 ; SSE-LABEL: test26:
2581 ; SSE: # %bb.0: # %entry
2582 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
2583 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm1
2586 ; AVX1-LABEL: test26:
2587 ; AVX1: # %bb.0: # %entry
2588 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm1
2589 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2590 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
2591 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2594 ; AVX2-LABEL: test26:
2595 ; AVX2: # %bb.0: # %entry
2596 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
2599 ; AVX512-LABEL: test26:
2600 ; AVX512: # %bb.0: # %entry
2601 ; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
2604 %0 = icmp ugt <16 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70>
2605 %1 = select <16 x i1> %0, <16 x i16> %x, <16 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70>
2606 %2 = add <16 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9800, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -34, i16 -55, i16 -70>
2610 define <64 x i8> @test27(<64 x i8> %x) {
2611 ; SSE-LABEL: test27:
2612 ; SSE: # %bb.0: # %entry
2613 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
2614 ; SSE-NEXT: psubusb %xmm4, %xmm0
2615 ; SSE-NEXT: psubusb %xmm4, %xmm1
2616 ; SSE-NEXT: psubusb %xmm4, %xmm2
2617 ; SSE-NEXT: psubusb %xmm4, %xmm3
2620 ; AVX1-LABEL: test27:
2621 ; AVX1: # %bb.0: # %entry
2622 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2623 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
2624 ; AVX1-NEXT: vpsubusb %xmm3, %xmm2, %xmm2
2625 ; AVX1-NEXT: vpsubusb %xmm3, %xmm0, %xmm0
2626 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2627 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2628 ; AVX1-NEXT: vpsubusb %xmm3, %xmm2, %xmm2
2629 ; AVX1-NEXT: vpsubusb %xmm3, %xmm1, %xmm1
2630 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2633 ; AVX2-LABEL: test27:
2634 ; AVX2: # %bb.0: # %entry
2635 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154]
2636 ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
2637 ; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1
2640 ; AVX512-LABEL: test27:
2641 ; AVX512: # %bb.0: # %entry
2642 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %zmm0, %zmm0
2645 %0 = icmp ugt <64 x i8> %x, <i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154>
2646 %1 = select <64 x i1> %0, <64 x i8> %x, <64 x i8> <i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154, i8 154>
2647 %2 = add <64 x i8> %1, <i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154, i8 -154>
; test28: <64 x i8> umax-then-add pattern with a non-uniform constant vector.
; Lanes 0-15 and 32-47 of the constant are identical, while lanes 16-31 and
; 48-63 differ from each other (-114 versus -116, 63 versus 67). The SSE and
; AVX1 expectations accordingly load one 16-byte constant into a register and
; reuse it for two of the four 128-bit subtractions, with the other two taking
; separate memory operands. AVX2 is expected to use two 256-bit vpsubusb and
; AVX512 one 512-bit vpsubusb.
2651 define <64 x i8> @test28(<64 x i8> %x) {
2652 ; SSE-LABEL: test28:
2653 ; SSE: # %bb.0: # %entry
2654 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1,234,206,142,179,186,123,98,63,19,234,100,25,34,55,70]
2655 ; SSE-NEXT: psubusb %xmm4, %xmm0
2656 ; SSE-NEXT: psubusb %xmm4, %xmm2
2657 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm1
2658 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm3
2661 ; AVX1-LABEL: test28:
2662 ; AVX1: # %bb.0: # %entry
2663 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,234,206,142,179,186,123,98,63,19,234,100,25,34,55,70]
2664 ; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm3
2665 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2666 ; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
2667 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
2668 ; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm2
2669 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2670 ; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm1, %xmm1
2671 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2674 ; AVX2-LABEL: test28:
2675 ; AVX2: # %bb.0: # %entry
2676 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
2677 ; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm1, %ymm1
2680 ; AVX512-LABEL: test28:
2681 ; AVX512: # %bb.0: # %entry
2682 ; AVX512-NEXT: vpsubusb {{.*}}(%rip), %zmm0, %zmm0
2685 %0 = icmp ugt <64 x i8> %x, <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70, i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -116, i8 -77, i8 -70, i8 123, i8 98, i8 67, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2686 %1 = select <64 x i1> %0, <64 x i8> %x, <64 x i8> <i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70, i8 1, i8 -22, i8 -50, i8 -114, i8 -77, i8 -70, i8 123, i8 98, i8 63, i8 19, i8 -22, i8 100, i8 25, i8 34, i8 55, i8 70, i8 2, i8 -23, i8 -49, i8 -116, i8 -77, i8 -70, i8 123, i8 98, i8 67, i8 19, i8 -22, i8 110, i8 25, i8 34, i8 55, i8 70>
2687 %2 = add <64 x i8> %1, <i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70, i8 -1, i8 22, i8 50, i8 114, i8 77, i8 70, i8 -123, i8 -98, i8 -63, i8 -19, i8 22, i8 -100, i8 -25, i8 -34, i8 -55, i8 -70, i8 -2, i8 23, i8 49, i8 116, i8 77, i8 70, i8 -123, i8 -98, i8 -67, i8 -19, i8 22, i8 -110, i8 -25, i8 -34, i8 -55, i8 -70>
; test29: <32 x i16> umax-then-add pattern with a non-uniform constant vector
; (the upper 16 lanes differ from the lower 16 in 9805 versus 9800 and 346
; versus 34). Expected lowering per run line -
;   SSE runs - four 128-bit psubusw, each with its own memory operand.
;   AVX1 run - each ymm input split and rejoined around two 128-bit vpsubusw.
;   AVX2 run - two 256-bit vpsubusw with memory operands.
;   AVX512   - one 512-bit vpsubusw with a constant-pool operand.
2691 define <32 x i16> @test29(<32 x i16> %x) {
2692 ; SSE-LABEL: test29:
2693 ; SSE: # %bb.0: # %entry
2694 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
2695 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm1
2696 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm2
2697 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm3
2700 ; AVX1-LABEL: test29:
2701 ; AVX1: # %bb.0: # %entry
2702 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm2
2703 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2704 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
2705 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2706 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm1, %xmm2
2707 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2708 ; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm1, %xmm1
2709 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2712 ; AVX2-LABEL: test29:
2713 ; AVX2: # %bb.0: # %entry
2714 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
2715 ; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm1, %ymm1
2718 ; AVX512-LABEL: test29:
2719 ; AVX512: # %bb.0: # %entry
2720 ; AVX512-NEXT: vpsubusw {{.*}}(%rip), %zmm0, %zmm0
2723 %0 = icmp ugt <32 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70, i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9805, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 346, i16 55, i16 70>
2724 %1 = select <32 x i1> %0, <32 x i16> %x, <32 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9800, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 34, i16 55, i16 70, i16 1, i16 -2200, i16 -50, i16 -114, i16 -77, i16 -70, i16 123, i16 9805, i16 635, i16 19567, i16 -22, i16 100, i16 2534, i16 346, i16 55, i16 70>
2725 %2 = add <32 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9800, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -34, i16 -55, i16 -70, i16 -1, i16 2200, i16 50, i16 114, i16 77, i16 70, i16 -123, i16 -9805, i16 -635, i16 -19567, i16 22, i16 -100, i16 -2534, i16 -346, i16 -55, i16 -70>
; test30: <8 x i16> umax-then-add pattern where the upper four lanes of every
; constant vector are undef. Only the low 64 bits of the result are consumed
; (bitcast to <2 x i64>, then extractelement 0). The partially-undef constants
; are still expected to lower to one psubusw/vpsubusw, followed by a movq of
; the low half into %rax.
2730 define i64 @test30(<8 x i16> %x) {
2731 ; SSE-LABEL: test30:
2732 ; SSE: # %bb.0: # %entry
2733 ; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
2734 ; SSE-NEXT: movq %xmm0, %rax
2737 ; AVX-LABEL: test30:
2738 ; AVX: # %bb.0: # %entry
2739 ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
2740 ; AVX-NEXT: vmovq %xmm0, %rax
2743 %0 = icmp ugt <8 x i16> %x, <i16 1, i16 -2200, i16 -50, i16 -114, i16 undef, i16 undef, i16 undef, i16 undef>
2744 %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> <i16 1, i16 -2200, i16 -50, i16 -114, i16 undef, i16 undef, i16 undef, i16 undef>
2745 %2 = add <8 x i16> %1, <i16 -1, i16 2200, i16 50, i16 114, i16 undef, i16 undef, i16 undef, i16 undef>
2746 %3 = bitcast <8 x i16> %2 to <2 x i64>
2747 %4 = extractelement <2 x i64> %3, i32 0
; test31: a different spelling of the saturating subtract. The <2 x i64> input
; is bitcast to <16 x i8>, then (x >u 70) ? x + (-71) : 0 is selected per lane,
; which equals an unsigned saturating subtract of 71 for the defined lanes.
; The upper eight lanes of both constants are undef, and only element 0 of the
; <2 x i64> view of the result is extracted. Expected lowering is one
; psubusb/vpsubusb with a constant-pool operand, followed by a movq into %rax.
2752 define i64 @test31(<2 x i64> %x) {
2753 ; SSE-LABEL: test31:
2755 ; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
2756 ; SSE-NEXT: movq %xmm0, %rax
2759 ; AVX-LABEL: test31:
2761 ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
2762 ; AVX-NEXT: vmovq %xmm0, %rax
2764 %t0 = bitcast <2 x i64> %x to <16 x i8>
2765 %cmp = icmp ugt <16 x i8> %t0, <i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 70, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
2766 %bop = add <16 x i8> %t0, <i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 -71, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
2767 %sel = select <16 x i1> %cmp, <16 x i8> %bop, <16 x i8> zeroinitializer
2768 %bc = bitcast <16 x i8> %sel to <2 x i64>
2769 %ext = extractelement <2 x i64> %bc, i32 0