1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512VL-FALLBACK
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW
7 ; These test cases are inspired by C++2a std::midpoint().
8 ; See https://bugs.llvm.org/show_bug.cgi?id=40965
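; For reference, a comment-only sketch (illustrative; not a checked test, and the
; function name is made up) of the scalar midpoint pattern that every function
; below applies element-wise. It avoids computing %a + %b, which could overflow:
;
;   define i32 @scalar_midpoint_sketch(i32 %a, i32 %b) {
;     %cmp  = icmp sgt i32 %a, %b            ; is %a the larger operand?
;     %sign = select i1 %cmp, i32 -1, i32 1  ; step direction: -1 or +1
;     %lo   = select i1 %cmp, i32 %b, i32 %a ; smaller of the two
;     %hi   = select i1 %cmp, i32 %a, i32 %b ; larger of the two
;     %d    = sub i32 %hi, %lo               ; non-negative difference
;     %half = lshr i32 %d, 1                 ; halve the difference
;     %off  = mul i32 %half, %sign           ; signed offset from %a
;     %mid  = add i32 %off, %a               ; a + (b - a) / 2, rounded toward %a
;     ret i32 %mid
;   }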
10 ; Using 512-bit vector regs.
12 ; ---------------------------------------------------------------------------- ;
13 ; 32-bit width. 512 / 32 = 16 elts.
14 ; ---------------------------------------------------------------------------- ;
16 ; Values come from regs
18 define <16 x i32> @vec512_i32_signed_reg_reg(<16 x i32> %a1, <16 x i32> %a2) nounwind {
19 ; ALL-LABEL: vec512_i32_signed_reg_reg:
21 ; ALL-NEXT: vpminsd %zmm1, %zmm0, %zmm2
22 ; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm1
23 ; ALL-NEXT: vpsubd %zmm2, %zmm1, %zmm1
24 ; ALL-NEXT: vpsrld $1, %zmm1, %zmm1
25 ; ALL-NEXT: vpmulld %zmm1, %zmm1, %zmm1
26 ; ALL-NEXT: vpaddd %zmm0, %zmm1, %zmm0
28 %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed
29 %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
30 %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1
31 %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2
32 %t7 = sub <16 x i32> %t6, %t5
33 %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
34 %t9 = mul nsw <16 x i32> %t16, %t16 ; signed
35 %a10 = add nsw <16 x i32> %t9, %a1 ; signed
36 ret <16 x i32> %a10
37 }
39 define <16 x i32> @vec512_i32_unsigned_reg_reg(<16 x i32> %a1, <16 x i32> %a2) nounwind {
40 ; ALL-LABEL: vec512_i32_unsigned_reg_reg:
42 ; ALL-NEXT: vpminud %zmm1, %zmm0, %zmm2
43 ; ALL-NEXT: vpmaxud %zmm1, %zmm0, %zmm1
44 ; ALL-NEXT: vpsubd %zmm2, %zmm1, %zmm1
45 ; ALL-NEXT: vpsrld $1, %zmm1, %zmm1
46 ; ALL-NEXT: vpmulld %zmm1, %zmm1, %zmm1
47 ; ALL-NEXT: vpaddd %zmm0, %zmm1, %zmm0
49 %t3 = icmp ugt <16 x i32> %a1, %a2
50 %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
51 %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1
52 %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2
53 %t7 = sub <16 x i32> %t6, %t5
54 %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
55 %t9 = mul <16 x i32> %t16, %t16
56 %a10 = add <16 x i32> %t9, %a1
57 ret <16 x i32> %a10
58 }
60 ; Values are loaded. Only check signed case.
62 define <16 x i32> @vec512_i32_signed_mem_reg(ptr %a1_addr, <16 x i32> %a2) nounwind {
63 ; ALL-LABEL: vec512_i32_signed_mem_reg:
65 ; ALL-NEXT: vmovdqa64 (%rdi), %zmm1
66 ; ALL-NEXT: vpminsd %zmm0, %zmm1, %zmm2
67 ; ALL-NEXT: vpmaxsd %zmm0, %zmm1, %zmm0
68 ; ALL-NEXT: vpsubd %zmm2, %zmm0, %zmm0
69 ; ALL-NEXT: vpsrld $1, %zmm0, %zmm0
70 ; ALL-NEXT: vpmulld %zmm0, %zmm0, %zmm0
71 ; ALL-NEXT: vpaddd %zmm1, %zmm0, %zmm0
73 %a1 = load <16 x i32>, ptr %a1_addr
74 %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed
75 %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
76 %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1
77 %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2
78 %t7 = sub <16 x i32> %t6, %t5
79 %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
80 %t9 = mul nsw <16 x i32> %t16, %t16 ; signed
81 %a10 = add nsw <16 x i32> %t9, %a1 ; signed
82 ret <16 x i32> %a10
83 }
85 define <16 x i32> @vec512_i32_signed_reg_mem(<16 x i32> %a1, ptr %a2_addr) nounwind {
86 ; ALL-LABEL: vec512_i32_signed_reg_mem:
88 ; ALL-NEXT: vmovdqa64 (%rdi), %zmm1
89 ; ALL-NEXT: vpminsd %zmm1, %zmm0, %zmm2
90 ; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm1
91 ; ALL-NEXT: vpsubd %zmm2, %zmm1, %zmm1
92 ; ALL-NEXT: vpsrld $1, %zmm1, %zmm1
93 ; ALL-NEXT: vpmulld %zmm1, %zmm1, %zmm1
94 ; ALL-NEXT: vpaddd %zmm0, %zmm1, %zmm0
96 %a2 = load <16 x i32>, ptr %a2_addr
97 %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed
98 %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
99 %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1
100 %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2
101 %t7 = sub <16 x i32> %t6, %t5
102 %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
103 %t9 = mul nsw <16 x i32> %t16, %t16 ; signed
104 %a10 = add nsw <16 x i32> %t9, %a1 ; signed
105 ret <16 x i32> %a10
106 }
108 define <16 x i32> @vec512_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
109 ; ALL-LABEL: vec512_i32_signed_mem_mem:
111 ; ALL-NEXT: vmovdqa64 (%rdi), %zmm0
112 ; ALL-NEXT: vmovdqa64 (%rsi), %zmm1
113 ; ALL-NEXT: vpminsd %zmm1, %zmm0, %zmm2
114 ; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm1
115 ; ALL-NEXT: vpsubd %zmm2, %zmm1, %zmm1
116 ; ALL-NEXT: vpsrld $1, %zmm1, %zmm1
117 ; ALL-NEXT: vpmulld %zmm1, %zmm1, %zmm1
118 ; ALL-NEXT: vpaddd %zmm0, %zmm1, %zmm0
120 %a1 = load <16 x i32>, ptr %a1_addr
121 %a2 = load <16 x i32>, ptr %a2_addr
122 %t3 = icmp sgt <16 x i32> %a1, %a2 ; signed
123 %t4 = select <16 x i1> %t3, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
124 %t5 = select <16 x i1> %t3, <16 x i32> %a2, <16 x i32> %a1
125 %t6 = select <16 x i1> %t3, <16 x i32> %a1, <16 x i32> %a2
126 %t7 = sub <16 x i32> %t6, %t5
127 %t16 = lshr <16 x i32> %t7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
128 %t9 = mul nsw <16 x i32> %t16, %t16 ; signed
129 %a10 = add nsw <16 x i32> %t9, %a1 ; signed
130 ret <16 x i32> %a10
131 }
133 ; ---------------------------------------------------------------------------- ;
134 ; 64-bit width. 512 / 64 = 8 elts.
135 ; ---------------------------------------------------------------------------- ;
137 ; Values come from regs
139 define <8 x i64> @vec512_i64_signed_reg_reg(<8 x i64> %a1, <8 x i64> %a2) nounwind {
140 ; ALL-LABEL: vec512_i64_signed_reg_reg:
142 ; ALL-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
143 ; ALL-NEXT: vpminsq %zmm1, %zmm0, %zmm2
144 ; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
145 ; ALL-NEXT: vpsubq %zmm2, %zmm1, %zmm1
146 ; ALL-NEXT: vpsrlq $1, %zmm1, %zmm1
147 ; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
148 ; ALL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 {%k1}
149 ; ALL-NEXT: vpaddq %zmm0, %zmm1, %zmm0
151 %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed
152 %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
153 %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1
154 %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2
155 %t7 = sub <8 x i64> %t6, %t5
156 %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
157 %t9 = mul nsw <8 x i64> %t8, %t4 ; signed
158 %a10 = add nsw <8 x i64> %t9, %a1 ; signed
159 ret <8 x i64> %a10
160 }
162 define <8 x i64> @vec512_i64_unsigned_reg_reg(<8 x i64> %a1, <8 x i64> %a2) nounwind {
163 ; ALL-LABEL: vec512_i64_unsigned_reg_reg:
165 ; ALL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
166 ; ALL-NEXT: vpminuq %zmm1, %zmm0, %zmm2
167 ; ALL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1
168 ; ALL-NEXT: vpsubq %zmm2, %zmm1, %zmm1
169 ; ALL-NEXT: vpsrlq $1, %zmm1, %zmm1
170 ; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
171 ; ALL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 {%k1}
172 ; ALL-NEXT: vpaddq %zmm0, %zmm1, %zmm0
174 %t3 = icmp ugt <8 x i64> %a1, %a2
175 %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
176 %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1
177 %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2
178 %t7 = sub <8 x i64> %t6, %t5
179 %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
180 %t9 = mul <8 x i64> %t8, %t4
181 %a10 = add <8 x i64> %t9, %a1
182 ret <8 x i64> %a10
183 }
185 ; Values are loaded. Only check signed case.
187 define <8 x i64> @vec512_i64_signed_mem_reg(ptr %a1_addr, <8 x i64> %a2) nounwind {
188 ; ALL-LABEL: vec512_i64_signed_mem_reg:
190 ; ALL-NEXT: vmovdqa64 (%rdi), %zmm1
191 ; ALL-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
192 ; ALL-NEXT: vpminsq %zmm0, %zmm1, %zmm2
193 ; ALL-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
194 ; ALL-NEXT: vpsubq %zmm2, %zmm0, %zmm0
195 ; ALL-NEXT: vpsrlq $1, %zmm0, %zmm0
196 ; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
197 ; ALL-NEXT: vpsubq %zmm0, %zmm2, %zmm0 {%k1}
198 ; ALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
200 %a1 = load <8 x i64>, ptr %a1_addr
201 %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed
202 %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
203 %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1
204 %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2
205 %t7 = sub <8 x i64> %t6, %t5
206 %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
207 %t9 = mul nsw <8 x i64> %t8, %t4 ; signed
208 %a10 = add nsw <8 x i64> %t9, %a1 ; signed
209 ret <8 x i64> %a10
210 }
212 define <8 x i64> @vec512_i64_signed_reg_mem(<8 x i64> %a1, ptr %a2_addr) nounwind {
213 ; ALL-LABEL: vec512_i64_signed_reg_mem:
215 ; ALL-NEXT: vmovdqa64 (%rdi), %zmm1
216 ; ALL-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
217 ; ALL-NEXT: vpminsq %zmm1, %zmm0, %zmm2
218 ; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
219 ; ALL-NEXT: vpsubq %zmm2, %zmm1, %zmm1
220 ; ALL-NEXT: vpsrlq $1, %zmm1, %zmm1
221 ; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
222 ; ALL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 {%k1}
223 ; ALL-NEXT: vpaddq %zmm0, %zmm1, %zmm0
225 %a2 = load <8 x i64>, ptr %a2_addr
226 %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed
227 %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
228 %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1
229 %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2
230 %t7 = sub <8 x i64> %t6, %t5
231 %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
232 %t9 = mul nsw <8 x i64> %t8, %t4 ; signed
233 %a10 = add nsw <8 x i64> %t9, %a1 ; signed
234 ret <8 x i64> %a10
235 }
237 define <8 x i64> @vec512_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
238 ; ALL-LABEL: vec512_i64_signed_mem_mem:
240 ; ALL-NEXT: vmovdqa64 (%rdi), %zmm0
241 ; ALL-NEXT: vmovdqa64 (%rsi), %zmm1
242 ; ALL-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
243 ; ALL-NEXT: vpminsq %zmm1, %zmm0, %zmm2
244 ; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
245 ; ALL-NEXT: vpsubq %zmm2, %zmm1, %zmm1
246 ; ALL-NEXT: vpsrlq $1, %zmm1, %zmm1
247 ; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
248 ; ALL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 {%k1}
249 ; ALL-NEXT: vpaddq %zmm0, %zmm1, %zmm0
251 %a1 = load <8 x i64>, ptr %a1_addr
252 %a2 = load <8 x i64>, ptr %a2_addr
253 %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed
254 %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
255 %t5 = select <8 x i1> %t3, <8 x i64> %a2, <8 x i64> %a1
256 %t6 = select <8 x i1> %t3, <8 x i64> %a1, <8 x i64> %a2
257 %t7 = sub <8 x i64> %t6, %t5
258 %t8 = lshr <8 x i64> %t7, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
259 %t9 = mul nsw <8 x i64> %t8, %t4 ; signed
260 %a10 = add nsw <8 x i64> %t9, %a1 ; signed
261 ret <8 x i64> %a10
262 }
264 ; ---------------------------------------------------------------------------- ;
265 ; 16-bit width. 512 / 16 = 32 elts.
266 ; ---------------------------------------------------------------------------- ;
268 ; Values come from regs
270 define <32 x i16> @vec512_i16_signed_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nounwind {
271 ; AVX512F-LABEL: vec512_i16_signed_reg_reg:
273 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
274 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
275 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
276 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5
277 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
278 ; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5
279 ; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
280 ; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2
281 ; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5
282 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
283 ; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1
284 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
285 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
286 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
287 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
288 ; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
289 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
290 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
291 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
292 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
293 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
294 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
295 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
298 ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_reg:
299 ; AVX512VL-FALLBACK: # %bb.0:
300 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
301 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
302 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
303 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5
304 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
305 ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5
306 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
307 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2
308 ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5
309 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
310 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1
311 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
312 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
313 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
314 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
315 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
316 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
317 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
318 ; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
319 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
320 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
321 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
322 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
323 ; AVX512VL-FALLBACK-NEXT: retq
325 ; AVX512BW-LABEL: vec512_i16_signed_reg_reg:
327 ; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
328 ; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm2
329 ; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm1
330 ; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1
331 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
332 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
333 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm1 {%k1}
334 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
335 ; AVX512BW-NEXT: retq
336 %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed
337 %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
338 %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1
339 %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2
340 %t7 = sub <32 x i16> %t6, %t5
341 %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
342 %t9 = mul nsw <32 x i16> %t16, %t4 ; signed
343 %a10 = add nsw <32 x i16> %t9, %a1 ; signed
344 ret <32 x i16> %a10
345 }
347 define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nounwind {
348 ; AVX512F-LABEL: vec512_i16_unsigned_reg_reg:
350 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
351 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
352 ; AVX512F-NEXT: vpminuw %ymm2, %ymm3, %ymm4
353 ; AVX512F-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5
354 ; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm6
355 ; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm7
356 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
357 ; AVX512F-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2
358 ; AVX512F-NEXT: vpsubw %ymm4, %ymm2, %ymm2
359 ; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
360 ; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1
361 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
362 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
363 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
364 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
365 ; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
366 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
367 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
368 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4))
369 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
370 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
371 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
372 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
375 ; AVX512VL-FALLBACK-LABEL: vec512_i16_unsigned_reg_reg:
376 ; AVX512VL-FALLBACK: # %bb.0:
377 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
378 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
379 ; AVX512VL-FALLBACK-NEXT: vpminuw %ymm2, %ymm3, %ymm4
380 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5
381 ; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm6
382 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm7
383 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
384 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2
385 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm4, %ymm2, %ymm2
386 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
387 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1
388 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
389 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
390 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
391 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
392 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
393 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
394 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
395 ; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4))
396 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
397 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
398 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
399 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
400 ; AVX512VL-FALLBACK-NEXT: retq
402 ; AVX512BW-LABEL: vec512_i16_unsigned_reg_reg:
404 ; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k1
405 ; AVX512BW-NEXT: vpminuw %zmm1, %zmm0, %zmm2
406 ; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm1
407 ; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1
408 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
409 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
410 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm1 {%k1}
411 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
412 ; AVX512BW-NEXT: retq
413 %t3 = icmp ugt <32 x i16> %a1, %a2
414 %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
415 %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1
416 %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2
417 %t7 = sub <32 x i16> %t6, %t5
418 %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
419 %t9 = mul <32 x i16> %t16, %t4
420 %a10 = add <32 x i16> %t9, %a1
421 ret <32 x i16> %a10
422 }
424 ; Values are loaded. Only check signed case.
426 define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounwind {
427 ; AVX512F-LABEL: vec512_i16_signed_mem_reg:
429 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
430 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
431 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
432 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
433 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5
434 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
435 ; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm5
436 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
437 ; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1
438 ; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm5
439 ; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
440 ; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0
441 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
442 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
443 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
444 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
445 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
446 ; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0
447 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
448 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
449 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
450 ; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
451 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
452 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
455 ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_reg:
456 ; AVX512VL-FALLBACK: # %bb.0:
457 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
458 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
459 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
460 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
461 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5
462 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
463 ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm5
464 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
465 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1
466 ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm5
467 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
468 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0
469 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
470 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
471 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
472 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
473 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
474 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0
475 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
476 ; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
477 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
478 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
479 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
480 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
481 ; AVX512VL-FALLBACK-NEXT: retq
483 ; AVX512BW-LABEL: vec512_i16_signed_mem_reg:
485 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
486 ; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1
487 ; AVX512BW-NEXT: vpminsw %zmm0, %zmm1, %zmm2
488 ; AVX512BW-NEXT: vpmaxsw %zmm0, %zmm1, %zmm0
489 ; AVX512BW-NEXT: vpsubw %zmm2, %zmm0, %zmm0
490 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0
491 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
492 ; AVX512BW-NEXT: vpsubw %zmm0, %zmm2, %zmm0 {%k1}
493 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
494 ; AVX512BW-NEXT: retq
495 %a1 = load <32 x i16>, ptr %a1_addr
496 %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed
497 %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
498 %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1
499 %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2
500 %t7 = sub <32 x i16> %t6, %t5
501 %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
502 %t9 = mul nsw <32 x i16> %t16, %t4 ; signed
503 %a10 = add nsw <32 x i16> %t9, %a1 ; signed
504 ret <32 x i16> %a10
505 }
507 define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounwind {
508 ; AVX512F-LABEL: vec512_i16_signed_reg_mem:
510 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
511 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2
512 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
513 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
514 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5
515 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
516 ; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5
517 ; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
518 ; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2
519 ; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5
520 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
521 ; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1
522 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
523 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
524 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
525 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
526 ; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
527 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
528 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
529 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
530 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
531 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
532 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
533 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
536 ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem:
537 ; AVX512VL-FALLBACK: # %bb.0:
538 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
539 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2
540 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
541 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
542 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5
543 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
544 ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5
545 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
546 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2
547 ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5
548 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
549 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1
550 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
551 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
552 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
553 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
554 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
555 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
556 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
557 ; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
558 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
559 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
560 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
561 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
562 ; AVX512VL-FALLBACK-NEXT: retq
564 ; AVX512BW-LABEL: vec512_i16_signed_reg_mem:
566 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
567 ; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
568 ; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm2
569 ; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm1
570 ; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1
571 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
572 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
573 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm1 {%k1}
574 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
575 ; AVX512BW-NEXT: retq
576 %a2 = load <32 x i16>, ptr %a2_addr
577 %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed
578 %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
579 %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1
580 %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2
581 %t7 = sub <32 x i16> %t6, %t5
582 %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
583 %t9 = mul nsw <32 x i16> %t16, %t4 ; signed
584 %a10 = add nsw <32 x i16> %t9, %a1 ; signed
585 ret <32 x i16> %a10
586 }
588 define <32 x i16> @vec512_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
589 ; AVX512F-LABEL: vec512_i16_signed_mem_mem:
591 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
592 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
593 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
594 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
595 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
596 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5
597 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
598 ; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm5
599 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
600 ; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1
601 ; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm5
602 ; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
603 ; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0
604 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
605 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
606 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
607 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
608 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
609 ; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0
610 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
611 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
612 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
613 ; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
614 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
615 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
618 ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_mem:
619 ; AVX512VL-FALLBACK: # %bb.0:
620 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm0
621 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm1
622 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
623 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
624 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
625 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5
626 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
627 ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm5
628 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
629 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1
630 ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm5
631 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
632 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0
633 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
634 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
635 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
636 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
637 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
638 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0
639 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
640 ; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
641 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
642 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
643 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
644 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
645 ; AVX512VL-FALLBACK-NEXT: retq
647 ; AVX512BW-LABEL: vec512_i16_signed_mem_mem:
649 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
650 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1
651 ; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
652 ; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm2
653 ; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm1
654 ; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1
655 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
656 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
657 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm1 {%k1}
658 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
659 ; AVX512BW-NEXT: retq
660 %a1 = load <32 x i16>, ptr %a1_addr
661 %a2 = load <32 x i16>, ptr %a2_addr
662 %t3 = icmp sgt <32 x i16> %a1, %a2 ; signed
663 %t4 = select <32 x i1> %t3, <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <32 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
664 %t5 = select <32 x i1> %t3, <32 x i16> %a2, <32 x i16> %a1
665 %t6 = select <32 x i1> %t3, <32 x i16> %a1, <32 x i16> %a2
666 %t7 = sub <32 x i16> %t6, %t5
667 %t16 = lshr <32 x i16> %t7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
668 %t9 = mul nsw <32 x i16> %t16, %t4 ; signed
669 %a10 = add nsw <32 x i16> %t9, %a1 ; signed
670 ret <32 x i16> %a10
671 }
673 ; ---------------------------------------------------------------------------- ;
674 ; 8-bit width. 512 / 8 = 64 elts.
675 ; ---------------------------------------------------------------------------- ;
677 ; Values come from regs
679 define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind {
680 ; AVX512F-LABEL: vec512_i8_signed_reg_reg:
682 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
683 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
684 ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
685 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
686 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
687 ; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5
688 ; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
689 ; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
690 ; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
691 ; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
692 ; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
693 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
694 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
695 ; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
696 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
697 ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
698 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
699 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
700 ; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
701 ; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
702 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
703 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
704 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
705 ; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
706 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
707 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
710 ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_reg:
711 ; AVX512VL-FALLBACK: # %bb.0:
712 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
713 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
714 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
715 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
716 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
717 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
718 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
719 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
720 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
721 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
722 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
723 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
724 ; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
725 ; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm2, %ymm2
726 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
727 ; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1
728 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
729 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
730 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
731 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
732 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
733 ; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
734 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
735 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2
736 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
737 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
738 ; AVX512VL-FALLBACK-NEXT: retq
740 ; AVX512BW-LABEL: vec512_i8_signed_reg_reg:
742 ; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
743 ; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm2
744 ; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm1
745 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
746 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
747 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
748 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
749 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm1 {%k1}
750 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
751 ; AVX512BW-NEXT: retq
752 %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed
753 %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
754 %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
755 %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
756 %t7 = sub <64 x i8> %t6, %t5
757 %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
758 %t9 = mul nsw <64 x i8> %t8, %t4 ; signed
759 %a10 = add nsw <64 x i8> %t9, %a1 ; signed
760 ret <64 x i8> %a10
761 }
763 define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind {
764 ; AVX512F-LABEL: vec512_i8_unsigned_reg_reg:
766 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
767 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
768 ; AVX512F-NEXT: vpminub %ymm2, %ymm3, %ymm4
769 ; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5
770 ; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm6
771 ; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
772 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
773 ; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
774 ; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
775 ; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
776 ; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
777 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
778 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
779 ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
780 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
781 ; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
782 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
783 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
784 ; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
785 ; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
786 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
787 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4))
788 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
789 ; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
790 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
791 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
794 ; AVX512VL-FALLBACK-LABEL: vec512_i8_unsigned_reg_reg:
795 ; AVX512VL-FALLBACK: # %bb.0:
796 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
797 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
798 ; AVX512VL-FALLBACK-NEXT: vpminub %ymm2, %ymm3, %ymm4
799 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5
800 ; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6
801 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
802 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
803 ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
804 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1
805 ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
806 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2
807 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
808 ; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
809 ; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm2, %ymm2
810 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
811 ; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1
812 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
813 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
814 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
815 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
816 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
817 ; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4))
818 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
819 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2
820 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
821 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
822 ; AVX512VL-FALLBACK-NEXT: retq
824 ; AVX512BW-LABEL: vec512_i8_unsigned_reg_reg:
826 ; AVX512BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
827 ; AVX512BW-NEXT: vpminub %zmm1, %zmm0, %zmm2
828 ; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm1
829 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
830 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
831 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
832 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
833 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm1 {%k1}
834 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
835 ; AVX512BW-NEXT: retq
836 %t3 = icmp ugt <64 x i8> %a1, %a2
837 %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
838 %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
839 %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
840 %t7 = sub <64 x i8> %t6, %t5
841 %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
842 %t9 = mul <64 x i8> %t8, %t4
843 %a10 = add <64 x i8> %t9, %a1
844 ret <64 x i8> %a10
845 }
847 ; Values are loaded. Only check signed case.
849 define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind {
850 ; AVX512F-LABEL: vec512_i8_signed_mem_reg:
852 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
853 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
854 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
855 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
856 ; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
857 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
858 ; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5
859 ; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
860 ; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
861 ; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5
862 ; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
863 ; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0
864 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
865 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
866 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
867 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
868 ; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
869 ; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
870 ; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
871 ; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
872 ; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
873 ; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0
874 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
875 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
876 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
877 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
878 ; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
879 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
882 ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg:
883 ; AVX512VL-FALLBACK: # %bb.0:
884 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
885 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
886 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
887 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
888 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
889 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
890 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5
891 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
892 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
893 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5
894 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
895 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0
896 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
897 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
898 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
899 ; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
900 ; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
901 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
902 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
903 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
904 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0
905 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0
906 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
907 ; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
908 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
909 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1
910 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0
911 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
912 ; AVX512VL-FALLBACK-NEXT: retq
914 ; AVX512BW-LABEL: vec512_i8_signed_mem_reg:
916 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
917 ; AVX512BW-NEXT: vpcmpgtb %zmm0, %zmm1, %k1
918 ; AVX512BW-NEXT: vpminsb %zmm0, %zmm1, %zmm2
919 ; AVX512BW-NEXT: vpmaxsb %zmm0, %zmm1, %zmm0
920 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0
921 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0
922 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
923 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
924 ; AVX512BW-NEXT: vpsubb %zmm0, %zmm2, %zmm0 {%k1}
925 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
926 ; AVX512BW-NEXT: retq
927 %a1 = load <64 x i8>, ptr %a1_addr
928 %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed
929 %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
930 %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
931 %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
932 %t7 = sub <64 x i8> %t6, %t5
933 %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
934 %t9 = mul nsw <64 x i8> %t8, %t4 ; signed
935 %a10 = add nsw <64 x i8> %t9, %a1 ; signed
936 ret <64 x i8> %a10
937 }
939 define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind {
940 ; AVX512F-LABEL: vec512_i8_signed_reg_mem:
942 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
943 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2
944 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
945 ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
946 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
947 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
948 ; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
949 ; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
950 ; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
951 ; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5
952 ; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
953 ; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
954 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
955 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
956 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
957 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
958 ; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
959 ; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
960 ; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
961 ; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
962 ; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
963 ; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
964 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
965 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
966 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
967 ; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
968 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
969 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
972 ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem:
973 ; AVX512VL-FALLBACK: # %bb.0:
974 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
975 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2
976 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
977 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
978 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
979 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
980 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
981 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
982 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
983 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
984 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
985 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
986 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
987 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
988 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
989 ; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
990 ; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
991 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
992 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
993 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
994 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
995 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
996 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
997 ; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
998 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
999 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2
1000 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
1001 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1002 ; AVX512VL-FALLBACK-NEXT: retq
; AVX512BW-LABEL: vec512_i8_signed_reg_mem:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm1 {%k1}
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
  %a2 = load <64 x i8>, ptr %a2_addr
  %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed
  %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
  %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
  %t7 = sub <64 x i8> %t6, %t5
  %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <64 x i8> %t8, %t4 ; signed
  %a10 = add nsw <64 x i8> %t9, %a1 ; signed
  ret <64 x i8> %a10
}

define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i8_signed_mem_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5
; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5
; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem:
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm0
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i8_signed_mem_mem:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1
; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm1 {%k1}
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
  %a1 = load <64 x i8>, ptr %a1_addr
  %a2 = load <64 x i8>, ptr %a2_addr
  %t3 = icmp sgt <64 x i8> %a1, %a2 ; signed
  %t4 = select <64 x i1> %t3, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t5 = select <64 x i1> %t3, <64 x i8> %a2, <64 x i8> %a1
  %t6 = select <64 x i1> %t3, <64 x i8> %a1, <64 x i8> %a2
  %t7 = sub <64 x i8> %t6, %t5
  %t8 = lshr <64 x i8> %t7, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %t9 = mul nsw <64 x i8> %t8, %t4 ; signed
  %a10 = add nsw <64 x i8> %t9, %a1 ; signed