; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ

;
; trunc(abs(sub(zext(a),zext(b)))) -> abdu(a,b)
;
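; For example (hypothetical values, not taken from the tests below): with
; unsigned i8 inputs a = 36 and b = 200, zext to i64 gives 36 and 200, sub
; gives -164, abs gives 164, and trunc back to i8 gives 164, which equals
; max(a,b) - min(a,b) = abdu(a,b).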

define <64 x i8> @abd_ext_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: abd_ext_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_ext_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminub %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminub %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %aext = zext <64 x i8> %a to <64 x i64>
  %bext = zext <64 x i8> %b to <64 x i64>
  %sub = sub <64 x i64> %aext, %bext
  %abs = call <64 x i64> @llvm.abs.v64i64(<64 x i64> %sub, i1 false)
  %trunc = trunc <64 x i64> %abs to <64 x i8>
  ret <64 x i8> %trunc
}

define <64 x i8> @abd_ext_v64i8_undef(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: abd_ext_v64i8_undef:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_ext_v64i8_undef:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminub %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminub %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %aext = zext <64 x i8> %a to <64 x i64>
  %bext = zext <64 x i8> %b to <64 x i64>
  %sub = sub <64 x i64> %aext, %bext
  %abs = call <64 x i64> @llvm.abs.v64i64(<64 x i64> %sub, i1 true)
  %trunc = trunc <64 x i64> %abs to <64 x i8>
  ret <64 x i8> %trunc
}

define <32 x i16> @abd_ext_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512BW-LABEL: abd_ext_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_ext_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminuw %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxuw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminuw %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %aext = zext <32 x i16> %a to <32 x i64>
  %bext = zext <32 x i16> %b to <32 x i64>
  %sub = sub <32 x i64> %aext, %bext
  %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 false)
  %trunc = trunc <32 x i64> %abs to <32 x i16>
  ret <32 x i16> %trunc
}

define <32 x i16> @abd_ext_v32i16_undef(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512BW-LABEL: abd_ext_v32i16_undef:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_ext_v32i16_undef:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminuw %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxuw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminuw %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %aext = zext <32 x i16> %a to <32 x i64>
  %bext = zext <32 x i16> %b to <32 x i64>
  %sub = sub <32 x i64> %aext, %bext
  %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 true)
  %trunc = trunc <32 x i64> %abs to <32 x i16>
  ret <32 x i16> %trunc
}

define <16 x i32> @abd_ext_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: abd_ext_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %aext = zext <16 x i32> %a to <16 x i64>
  %bext = zext <16 x i32> %b to <16 x i64>
  %sub = sub <16 x i64> %aext, %bext
  %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 false)
  %trunc = trunc <16 x i64> %abs to <16 x i32>
  ret <16 x i32> %trunc
}

define <16 x i32> @abd_ext_v16i32_undef(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: abd_ext_v16i32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %aext = zext <16 x i32> %a to <16 x i64>
  %bext = zext <16 x i32> %b to <16 x i64>
  %sub = sub <16 x i64> %aext, %bext
  %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 true)
  %trunc = trunc <16 x i64> %abs to <16 x i32>
  ret <16 x i32> %trunc
}

define <8 x i64> @abd_ext_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: abd_ext_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %aext = zext <8 x i64> %a to <8 x i128>
  %bext = zext <8 x i64> %b to <8 x i128>
  %sub = sub <8 x i128> %aext, %bext
  %abs = call <8 x i128> @llvm.abs.v8i128(<8 x i128> %sub, i1 false)
  %trunc = trunc <8 x i128> %abs to <8 x i64>
  ret <8 x i64> %trunc
}

define <8 x i64> @abd_ext_v8i64_undef(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: abd_ext_v8i64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %aext = zext <8 x i64> %a to <8 x i128>
  %bext = zext <8 x i64> %b to <8 x i128>
  %sub = sub <8 x i128> %aext, %bext
  %abs = call <8 x i128> @llvm.abs.v8i128(<8 x i128> %sub, i1 true)
  %trunc = trunc <8 x i128> %abs to <8 x i64>
  ret <8 x i64> %trunc
}

;
; sub(umax(a,b),umin(a,b)) -> abdu(a,b)
;
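; For example (hypothetical values, not taken from the tests below): with
; unsigned i16 inputs a = 10 and b = 500, umax(a,b) = 500 and umin(a,b) = 10,
; so the subtraction yields 490, the unsigned absolute difference abdu(a,b).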

define <64 x i8> @abd_minmax_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: abd_minmax_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_minmax_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminub %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminub %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %min = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %a, <64 x i8> %b)
  %max = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %a, <64 x i8> %b)
  %sub = sub <64 x i8> %max, %min
  ret <64 x i8> %sub
}

define <32 x i16> @abd_minmax_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512BW-LABEL: abd_minmax_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_minmax_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminuw %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxuw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminuw %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %min = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %a, <32 x i16> %b)
  %max = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %a, <32 x i16> %b)
  %sub = sub <32 x i16> %max, %min
  ret <32 x i16> %sub
}

define <16 x i32> @abd_minmax_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: abd_minmax_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %min = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %a, <16 x i32> %b)
  %max = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %a, <16 x i32> %b)
  %sub = sub <16 x i32> %max, %min
  ret <16 x i32> %sub
}

define <8 x i64> @abd_minmax_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: abd_minmax_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %min = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %a, <8 x i64> %b)
  %max = call <8 x i64> @llvm.umax.v8i64(<8 x i64> %a, <8 x i64> %b)
  %sub = sub <8 x i64> %max, %min
  ret <8 x i64> %sub
}

;
; select(icmp(a,b),sub(a,b),sub(b,a)) -> abdu(a,b)
;
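; The tests below use different unsigned predicates (ugt, uge, and ult with the
; select operands commuted); each select still computes the unsigned absolute
; difference. For example (hypothetical values): with a = 7 and b = 9, a ugt b
; is false, so the select picks b - a = 2 = abdu(a,b).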

define <64 x i8> @abd_cmp_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-LABEL: abd_cmp_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_cmp_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminub %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminub %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %cmp = icmp ugt <64 x i8> %a, %b
  %ab = sub <64 x i8> %a, %b
  %ba = sub <64 x i8> %b, %a
  %sel = select <64 x i1> %cmp, <64 x i8> %ab, <64 x i8> %ba
  ret <64 x i8> %sel
}

define <32 x i16> @abd_cmp_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512BW-LABEL: abd_cmp_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: abd_cmp_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT:    vpminuw %ymm2, %ymm3, %ymm4
; AVX512DQ-NEXT:    vpmaxuw %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpminuw %ymm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
  %cmp = icmp uge <32 x i16> %a, %b
  %ab = sub <32 x i16> %a, %b
  %ba = sub <32 x i16> %b, %a
  %sel = select <32 x i1> %cmp, <32 x i16> %ab, <32 x i16> %ba
  ret <32 x i16> %sel
}

define <16 x i32> @abd_cmp_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: abd_cmp_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %cmp = icmp ult <16 x i32> %a, %b
  %ab = sub <16 x i32> %a, %b
  %ba = sub <16 x i32> %b, %a
  %sel = select <16 x i1> %cmp, <16 x i32> %ba, <16 x i32> %ab
  ret <16 x i32> %sel
}

define <8 x i64> @abd_cmp_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: abd_cmp_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %cmp = icmp uge <8 x i64> %a, %b
  %ab = sub <8 x i64> %a, %b
  %ba = sub <8 x i64> %b, %a
  %sel = select <8 x i1> %cmp, <8 x i64> %ab, <8 x i64> %ba
  ret <8 x i64> %sel
}

declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1)
declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1)
declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
declare <32 x i64> @llvm.abs.v32i64(<32 x i64>, i1)
declare <64 x i64> @llvm.abs.v64i64(<64 x i64>, i1)
declare <8 x i128> @llvm.abs.v8i128(<8 x i128>, i1)

declare <64 x i8> @llvm.umax.v64i8(<64 x i8>, <64 x i8>)
declare <32 x i16> @llvm.umax.v32i16(<32 x i16>, <32 x i16>)
declare <16 x i32> @llvm.umax.v16i32(<16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.umax.v8i64(<8 x i64>, <8 x i64>)

declare <64 x i8> @llvm.umin.v64i8(<64 x i8>, <64 x i8>)
declare <32 x i16> @llvm.umin.v32i16(<32 x i16>, <32 x i16>)
declare <16 x i32> @llvm.umin.v16i32(<16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.umin.v8i64(<8 x i64>, <8 x i64>)