; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
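
; The functions below implement the byte/word averaging idiom
; avg(a, b) = (a + b + 1) >> 1 in widened arithmetic and then select the
; result against a bitcast scalar mask. With AVX512BW+VL the select folds
; into the mask operand of vpavgb/vpavgw; with plain AVX512F the average is
; computed unmasked and the mask is materialized through k-registers
; (vpternlog plus a vpmov* truncate) and applied with vpblendvb (merge
; masking) or vpand (zero masking).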
define <16 x i8> @avg_v16i8_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %src, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i8_mask:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i8_mask:
; AVX512BWVL:       # BB#0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %xmm1, %xmm0, %xmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i8> %a to <16 x i16>
  %zb = zext <16 x i8> %b to <16 x i16>
  %add = add nuw nsw <16 x i16> %za, %zb
  %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <16 x i16> %lshr to <16 x i8>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> %src
  ret <16 x i8> %res
}

define <16 x i8> @avg_v16i8_maskz(<16 x i8> %a, <16 x i8> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i8_maskz:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i8_maskz:
; AVX512BWVL:       # BB#0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i8> %a to <16 x i16>
  %zb = zext <16 x i8> %b to <16 x i16>
  %add = add nuw nsw <16 x i16> %za, %zb
  %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <16 x i16> %lshr to <16 x i8>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}

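; For masks wider than 16 bits, plain AVX512F has no kmovd/kmovq, so the
; scalar mask is spilled to an aligned stack slot and reloaded 16 bits at a
; time into k-registers with kmovw; hence the rbp-based frame setup below.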
define <32 x i8> @avg_v32i8_mask(<32 x i8> %a, <32 x i8> %b, <32 x i8> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_mask:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    pushq %rbp
; AVX512F-NEXT:    movq %rsp, %rbp
; AVX512F-NEXT:    andq $-32, %rsp
; AVX512F-NEXT:    subq $32, %rsp
; AVX512F-NEXT:    movl %edi, (%rsp)
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw (%rsp), %k1
; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    movq %rbp, %rsp
; AVX512F-NEXT:    popq %rbp
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i8_mask:
; AVX512BWVL:       # BB#0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i8> %a to <32 x i16>
  %zb = zext <32 x i8> %b to <32 x i16>
  %add = add nuw nsw <32 x i16> %za, %zb
  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <32 x i16> %lshr to <32 x i8>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> %src
  ret <32 x i8> %res
}

define <32 x i8> @avg_v32i8_maskz(<32 x i8> %a, <32 x i8> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_maskz:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    pushq %rbp
; AVX512F-NEXT:    movq %rsp, %rbp
; AVX512F-NEXT:    andq $-32, %rsp
; AVX512F-NEXT:    subq $32, %rsp
; AVX512F-NEXT:    movl %edi, (%rsp)
; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw (%rsp), %k1
; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    movq %rbp, %rsp
; AVX512F-NEXT:    popq %rbp
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i8_maskz:
; AVX512BWVL:       # BB#0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i8> %a to <32 x i16>
  %zb = zext <32 x i8> %b to <32 x i16>
  %add = add nuw nsw <32 x i16> %za, %zb
  %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <32 x i16> %lshr to <32 x i8>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}

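; For 64 x i8 without AVX512BW there is no 512-bit byte average, so the
; operation is legalized as four 128-bit vpavgb halves reassembled with
; vinserti128, with the i64 mask split across four kmovw loads.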
define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_mask:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    pushq %rbp
; AVX512F-NEXT:    movq %rsp, %rbp
; AVX512F-NEXT:    andq $-32, %rsp
; AVX512F-NEXT:    subq $64, %rsp
; AVX512F-NEXT:    movq %rdi, %rax
; AVX512F-NEXT:    shrq $32, %rax
; AVX512F-NEXT:    movl %eax, {{[0-9]+}}(%rsp)
; AVX512F-NEXT:    movl %edi, (%rsp)
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm6
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm8
; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm7
; AVX512F-NEXT:    vpavgb %xmm7, %xmm6, %xmm6
; AVX512F-NEXT:    vextracti128 $1, %ymm3, %xmm7
; AVX512F-NEXT:    vpavgb %xmm7, %xmm8, %xmm7
; AVX512F-NEXT:    vpavgb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm7, %ymm1, %ymm1
; AVX512F-NEXT:    vpavgb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
; AVX512F-NEXT:    kmovw (%rsp), %k1
; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
; AVX512F-NEXT:    movq %rbp, %rsp
; AVX512F-NEXT:    popq %rbp
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v64i8_mask:
; AVX512BWVL:       # BB#0:
; AVX512BWVL-NEXT:    kmovq %rdi, %k1
; AVX512BWVL-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <64 x i8> %a to <64 x i16>
  %zb = zext <64 x i8> %b to <64 x i16>
  %add = add nuw nsw <64 x i16> %za, %zb
  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <64 x i16> %lshr to <64 x i8>
  %mask1 = bitcast i64 %mask to <64 x i1>
  %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> %src
  ret <64 x i8> %res
}

define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_maskz:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    pushq %rbp
; AVX512F-NEXT:    movq %rsp, %rbp
; AVX512F-NEXT:    andq $-32, %rsp
; AVX512F-NEXT:    subq $64, %rsp
; AVX512F-NEXT:    movq %rdi, %rax
; AVX512F-NEXT:    shrq $32, %rax
; AVX512F-NEXT:    movl %eax, {{[0-9]+}}(%rsp)
; AVX512F-NEXT:    movl %edi, (%rsp)
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm4
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm5
; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm6
; AVX512F-NEXT:    vpavgb %xmm6, %xmm4, %xmm4
; AVX512F-NEXT:    vextracti128 $1, %ymm3, %xmm6
; AVX512F-NEXT:    vpavgb %xmm6, %xmm5, %xmm5
; AVX512F-NEXT:    vpavgb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpavgb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpand %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    kmovw (%rsp), %k1
; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpand %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    movq %rbp, %rsp
; AVX512F-NEXT:    popq %rbp
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v64i8_maskz:
; AVX512BWVL:       # BB#0:
; AVX512BWVL-NEXT:    kmovq %rdi, %k1
; AVX512BWVL-NEXT:    vpavgb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <64 x i8> %a to <64 x i16>
  %zb = zext <64 x i8> %b to <64 x i16>
  %add = add nuw nsw <64 x i16> %za, %zb
  %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %trunc = trunc <64 x i16> %lshr to <64 x i8>
  %mask1 = bitcast i64 %mask to <64 x i1>
  %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> zeroinitializer
  ret <64 x i8> %res
}

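; The i16-element variants follow: the same idiom maps to vpavgw. For 8
; lanes the AVX512F fallback widens the mask through qword lanes
; (vpternlogq + vpmovqw) instead of dword lanes.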
define <8 x i16> @avg_v8i16_mask(<8 x i16> %a, <8 x i16> %b, <8 x i16> %src, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_mask:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
; AVX512F-NEXT:    vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v8i16_mask:
; AVX512BWVL:       # BB#0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %xmm1, %xmm0, %xmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <8 x i16> %a to <8 x i32>
  %zb = zext <8 x i16> %b to <8 x i32>
  %add = add nuw nsw <8 x i32> %za, %zb
  %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <8 x i32> %lshr to <8 x i16>
  %mask1 = bitcast i8 %mask to <8 x i1>
  %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> %src
  ret <8 x i16> %res
}

define <8 x i16> @avg_v8i16_maskz(<8 x i16> %a, <8 x i16> %b, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_maskz:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
; AVX512F-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v8i16_maskz:
; AVX512BWVL:       # BB#0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <8 x i16> %a to <8 x i32>
  %zb = zext <8 x i16> %b to <8 x i32>
  %add = add nuw nsw <8 x i32> %za, %zb
  %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <8 x i32> %lshr to <8 x i16>
  %mask1 = bitcast i8 %mask to <8 x i1>
  %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

define <16 x i16> @avg_v16i16_mask(<16 x i16> %a, <16 x i16> %b, <16 x i16> %src, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_mask:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i16_mask:
; AVX512BWVL:       # BB#0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i16> %a to <16 x i32>
  %zb = zext <16 x i16> %b to <16 x i32>
  %add = add nuw nsw <16 x i32> %za, %zb
  %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <16 x i32> %lshr to <16 x i16>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> %src
  ret <16 x i16> %res
}

define <16 x i16> @avg_v16i16_maskz(<16 x i16> %a, <16 x i16> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_maskz:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v16i16_maskz:
; AVX512BWVL:       # BB#0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <16 x i16> %a to <16 x i32>
  %zb = zext <16 x i16> %b to <16 x i32>
  %add = add nuw nsw <16 x i32> %za, %zb
  %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <16 x i32> %lshr to <16 x i16>
  %mask1 = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

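; For 32 x i16 the AVX512F fallback operates on ymm halves; each 16-bit mask
; chunk is expanded to bytes (vpternlogd + vpmovdb), re-widened to words with
; vpmovzxbw, and turned into all-ones lanes via vpsllw/vpsraw before
; vpblendvb (merge) or vpand (zero).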
define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_mask:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    pushq %rbp
; AVX512F-NEXT:    movq %rsp, %rbp
; AVX512F-NEXT:    andq $-32, %rsp
; AVX512F-NEXT:    subq $32, %rsp
; AVX512F-NEXT:    movl %edi, (%rsp)
; AVX512F-NEXT:    kmovw (%rsp), %k1
; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm6, %xmm6
; AVX512F-NEXT:    vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm7, %xmm7
; AVX512F-NEXT:    vpavgw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpavgw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero,xmm7[8],zero,xmm7[9],zero,xmm7[10],zero,xmm7[11],zero,xmm7[12],zero,xmm7[13],zero,xmm7[14],zero,xmm7[15],zero
; AVX512F-NEXT:    vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT:    vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
; AVX512F-NEXT:    vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT:    vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
; AVX512F-NEXT:    movq %rbp, %rsp
; AVX512F-NEXT:    popq %rbp
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i16_mask:
; AVX512BWVL:       # BB#0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT:    vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i16> %a to <32 x i32>
  %zb = zext <32 x i16> %b to <32 x i32>
  %add = add nuw nsw <32 x i32> %za, %zb
  %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <32 x i32> %lshr to <32 x i16>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> %src
  ret <32 x i16> %res
}

define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_maskz:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    pushq %rbp
; AVX512F-NEXT:    movq %rsp, %rbp
; AVX512F-NEXT:    andq $-32, %rsp
; AVX512F-NEXT:    subq $32, %rsp
; AVX512F-NEXT:    movl %edi, (%rsp)
; AVX512F-NEXT:    kmovw (%rsp), %k1
; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
; AVX512F-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm5, %xmm5
; AVX512F-NEXT:    vpavgw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpavgw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512F-NEXT:    vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT:    vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT:    vpand %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512F-NEXT:    vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT:    vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT:    vpand %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    movq %rbp, %rsp
; AVX512F-NEXT:    popq %rbp
; AVX512F-NEXT:    retq
;
; AVX512BWVL-LABEL: avg_v32i16_maskz:
; AVX512BWVL:       # BB#0:
; AVX512BWVL-NEXT:    kmovd %edi, %k1
; AVX512BWVL-NEXT:    vpavgw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT:    retq
  %za = zext <32 x i16> %a to <32 x i32>
  %zb = zext <32 x i16> %b to <32 x i32>
  %add = add nuw nsw <32 x i32> %za, %zb
  %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %trunc = trunc <32 x i32> %lshr to <32 x i16>
  %mask1 = bitcast i32 %mask to <32 x i1>
  %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}