; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
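; The tests below verify that the unsigned rounding-average idiom (zero-extend to
; i32, add, add 1, logical shift right by 1, truncate) is selected as PAVGB/PAVGW
; at each vector width for the SSE2, AVX1, AVX2, AVX512F and AVX512BW run lines.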
8 define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) nounwind {
9 ; SSE2-LABEL: avg_v4i8:
11 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
12 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
13 ; SSE2-NEXT: pavgb %xmm0, %xmm1
14 ; SSE2-NEXT: movd %xmm1, (%rax)
17 ; AVX-LABEL: avg_v4i8:
19 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
20 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
21 ; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0
22 ; AVX-NEXT: vmovd %xmm0, (%rax)
24 %1 = load <4 x i8>, <4 x i8>* %a
25 %2 = load <4 x i8>, <4 x i8>* %b
26 %3 = zext <4 x i8> %1 to <4 x i32>
27 %4 = zext <4 x i8> %2 to <4 x i32>
28 %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
29 %6 = add nuw nsw <4 x i32> %5, %4
30 %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
31 %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, <4 x i8>* undef, align 4
  ret void
}
36 define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) nounwind {
37 ; SSE2-LABEL: avg_v8i8:
39 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
40 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
41 ; SSE2-NEXT: pavgb %xmm0, %xmm1
42 ; SSE2-NEXT: movq %xmm1, (%rax)
45 ; AVX-LABEL: avg_v8i8:
47 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
48 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
49 ; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0
50 ; AVX-NEXT: vmovq %xmm0, (%rax)
52 %1 = load <8 x i8>, <8 x i8>* %a
53 %2 = load <8 x i8>, <8 x i8>* %b
54 %3 = zext <8 x i8> %1 to <8 x i32>
55 %4 = zext <8 x i8> %2 to <8 x i32>
56 %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
57 %6 = add nuw nsw <8 x i32> %5, %4
58 %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
59 %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, <8 x i8>* undef, align 4
  ret void
}
64 define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind {
65 ; SSE2-LABEL: avg_v16i8:
67 ; SSE2-NEXT: movdqa (%rsi), %xmm0
68 ; SSE2-NEXT: pavgb (%rdi), %xmm0
69 ; SSE2-NEXT: movdqu %xmm0, (%rax)
72 ; AVX-LABEL: avg_v16i8:
74 ; AVX-NEXT: vmovdqa (%rsi), %xmm0
75 ; AVX-NEXT: vpavgb (%rdi), %xmm0, %xmm0
76 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
78 %1 = load <16 x i8>, <16 x i8>* %a
79 %2 = load <16 x i8>, <16 x i8>* %b
80 %3 = zext <16 x i8> %1 to <16 x i32>
81 %4 = zext <16 x i8> %2 to <16 x i32>
82 %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
83 %6 = add nuw nsw <16 x i32> %5, %4
84 %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
85 %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, <16 x i8>* undef, align 4
  ret void
}
90 define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
91 ; SSE2-LABEL: avg_v32i8:
93 ; SSE2-NEXT: movdqa (%rsi), %xmm0
94 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
95 ; SSE2-NEXT: pavgb (%rdi), %xmm0
96 ; SSE2-NEXT: pavgb 16(%rdi), %xmm1
97 ; SSE2-NEXT: movdqu %xmm1, (%rax)
98 ; SSE2-NEXT: movdqu %xmm0, (%rax)
101 ; AVX1-LABEL: avg_v32i8:
103 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
104 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
105 ; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0
106 ; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1
107 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
108 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
111 ; AVX2-LABEL: avg_v32i8:
113 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
114 ; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
115 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
116 ; AVX2-NEXT: vzeroupper
119 ; AVX512-LABEL: avg_v32i8:
121 ; AVX512-NEXT: vmovdqa (%rsi), %ymm0
122 ; AVX512-NEXT: vpavgb (%rdi), %ymm0, %ymm0
123 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
124 ; AVX512-NEXT: vzeroupper
126 %1 = load <32 x i8>, <32 x i8>* %a
127 %2 = load <32 x i8>, <32 x i8>* %b
128 %3 = zext <32 x i8> %1 to <32 x i32>
129 %4 = zext <32 x i8> %2 to <32 x i32>
130 %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
131 %6 = add nuw nsw <32 x i32> %5, %4
132 %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
133 %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, <32 x i8>* undef, align 4
  ret void
}
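; 48 x i8 is not a legal vector width, so the SSE2/AVX1/AVX2 runs expand the
; average through 32-bit elements and repack the result, while the AVX512F and
; AVX512BW runs select three 128-bit vpavgb operations.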
138 define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
139 ; SSE2-LABEL: avg_v48i8:
141 ; SSE2-NEXT: movdqa (%rdi), %xmm1
142 ; SSE2-NEXT: movdqa 16(%rdi), %xmm6
143 ; SSE2-NEXT: movdqa 32(%rdi), %xmm11
144 ; SSE2-NEXT: movdqa (%rsi), %xmm12
145 ; SSE2-NEXT: movdqa 16(%rsi), %xmm13
146 ; SSE2-NEXT: movdqa 32(%rsi), %xmm0
147 ; SSE2-NEXT: pxor %xmm7, %xmm7
148 ; SSE2-NEXT: movdqa %xmm1, %xmm4
149 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
150 ; SSE2-NEXT: movdqa %xmm4, %xmm2
151 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
152 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
153 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
154 ; SSE2-NEXT: movdqa %xmm1, %xmm10
155 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
156 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
157 ; SSE2-NEXT: movdqa %xmm6, %xmm5
158 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
159 ; SSE2-NEXT: movdqa %xmm5, %xmm15
160 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
161 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
162 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
163 ; SSE2-NEXT: movdqa %xmm6, %xmm14
164 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
165 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
166 ; SSE2-NEXT: movdqa %xmm12, %xmm3
167 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15]
168 ; SSE2-NEXT: movdqa %xmm3, %xmm8
169 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
170 ; SSE2-NEXT: paddd %xmm2, %xmm8
171 ; SSE2-NEXT: movdqa %xmm11, %xmm2
172 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
173 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
174 ; SSE2-NEXT: paddd %xmm4, %xmm3
175 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
176 ; SSE2-NEXT: movdqa %xmm12, %xmm9
177 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
178 ; SSE2-NEXT: paddd %xmm10, %xmm9
179 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
180 ; SSE2-NEXT: paddd %xmm1, %xmm12
181 ; SSE2-NEXT: movdqa %xmm13, %xmm4
182 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
183 ; SSE2-NEXT: movdqa %xmm4, %xmm10
184 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
185 ; SSE2-NEXT: paddd %xmm15, %xmm10
186 ; SSE2-NEXT: movdqa %xmm2, %xmm15
187 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
188 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
189 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
190 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
191 ; SSE2-NEXT: paddd %xmm5, %xmm4
192 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
193 ; SSE2-NEXT: movdqa %xmm13, %xmm1
194 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
195 ; SSE2-NEXT: paddd %xmm14, %xmm1
196 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3]
197 ; SSE2-NEXT: paddd %xmm6, %xmm13
198 ; SSE2-NEXT: movdqa %xmm0, %xmm6
199 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
200 ; SSE2-NEXT: movdqa %xmm6, %xmm14
201 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
202 ; SSE2-NEXT: paddd %xmm15, %xmm14
203 ; SSE2-NEXT: movdqa %xmm11, %xmm5
204 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
205 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
206 ; SSE2-NEXT: paddd %xmm2, %xmm6
207 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
208 ; SSE2-NEXT: movdqa %xmm0, %xmm2
209 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
210 ; SSE2-NEXT: paddd %xmm5, %xmm2
211 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
212 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
213 ; SSE2-NEXT: paddd %xmm11, %xmm0
214 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
215 ; SSE2-NEXT: psubd %xmm5, %xmm8
216 ; SSE2-NEXT: psubd %xmm5, %xmm3
217 ; SSE2-NEXT: psubd %xmm5, %xmm9
218 ; SSE2-NEXT: psubd %xmm5, %xmm12
219 ; SSE2-NEXT: psubd %xmm5, %xmm10
220 ; SSE2-NEXT: psubd %xmm5, %xmm4
221 ; SSE2-NEXT: psubd %xmm5, %xmm1
222 ; SSE2-NEXT: psubd %xmm5, %xmm13
223 ; SSE2-NEXT: psubd %xmm5, %xmm14
224 ; SSE2-NEXT: psubd %xmm5, %xmm6
225 ; SSE2-NEXT: psubd %xmm5, %xmm2
226 ; SSE2-NEXT: psubd %xmm5, %xmm0
227 ; SSE2-NEXT: psrld $1, %xmm3
228 ; SSE2-NEXT: psrld $1, %xmm8
229 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255]
230 ; SSE2-NEXT: pand %xmm7, %xmm8
231 ; SSE2-NEXT: pand %xmm7, %xmm3
232 ; SSE2-NEXT: packuswb %xmm8, %xmm3
233 ; SSE2-NEXT: psrld $1, %xmm12
234 ; SSE2-NEXT: psrld $1, %xmm9
235 ; SSE2-NEXT: pand %xmm7, %xmm9
236 ; SSE2-NEXT: pand %xmm7, %xmm12
237 ; SSE2-NEXT: packuswb %xmm9, %xmm12
238 ; SSE2-NEXT: packuswb %xmm3, %xmm12
239 ; SSE2-NEXT: psrld $1, %xmm4
240 ; SSE2-NEXT: psrld $1, %xmm10
241 ; SSE2-NEXT: pand %xmm7, %xmm10
242 ; SSE2-NEXT: pand %xmm7, %xmm4
243 ; SSE2-NEXT: packuswb %xmm10, %xmm4
244 ; SSE2-NEXT: psrld $1, %xmm13
245 ; SSE2-NEXT: psrld $1, %xmm1
246 ; SSE2-NEXT: pand %xmm7, %xmm1
247 ; SSE2-NEXT: pand %xmm7, %xmm13
248 ; SSE2-NEXT: packuswb %xmm1, %xmm13
249 ; SSE2-NEXT: packuswb %xmm4, %xmm13
250 ; SSE2-NEXT: psrld $1, %xmm6
251 ; SSE2-NEXT: psrld $1, %xmm14
252 ; SSE2-NEXT: pand %xmm7, %xmm14
253 ; SSE2-NEXT: pand %xmm7, %xmm6
254 ; SSE2-NEXT: packuswb %xmm14, %xmm6
255 ; SSE2-NEXT: psrld $1, %xmm0
256 ; SSE2-NEXT: psrld $1, %xmm2
257 ; SSE2-NEXT: pand %xmm7, %xmm2
258 ; SSE2-NEXT: pand %xmm7, %xmm0
259 ; SSE2-NEXT: packuswb %xmm2, %xmm0
260 ; SSE2-NEXT: packuswb %xmm6, %xmm0
261 ; SSE2-NEXT: movdqu %xmm0, (%rax)
262 ; SSE2-NEXT: movdqu %xmm13, (%rax)
263 ; SSE2-NEXT: movdqu %xmm12, (%rax)
266 ; AVX1-LABEL: avg_v48i8:
268 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
269 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4
270 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1
271 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
272 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
273 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1]
274 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
275 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
276 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
277 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
278 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
279 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
280 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1]
281 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
282 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
283 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
284 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
285 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
286 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
287 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
288 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,0,1]
289 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
290 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
291 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
292 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
293 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
294 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
295 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
296 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
297 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4
298 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3
299 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
300 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
301 ; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm12
302 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,0,1]
303 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
304 ; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm10
305 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3]
306 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
307 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm9
308 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
309 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm8
310 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
311 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
312 ; AVX1-NEXT: vpaddd %xmm3, %xmm15, %xmm15
313 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,0,1]
314 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
315 ; AVX1-NEXT: vpaddd %xmm7, %xmm11, %xmm7
316 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
317 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
318 ; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm14
319 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
320 ; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm13
321 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
322 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
323 ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
324 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,0,1]
325 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
326 ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
327 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,3]
328 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
329 ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
330 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
331 ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
332 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
333 ; AVX1-NEXT: vpsubd %xmm3, %xmm12, %xmm11
334 ; AVX1-NEXT: vpsubd %xmm3, %xmm10, %xmm10
335 ; AVX1-NEXT: vpsubd %xmm3, %xmm9, %xmm9
336 ; AVX1-NEXT: vpsubd %xmm3, %xmm8, %xmm8
337 ; AVX1-NEXT: vpsubd %xmm3, %xmm15, %xmm12
338 ; AVX1-NEXT: vpsubd %xmm3, %xmm7, %xmm7
339 ; AVX1-NEXT: vpsubd %xmm3, %xmm14, %xmm0
340 ; AVX1-NEXT: vpsubd %xmm3, %xmm13, %xmm2
341 ; AVX1-NEXT: vpsubd %xmm3, %xmm5, %xmm5
342 ; AVX1-NEXT: vpsubd %xmm3, %xmm6, %xmm6
343 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
344 ; AVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3
345 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
346 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
347 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
348 ; AVX1-NEXT: vpsrld $1, %xmm6, %xmm3
349 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm4
350 ; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
351 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
352 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
353 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
354 ; AVX1-NEXT: vpsrld $1, %xmm7, %xmm2
355 ; AVX1-NEXT: vpsrld $1, %xmm12, %xmm4
356 ; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
357 ; AVX1-NEXT: vpsrld $1, %xmm8, %xmm4
358 ; AVX1-NEXT: vpsrld $1, %xmm9, %xmm5
359 ; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
360 ; AVX1-NEXT: vpsrld $1, %xmm10, %xmm5
361 ; AVX1-NEXT: vpsrld $1, %xmm11, %xmm6
362 ; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
363 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
364 ; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
365 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
366 ; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
367 ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
368 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
369 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
370 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm2
371 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
372 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
373 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
374 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
375 ; AVX1-NEXT: vmovdqu %xmm4, (%rax)
378 ; AVX2-LABEL: avg_v48i8:
380 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
381 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm1
382 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
383 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
384 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
385 ; AVX2-NEXT: vpbroadcastq 24(%rdi), %xmm3
386 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
387 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
388 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
389 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
390 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
391 ; AVX2-NEXT: vmovdqa (%rsi), %xmm6
392 ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm7
393 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,0,1]
394 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
395 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
396 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
397 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
398 ; AVX2-NEXT: vpbroadcastq 24(%rsi), %xmm2
399 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
400 ; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
401 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
402 ; AVX2-NEXT: vpaddd %ymm3, %ymm4, %ymm3
403 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,0,1]
404 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
405 ; AVX2-NEXT: vpaddd %ymm4, %ymm5, %ymm4
406 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero
407 ; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
408 ; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
409 ; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1
410 ; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0
411 ; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2
412 ; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3
413 ; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4
414 ; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5
415 ; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
416 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
417 ; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
418 ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
419 ; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
420 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
421 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
422 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
423 ; AVX2-NEXT: vpackusdw %ymm6, %ymm0, %ymm0
424 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
425 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
426 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm2[2,3]
427 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
428 ; AVX2-NEXT: vpackusdw %ymm6, %ymm2, %ymm2
429 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm2
430 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3
431 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
432 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
433 ; AVX2-NEXT: vpackuswb %ymm0, %ymm3, %ymm0
434 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],ymm4[2,3]
435 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3
436 ; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2
437 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
438 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
439 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
440 ; AVX2-NEXT: vmovdqu %xmm1, (%rax)
441 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
442 ; AVX2-NEXT: vzeroupper
445 ; AVX512F-LABEL: avg_v48i8:
447 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
448 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
449 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
450 ; AVX512F-NEXT: vpavgb (%rsi), %xmm0, %xmm0
451 ; AVX512F-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
452 ; AVX512F-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2
453 ; AVX512F-NEXT: vmovdqu %xmm1, (%rax)
454 ; AVX512F-NEXT: vmovdqu %xmm0, (%rax)
455 ; AVX512F-NEXT: vmovdqu %xmm2, (%rax)
458 ; AVX512BW-LABEL: avg_v48i8:
460 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
461 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
462 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
463 ; AVX512BW-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
464 ; AVX512BW-NEXT: vpavgb (%rsi), %xmm0, %xmm0
465 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
466 ; AVX512BW-NEXT: vpavgb 32(%rsi), %xmm2, %xmm1
467 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
468 ; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
469 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, (%rax)
470 ; AVX512BW-NEXT: vzeroupper
471 ; AVX512BW-NEXT: retq
472 %1 = load <48 x i8>, <48 x i8>* %a
473 %2 = load <48 x i8>, <48 x i8>* %b
474 %3 = zext <48 x i8> %1 to <48 x i32>
475 %4 = zext <48 x i8> %2 to <48 x i32>
476 %5 = add nuw nsw <48 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
477 %6 = add nuw nsw <48 x i32> %5, %4
478 %7 = lshr <48 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
479 %8 = trunc <48 x i32> %7 to <48 x i8>
  store <48 x i8> %8, <48 x i8>* undef, align 4
  ret void
}
484 define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
485 ; SSE2-LABEL: avg_v64i8:
487 ; SSE2-NEXT: movdqa (%rsi), %xmm0
488 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
489 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
490 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
491 ; SSE2-NEXT: pavgb (%rdi), %xmm0
492 ; SSE2-NEXT: pavgb 16(%rdi), %xmm1
493 ; SSE2-NEXT: pavgb 32(%rdi), %xmm2
494 ; SSE2-NEXT: pavgb 48(%rdi), %xmm3
495 ; SSE2-NEXT: movdqu %xmm3, (%rax)
496 ; SSE2-NEXT: movdqu %xmm2, (%rax)
497 ; SSE2-NEXT: movdqu %xmm1, (%rax)
498 ; SSE2-NEXT: movdqu %xmm0, (%rax)
501 ; AVX1-LABEL: avg_v64i8:
503 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
504 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
505 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2
506 ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3
507 ; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0
508 ; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1
509 ; AVX1-NEXT: vpavgb 32(%rdi), %xmm2, %xmm2
510 ; AVX1-NEXT: vpavgb 48(%rdi), %xmm3, %xmm3
511 ; AVX1-NEXT: vmovdqu %xmm3, (%rax)
512 ; AVX1-NEXT: vmovdqu %xmm2, (%rax)
513 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
514 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
517 ; AVX2-LABEL: avg_v64i8:
519 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
520 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
521 ; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
522 ; AVX2-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
523 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
524 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
525 ; AVX2-NEXT: vzeroupper
528 ; AVX512F-LABEL: avg_v64i8:
530 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
531 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
532 ; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0
533 ; AVX512F-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
534 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
535 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
536 ; AVX512F-NEXT: vzeroupper
539 ; AVX512BW-LABEL: avg_v64i8:
541 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
542 ; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
543 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
544 ; AVX512BW-NEXT: vzeroupper
545 ; AVX512BW-NEXT: retq
546 %1 = load <64 x i8>, <64 x i8>* %a
547 %2 = load <64 x i8>, <64 x i8>* %b
548 %3 = zext <64 x i8> %1 to <64 x i32>
549 %4 = zext <64 x i8> %2 to <64 x i32>
550 %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
551 %6 = add nuw nsw <64 x i32> %5, %4
552 %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
553 %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}
558 define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) nounwind {
559 ; SSE2-LABEL: avg_v4i16:
561 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
562 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
563 ; SSE2-NEXT: pavgw %xmm0, %xmm1
564 ; SSE2-NEXT: movq %xmm1, (%rax)
567 ; AVX-LABEL: avg_v4i16:
569 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
570 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
571 ; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0
572 ; AVX-NEXT: vmovq %xmm0, (%rax)
574 %1 = load <4 x i16>, <4 x i16>* %a
575 %2 = load <4 x i16>, <4 x i16>* %b
576 %3 = zext <4 x i16> %1 to <4 x i32>
577 %4 = zext <4 x i16> %2 to <4 x i32>
578 %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
579 %6 = add nuw nsw <4 x i32> %5, %4
580 %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
581 %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, <4 x i16>* undef, align 4
  ret void
}
586 define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind {
587 ; SSE2-LABEL: avg_v8i16:
589 ; SSE2-NEXT: movdqa (%rsi), %xmm0
590 ; SSE2-NEXT: pavgw (%rdi), %xmm0
591 ; SSE2-NEXT: movdqu %xmm0, (%rax)
594 ; AVX-LABEL: avg_v8i16:
596 ; AVX-NEXT: vmovdqa (%rsi), %xmm0
597 ; AVX-NEXT: vpavgw (%rdi), %xmm0, %xmm0
598 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
600 %1 = load <8 x i16>, <8 x i16>* %a
601 %2 = load <8 x i16>, <8 x i16>* %b
602 %3 = zext <8 x i16> %1 to <8 x i32>
603 %4 = zext <8 x i16> %2 to <8 x i32>
604 %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
605 %6 = add nuw nsw <8 x i32> %5, %4
606 %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
607 %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, <8 x i16>* undef, align 4
  ret void
}
612 define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
613 ; SSE2-LABEL: avg_v16i16:
615 ; SSE2-NEXT: movdqa (%rsi), %xmm0
616 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
617 ; SSE2-NEXT: pavgw (%rdi), %xmm0
618 ; SSE2-NEXT: pavgw 16(%rdi), %xmm1
619 ; SSE2-NEXT: movdqu %xmm1, (%rax)
620 ; SSE2-NEXT: movdqu %xmm0, (%rax)
623 ; AVX1-LABEL: avg_v16i16:
625 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
626 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
627 ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0
628 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1
629 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
630 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
633 ; AVX2-LABEL: avg_v16i16:
635 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
636 ; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
637 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
638 ; AVX2-NEXT: vzeroupper
641 ; AVX512-LABEL: avg_v16i16:
643 ; AVX512-NEXT: vmovdqa (%rsi), %ymm0
644 ; AVX512-NEXT: vpavgw (%rdi), %ymm0, %ymm0
645 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
646 ; AVX512-NEXT: vzeroupper
648 %1 = load <16 x i16>, <16 x i16>* %a
649 %2 = load <16 x i16>, <16 x i16>* %b
650 %3 = zext <16 x i16> %1 to <16 x i32>
651 %4 = zext <16 x i16> %2 to <16 x i32>
652 %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
653 %6 = add nuw nsw <16 x i32> %5, %4
654 %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
655 %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, <16 x i16>* undef, align 4
  ret void
}
660 define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
661 ; SSE2-LABEL: avg_v32i16:
663 ; SSE2-NEXT: movdqa (%rsi), %xmm0
664 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
665 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
666 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
667 ; SSE2-NEXT: pavgw (%rdi), %xmm0
668 ; SSE2-NEXT: pavgw 16(%rdi), %xmm1
669 ; SSE2-NEXT: pavgw 32(%rdi), %xmm2
670 ; SSE2-NEXT: pavgw 48(%rdi), %xmm3
671 ; SSE2-NEXT: movdqu %xmm3, (%rax)
672 ; SSE2-NEXT: movdqu %xmm2, (%rax)
673 ; SSE2-NEXT: movdqu %xmm1, (%rax)
674 ; SSE2-NEXT: movdqu %xmm0, (%rax)
677 ; AVX1-LABEL: avg_v32i16:
679 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
680 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
681 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2
682 ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3
683 ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0
684 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1
685 ; AVX1-NEXT: vpavgw 32(%rdi), %xmm2, %xmm2
686 ; AVX1-NEXT: vpavgw 48(%rdi), %xmm3, %xmm3
687 ; AVX1-NEXT: vmovdqu %xmm3, (%rax)
688 ; AVX1-NEXT: vmovdqu %xmm2, (%rax)
689 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
690 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
693 ; AVX2-LABEL: avg_v32i16:
695 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
696 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
697 ; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
698 ; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
699 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
700 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
701 ; AVX2-NEXT: vzeroupper
704 ; AVX512F-LABEL: avg_v32i16:
706 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
707 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
708 ; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0
709 ; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
710 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
711 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
712 ; AVX512F-NEXT: vzeroupper
715 ; AVX512BW-LABEL: avg_v32i16:
717 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
718 ; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
719 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
720 ; AVX512BW-NEXT: vzeroupper
721 ; AVX512BW-NEXT: retq
722 %1 = load <32 x i16>, <32 x i16>* %a
723 %2 = load <32 x i16>, <32 x i16>* %b
724 %3 = zext <32 x i16> %1 to <32 x i32>
725 %4 = zext <32 x i16> %2 to <32 x i32>
726 %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
727 %6 = add nuw nsw <32 x i32> %5, %4
728 %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
729 %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, <32 x i16>* undef, align 4
  ret void
}
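; The _2 variants below add the two extended operands first and the rounding
; constant second; the same PAVGB/PAVGW selection is expected.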
734 define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) nounwind {
735 ; SSE2-LABEL: avg_v4i8_2:
737 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
738 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
739 ; SSE2-NEXT: pavgb %xmm0, %xmm1
740 ; SSE2-NEXT: movd %xmm1, (%rax)
743 ; AVX-LABEL: avg_v4i8_2:
745 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
746 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
747 ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
748 ; AVX-NEXT: vmovd %xmm0, (%rax)
750 %1 = load <4 x i8>, <4 x i8>* %a
751 %2 = load <4 x i8>, <4 x i8>* %b
752 %3 = zext <4 x i8> %1 to <4 x i32>
753 %4 = zext <4 x i8> %2 to <4 x i32>
754 %5 = add nuw nsw <4 x i32> %3, %4
755 %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
756 %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
757 %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, <4 x i8>* undef, align 4
  ret void
}
762 define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) nounwind {
763 ; SSE2-LABEL: avg_v8i8_2:
765 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
766 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
767 ; SSE2-NEXT: pavgb %xmm0, %xmm1
768 ; SSE2-NEXT: movq %xmm1, (%rax)
771 ; AVX-LABEL: avg_v8i8_2:
773 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
774 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
775 ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
776 ; AVX-NEXT: vmovq %xmm0, (%rax)
778 %1 = load <8 x i8>, <8 x i8>* %a
779 %2 = load <8 x i8>, <8 x i8>* %b
780 %3 = zext <8 x i8> %1 to <8 x i32>
781 %4 = zext <8 x i8> %2 to <8 x i32>
782 %5 = add nuw nsw <8 x i32> %3, %4
783 %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
784 %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
785 %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, <8 x i8>* undef, align 4
  ret void
}
790 define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) nounwind {
791 ; SSE2-LABEL: avg_v16i8_2:
793 ; SSE2-NEXT: movdqa (%rdi), %xmm0
794 ; SSE2-NEXT: pavgb (%rsi), %xmm0
795 ; SSE2-NEXT: movdqu %xmm0, (%rax)
798 ; AVX-LABEL: avg_v16i8_2:
800 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
801 ; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0
802 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
804 %1 = load <16 x i8>, <16 x i8>* %a
805 %2 = load <16 x i8>, <16 x i8>* %b
806 %3 = zext <16 x i8> %1 to <16 x i32>
807 %4 = zext <16 x i8> %2 to <16 x i32>
808 %5 = add nuw nsw <16 x i32> %3, %4
809 %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
810 %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
811 %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, <16 x i8>* undef, align 4
  ret void
}
816 define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {
817 ; SSE2-LABEL: avg_v32i8_2:
819 ; SSE2-NEXT: movdqa (%rdi), %xmm0
820 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
821 ; SSE2-NEXT: pavgb (%rsi), %xmm0
822 ; SSE2-NEXT: pavgb 16(%rsi), %xmm1
823 ; SSE2-NEXT: movdqu %xmm1, (%rax)
824 ; SSE2-NEXT: movdqu %xmm0, (%rax)
827 ; AVX1-LABEL: avg_v32i8_2:
829 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
830 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
831 ; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0
832 ; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
833 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
834 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
837 ; AVX2-LABEL: avg_v32i8_2:
839 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
840 ; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
841 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
842 ; AVX2-NEXT: vzeroupper
845 ; AVX512-LABEL: avg_v32i8_2:
847 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
848 ; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0
849 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
850 ; AVX512-NEXT: vzeroupper
852 %1 = load <32 x i8>, <32 x i8>* %a
853 %2 = load <32 x i8>, <32 x i8>* %b
854 %3 = zext <32 x i8> %1 to <32 x i32>
855 %4 = zext <32 x i8> %2 to <32 x i32>
856 %5 = add nuw nsw <32 x i32> %3, %4
857 %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
858 %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
859 %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, <32 x i8>* undef, align 4
  ret void
}
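; Note: avg_v64i8_2 adds %4 to itself, so only the %b operand feeds the average;
; the checks accordingly expect pavgb of the (%rsi) load with itself.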
864 define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
865 ; SSE2-LABEL: avg_v64i8_2:
867 ; SSE2-NEXT: movdqa (%rsi), %xmm0
868 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
869 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
870 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
871 ; SSE2-NEXT: pavgb %xmm0, %xmm0
872 ; SSE2-NEXT: pavgb %xmm1, %xmm1
873 ; SSE2-NEXT: pavgb %xmm2, %xmm2
874 ; SSE2-NEXT: pavgb %xmm3, %xmm3
875 ; SSE2-NEXT: movdqu %xmm3, (%rax)
876 ; SSE2-NEXT: movdqu %xmm2, (%rax)
877 ; SSE2-NEXT: movdqu %xmm1, (%rax)
878 ; SSE2-NEXT: movdqu %xmm0, (%rax)
881 ; AVX1-LABEL: avg_v64i8_2:
883 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
884 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
885 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2
886 ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3
887 ; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm0
888 ; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm1
889 ; AVX1-NEXT: vpavgb %xmm2, %xmm2, %xmm2
890 ; AVX1-NEXT: vpavgb %xmm3, %xmm3, %xmm3
891 ; AVX1-NEXT: vmovdqu %xmm3, (%rax)
892 ; AVX1-NEXT: vmovdqu %xmm2, (%rax)
893 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
894 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
897 ; AVX2-LABEL: avg_v64i8_2:
899 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
900 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
901 ; AVX2-NEXT: vpavgb %ymm0, %ymm0, %ymm0
902 ; AVX2-NEXT: vpavgb %ymm1, %ymm1, %ymm1
903 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
904 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
905 ; AVX2-NEXT: vzeroupper
908 ; AVX512F-LABEL: avg_v64i8_2:
910 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
911 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
912 ; AVX512F-NEXT: vpavgb %ymm0, %ymm0, %ymm0
913 ; AVX512F-NEXT: vpavgb %ymm1, %ymm1, %ymm1
914 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
915 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
916 ; AVX512F-NEXT: vzeroupper
919 ; AVX512BW-LABEL: avg_v64i8_2:
921 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
922 ; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
923 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
924 ; AVX512BW-NEXT: vzeroupper
925 ; AVX512BW-NEXT: retq
926 %1 = load <64 x i8>, <64 x i8>* %a
927 %2 = load <64 x i8>, <64 x i8>* %b
928 %3 = zext <64 x i8> %1 to <64 x i32>
929 %4 = zext <64 x i8> %2 to <64 x i32>
930 %5 = add nuw nsw <64 x i32> %4, %4
931 %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
932 %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
933 %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}
939 define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) nounwind {
940 ; SSE2-LABEL: avg_v4i16_2:
942 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
943 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
944 ; SSE2-NEXT: pavgw %xmm0, %xmm1
945 ; SSE2-NEXT: movq %xmm1, (%rax)
948 ; AVX-LABEL: avg_v4i16_2:
950 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
951 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
952 ; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0
953 ; AVX-NEXT: vmovq %xmm0, (%rax)
955 %1 = load <4 x i16>, <4 x i16>* %a
956 %2 = load <4 x i16>, <4 x i16>* %b
957 %3 = zext <4 x i16> %1 to <4 x i32>
958 %4 = zext <4 x i16> %2 to <4 x i32>
959 %5 = add nuw nsw <4 x i32> %3, %4
960 %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
961 %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
962 %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, <4 x i16>* undef, align 4
  ret void
}
967 define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) nounwind {
968 ; SSE2-LABEL: avg_v8i16_2:
970 ; SSE2-NEXT: movdqa (%rdi), %xmm0
971 ; SSE2-NEXT: pavgw (%rsi), %xmm0
972 ; SSE2-NEXT: movdqu %xmm0, (%rax)
975 ; AVX-LABEL: avg_v8i16_2:
977 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
978 ; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0
979 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
981 %1 = load <8 x i16>, <8 x i16>* %a
982 %2 = load <8 x i16>, <8 x i16>* %b
983 %3 = zext <8 x i16> %1 to <8 x i32>
984 %4 = zext <8 x i16> %2 to <8 x i32>
985 %5 = add nuw nsw <8 x i32> %3, %4
986 %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
987 %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
988 %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, <8 x i16>* undef, align 4
  ret void
}
993 define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {
994 ; SSE2-LABEL: avg_v16i16_2:
996 ; SSE2-NEXT: movdqa (%rdi), %xmm0
997 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
998 ; SSE2-NEXT: pavgw (%rsi), %xmm0
999 ; SSE2-NEXT: pavgw 16(%rsi), %xmm1
1000 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1001 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1004 ; AVX1-LABEL: avg_v16i16_2:
1006 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
1007 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
1008 ; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0
1009 ; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1
1010 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
1011 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1014 ; AVX2-LABEL: avg_v16i16_2:
1016 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1017 ; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
1018 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1019 ; AVX2-NEXT: vzeroupper
1022 ; AVX512-LABEL: avg_v16i16_2:
1024 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
1025 ; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0
1026 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
1027 ; AVX512-NEXT: vzeroupper
1029 %1 = load <16 x i16>, <16 x i16>* %a
1030 %2 = load <16 x i16>, <16 x i16>* %b
1031 %3 = zext <16 x i16> %1 to <16 x i32>
1032 %4 = zext <16 x i16> %2 to <16 x i32>
1033 %5 = add nuw nsw <16 x i32> %3, %4
1034 %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1035 %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1036 %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, <16 x i16>* undef, align 4
  ret void
}
1041 define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
1042 ; SSE2-LABEL: avg_v32i16_2:
1044 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1045 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
1046 ; SSE2-NEXT: movdqa 32(%rdi), %xmm2
1047 ; SSE2-NEXT: movdqa 48(%rdi), %xmm3
1048 ; SSE2-NEXT: pavgw (%rsi), %xmm0
1049 ; SSE2-NEXT: pavgw 16(%rsi), %xmm1
1050 ; SSE2-NEXT: pavgw 32(%rsi), %xmm2
1051 ; SSE2-NEXT: pavgw 48(%rsi), %xmm3
1052 ; SSE2-NEXT: movdqu %xmm3, (%rax)
1053 ; SSE2-NEXT: movdqu %xmm2, (%rax)
1054 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1055 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1058 ; AVX1-LABEL: avg_v32i16_2:
1060 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
1061 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
1062 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
1063 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
1064 ; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0
1065 ; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1
1066 ; AVX1-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2
1067 ; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm3
1068 ; AVX1-NEXT: vmovdqu %xmm3, (%rax)
1069 ; AVX1-NEXT: vmovdqu %xmm2, (%rax)
1070 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
1071 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1074 ; AVX2-LABEL: avg_v32i16_2:
1076 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1077 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
1078 ; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
1079 ; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
1080 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
1081 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1082 ; AVX2-NEXT: vzeroupper
1085 ; AVX512F-LABEL: avg_v32i16_2:
1087 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
1088 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
1089 ; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
1090 ; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
1091 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
1092 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
1093 ; AVX512F-NEXT: vzeroupper
1094 ; AVX512F-NEXT: retq
1096 ; AVX512BW-LABEL: avg_v32i16_2:
1097 ; AVX512BW: # %bb.0:
1098 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1099 ; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
1100 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
1101 ; AVX512BW-NEXT: vzeroupper
1102 ; AVX512BW-NEXT: retq
1103 %1 = load <32 x i16>, <32 x i16>* %a
1104 %2 = load <32 x i16>, <32 x i16>* %b
1105 %3 = zext <32 x i16> %1 to <32 x i32>
1106 %4 = zext <32 x i16> %2 to <32 x i32>
1107 %5 = add nuw nsw <32 x i32> %3, %4
1108 %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1109 %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1110 %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, <32 x i16>* undef, align 4
  ret void
}
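; The _const variants average each element against a constant vector, so the
; second PAVG operand comes from a constant-pool load or broadcast.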
1115 define void @avg_v4i8_const(<4 x i8>* %a) nounwind {
1116 ; SSE2-LABEL: avg_v4i8_const:
1118 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1119 ; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
1120 ; SSE2-NEXT: movd %xmm0, (%rax)
1123 ; AVX-LABEL: avg_v4i8_const:
1125 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1126 ; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
1127 ; AVX-NEXT: vmovd %xmm0, (%rax)
1129 %1 = load <4 x i8>, <4 x i8>* %a
1130 %2 = zext <4 x i8> %1 to <4 x i32>
1131 %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
1132 %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
1133 %5 = trunc <4 x i32> %4 to <4 x i8>
  store <4 x i8> %5, <4 x i8>* undef, align 4
  ret void
}
1138 define void @avg_v8i8_const(<8 x i8>* %a) nounwind {
1139 ; SSE2-LABEL: avg_v8i8_const:
1141 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1142 ; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
1143 ; SSE2-NEXT: movq %xmm0, (%rax)
1146 ; AVX-LABEL: avg_v8i8_const:
1148 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1149 ; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
1150 ; AVX-NEXT: vmovq %xmm0, (%rax)
1152 %1 = load <8 x i8>, <8 x i8>* %a
1153 %2 = zext <8 x i8> %1 to <8 x i32>
1154 %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1155 %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1156 %5 = trunc <8 x i32> %4 to <8 x i8>
  store <8 x i8> %5, <8 x i8>* undef, align 4
  ret void
}
1161 define void @avg_v16i8_const(<16 x i8>* %a) nounwind {
1162 ; SSE2-LABEL: avg_v16i8_const:
1164 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1165 ; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
1166 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1169 ; AVX-LABEL: avg_v16i8_const:
1171 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1172 ; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
1173 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
1175 %1 = load <16 x i8>, <16 x i8>* %a
1176 %2 = zext <16 x i8> %1 to <16 x i32>
1177 %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1178 %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1179 %5 = trunc <16 x i32> %4 to <16 x i8>
  store <16 x i8> %5, <16 x i8>* undef, align 4
  ret void
}
1184 define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
1185 ; SSE2-LABEL: avg_v32i8_const:
1187 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1188 ; SSE2-NEXT: movdqa (%rdi), %xmm1
1189 ; SSE2-NEXT: pavgb %xmm0, %xmm1
1190 ; SSE2-NEXT: pavgb 16(%rdi), %xmm0
1191 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1192 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1195 ; AVX1-LABEL: avg_v32i8_const:
1197 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [7.9499288951273625E-275,7.9499288951273625E-275]
1198 ; AVX1-NEXT: # xmm0 = mem[0,0]
1199 ; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1
1200 ; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm0
1201 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1202 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
1205 ; AVX2-LABEL: avg_v32i8_const:
1207 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1208 ; AVX2-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
1209 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1210 ; AVX2-NEXT: vzeroupper
1213 ; AVX512-LABEL: avg_v32i8_const:
1215 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
1216 ; AVX512-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
1217 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
1218 ; AVX512-NEXT: vzeroupper
1220 %1 = load <32 x i8>, <32 x i8>* %a
1221 %2 = zext <32 x i8> %1 to <32 x i32>
1222 %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1223 %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1224 %5 = trunc <32 x i32> %4 to <32 x i8>
  store <32 x i8> %5, <32 x i8>* undef, align 4
  ret void
}
1229 define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
1230 ; SSE2-LABEL: avg_v64i8_const:
1232 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1233 ; SSE2-NEXT: movdqa (%rdi), %xmm1
1234 ; SSE2-NEXT: pavgb %xmm0, %xmm1
1235 ; SSE2-NEXT: movdqa 16(%rdi), %xmm2
1236 ; SSE2-NEXT: pavgb %xmm0, %xmm2
1237 ; SSE2-NEXT: movdqa 32(%rdi), %xmm3
1238 ; SSE2-NEXT: pavgb %xmm0, %xmm3
1239 ; SSE2-NEXT: pavgb 48(%rdi), %xmm0
1240 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1241 ; SSE2-NEXT: movdqu %xmm3, (%rax)
1242 ; SSE2-NEXT: movdqu %xmm2, (%rax)
1243 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1246 ; AVX1-LABEL: avg_v64i8_const:
1248 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [7.9499288951273625E-275,7.9499288951273625E-275]
1249 ; AVX1-NEXT: # xmm0 = mem[0,0]
1250 ; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1
1251 ; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm2
1252 ; AVX1-NEXT: vpavgb 32(%rdi), %xmm0, %xmm3
1253 ; AVX1-NEXT: vpavgb 48(%rdi), %xmm0, %xmm0
1254 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1255 ; AVX1-NEXT: vmovdqu %xmm3, (%rax)
1256 ; AVX1-NEXT: vmovdqu %xmm2, (%rax)
1257 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
1260 ; AVX2-LABEL: avg_v64i8_const:
1262 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
1263 ; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm1
1264 ; AVX2-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
1265 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1266 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
1267 ; AVX2-NEXT: vzeroupper
1270 ; AVX512F-LABEL: avg_v64i8_const:
1272 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
1273 ; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm1
1274 ; AVX512F-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
1275 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
1276 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
1277 ; AVX512F-NEXT: vzeroupper
1278 ; AVX512F-NEXT: retq
1280 ; AVX512BW-LABEL: avg_v64i8_const:
1281 ; AVX512BW: # %bb.0:
1282 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1283 ; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0
1284 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
1285 ; AVX512BW-NEXT: vzeroupper
1286 ; AVX512BW-NEXT: retq
1287 %1 = load <64 x i8>, <64 x i8>* %a
1288 %2 = zext <64 x i8> %1 to <64 x i32>
1289 %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1290 %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1291 %5 = trunc <64 x i32> %4 to <64 x i8>
1292 store <64 x i8> %5, <64 x i8>* undef, align 4
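; The i16 *_const tests below exercise the same averaging pattern with 16-bit
; elements, so the expected instruction is pavgw/vpavgw rather than pavgb.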
1296 define void @avg_v4i16_const(<4 x i16>* %a) nounwind {
1297 ; SSE2-LABEL: avg_v4i16_const:
1299 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1300 ; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0
1301 ; SSE2-NEXT: movq %xmm0, (%rax)
1304 ; AVX-LABEL: avg_v4i16_const:
1306 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1307 ; AVX-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
1308 ; AVX-NEXT: vmovq %xmm0, (%rax)
1310 %1 = load <4 x i16>, <4 x i16>* %a
1311 %2 = zext <4 x i16> %1 to <4 x i32>
1312 %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
1313 %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
1314 %5 = trunc <4 x i32> %4 to <4 x i16>
1315 store <4 x i16> %5, <4 x i16>* undef, align 4
1319 define void @avg_v8i16_const(<8 x i16>* %a) nounwind {
1320 ; SSE2-LABEL: avg_v8i16_const:
1322 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1323 ; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0
1324 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1327 ; AVX-LABEL: avg_v8i16_const:
1329 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1330 ; AVX-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
1331 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
1333 %1 = load <8 x i16>, <8 x i16>* %a
1334 %2 = zext <8 x i16> %1 to <8 x i32>
1335 %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1336 %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1337 %5 = trunc <8 x i32> %4 to <8 x i16>
1338 store <8 x i16> %5, <8 x i16>* undef, align 4
1342 define void @avg_v16i16_const(<16 x i16>* %a) nounwind {
1343 ; SSE2-LABEL: avg_v16i16_const:
1345 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1346 ; SSE2-NEXT: movdqa (%rdi), %xmm1
1347 ; SSE2-NEXT: pavgw %xmm0, %xmm1
1348 ; SSE2-NEXT: pavgw 16(%rdi), %xmm0
1349 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1350 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1353 ; AVX1-LABEL: avg_v16i16_const:
1355 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1356 ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1
1357 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm0
1358 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1359 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
1362 ; AVX2-LABEL: avg_v16i16_const:
1364 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1365 ; AVX2-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
1366 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1367 ; AVX2-NEXT: vzeroupper
1370 ; AVX512-LABEL: avg_v16i16_const:
1372 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
1373 ; AVX512-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
1374 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
1375 ; AVX512-NEXT: vzeroupper
1377 %1 = load <16 x i16>, <16 x i16>* %a
1378 %2 = zext <16 x i16> %1 to <16 x i32>
1379 %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1380 %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1381 %5 = trunc <16 x i32> %4 to <16 x i16>
1382 store <16 x i16> %5, <16 x i16>* undef, align 4
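; avg_v32i16_const mirrors avg_v64i8_const for 16-bit elements: four xmm pavgw ops
; for SSE2/AVX1, two ymm vpavgw ops for AVX2/AVX512F, and a single zmm vpavgw for
; AVX512BW.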
1386 define void @avg_v32i16_const(<32 x i16>* %a) nounwind {
1387 ; SSE2-LABEL: avg_v32i16_const:
1389 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1390 ; SSE2-NEXT: movdqa (%rdi), %xmm1
1391 ; SSE2-NEXT: pavgw %xmm0, %xmm1
1392 ; SSE2-NEXT: movdqa 16(%rdi), %xmm2
1393 ; SSE2-NEXT: pavgw %xmm0, %xmm2
1394 ; SSE2-NEXT: movdqa 32(%rdi), %xmm3
1395 ; SSE2-NEXT: pavgw %xmm0, %xmm3
1396 ; SSE2-NEXT: pavgw 48(%rdi), %xmm0
1397 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1398 ; SSE2-NEXT: movdqu %xmm3, (%rax)
1399 ; SSE2-NEXT: movdqu %xmm2, (%rax)
1400 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1403 ; AVX1-LABEL: avg_v32i16_const:
1405 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1406 ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1
1407 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm2
1408 ; AVX1-NEXT: vpavgw 32(%rdi), %xmm0, %xmm3
1409 ; AVX1-NEXT: vpavgw 48(%rdi), %xmm0, %xmm0
1410 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1411 ; AVX1-NEXT: vmovdqu %xmm3, (%rax)
1412 ; AVX1-NEXT: vmovdqu %xmm2, (%rax)
1413 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
1416 ; AVX2-LABEL: avg_v32i16_const:
1418 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1419 ; AVX2-NEXT: # ymm0 = mem[0,1,0,1]
1420 ; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm1
1421 ; AVX2-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
1422 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1423 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
1424 ; AVX2-NEXT: vzeroupper
1427 ; AVX512F-LABEL: avg_v32i16_const:
1429 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1430 ; AVX512F-NEXT: # ymm0 = mem[0,1,0,1]
1431 ; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm1
1432 ; AVX512F-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
1433 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
1434 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
1435 ; AVX512F-NEXT: vzeroupper
1436 ; AVX512F-NEXT: retq
1438 ; AVX512BW-LABEL: avg_v32i16_const:
1439 ; AVX512BW: # %bb.0:
1440 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1441 ; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0
1442 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
1443 ; AVX512BW-NEXT: vzeroupper
1444 ; AVX512BW-NEXT: retq
1445 %1 = load <32 x i16>, <32 x i16>* %a
1446 %2 = zext <32 x i16> %1 to <32 x i32>
1447 %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1448 %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1449 %5 = trunc <32 x i32> %4 to <32 x i16>
1450 store <32 x i16> %5, <32 x i16>* undef, align 4
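; The *_3 variants pass the inputs as vector arguments and return the result instead
; of going through memory, and they only widen to i16 before the add and shift; the
; averaging combine should still select plain pavgb/pavgw on the argument registers.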
1454 define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind {
1455 ; SSE2-LABEL: avg_v16i8_3:
1457 ; SSE2-NEXT: pavgb %xmm1, %xmm0
1460 ; AVX-LABEL: avg_v16i8_3:
1462 ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
1464 %za = zext <16 x i8> %a to <16 x i16>
1465 %zb = zext <16 x i8> %b to <16 x i16>
1466 %add = add nuw nsw <16 x i16> %za, %zb
1467 %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1468 %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1469 %res = trunc <16 x i16> %lshr to <16 x i8>
1473 define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {
1474 ; SSE2-LABEL: avg_v32i8_3:
1476 ; SSE2-NEXT: pavgb %xmm2, %xmm0
1477 ; SSE2-NEXT: pavgb %xmm3, %xmm1
1480 ; AVX1-LABEL: avg_v32i8_3:
1482 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1483 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1484 ; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
1485 ; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0
1486 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1489 ; AVX2-LABEL: avg_v32i8_3:
1491 ; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0
1494 ; AVX512-LABEL: avg_v32i8_3:
1496 ; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
1498 %za = zext <32 x i8> %a to <32 x i16>
1499 %zb = zext <32 x i8> %b to <32 x i16>
1500 %add = add nuw nsw <32 x i16> %za, %zb
1501 %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1502 %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1503 %res = trunc <32 x i16> %lshr to <32 x i8>
1507 define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind {
1508 ; SSE2-LABEL: avg_v64i8_3:
1510 ; SSE2-NEXT: pavgb %xmm4, %xmm0
1511 ; SSE2-NEXT: pavgb %xmm5, %xmm1
1512 ; SSE2-NEXT: pavgb %xmm6, %xmm2
1513 ; SSE2-NEXT: pavgb %xmm7, %xmm3
1516 ; AVX1-LABEL: avg_v64i8_3:
1518 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
1519 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
1520 ; AVX1-NEXT: vpavgb %xmm4, %xmm5, %xmm4
1521 ; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0
1522 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1523 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
1524 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1525 ; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2
1526 ; AVX1-NEXT: vpavgb %xmm3, %xmm1, %xmm1
1527 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1530 ; AVX2-LABEL: avg_v64i8_3:
1532 ; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0
1533 ; AVX2-NEXT: vpavgb %ymm3, %ymm1, %ymm1
1536 ; AVX512F-LABEL: avg_v64i8_3:
1538 ; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
1539 ; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
1540 ; AVX512F-NEXT: retq
1542 ; AVX512BW-LABEL: avg_v64i8_3:
1543 ; AVX512BW: # %bb.0:
1544 ; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0
1545 ; AVX512BW-NEXT: retq
1546 %za = zext <64 x i8> %a to <64 x i16>
1547 %zb = zext <64 x i8> %b to <64 x i16>
1548 %add = add nuw nsw <64 x i16> %za, %zb
1549 %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1550 %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1551 %res = trunc <64 x i16> %lshr to <64 x i8>
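; avg_v512i8_3 uses the illegal <512 x i8> type to check that the combine survives
; type legalization: the excess operands are passed on the stack, so most pavgb
; instructions read their second operand from %rsp- or %rbp-relative slots, while
; AVX512BW still averages 64 bytes at a time in zmm registers.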
1555 define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind {
1556 ; SSE2-LABEL: avg_v512i8_3:
1558 ; SSE2-NEXT: movq %rdi, %rax
1559 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1560 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1561 ; SSE2-NEXT: movdqa %xmm8, 496(%rdi)
1562 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1563 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1564 ; SSE2-NEXT: movdqa %xmm8, 480(%rdi)
1565 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1566 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1567 ; SSE2-NEXT: movdqa %xmm8, 464(%rdi)
1568 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1569 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1570 ; SSE2-NEXT: movdqa %xmm8, 448(%rdi)
1571 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1572 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1573 ; SSE2-NEXT: movdqa %xmm8, 432(%rdi)
1574 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1575 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1576 ; SSE2-NEXT: movdqa %xmm8, 416(%rdi)
1577 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1578 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1579 ; SSE2-NEXT: movdqa %xmm8, 400(%rdi)
1580 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1581 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1582 ; SSE2-NEXT: movdqa %xmm8, 384(%rdi)
1583 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1584 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1585 ; SSE2-NEXT: movdqa %xmm8, 368(%rdi)
1586 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1587 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1588 ; SSE2-NEXT: movdqa %xmm8, 352(%rdi)
1589 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1590 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1591 ; SSE2-NEXT: movdqa %xmm8, 336(%rdi)
1592 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1593 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1594 ; SSE2-NEXT: movdqa %xmm8, 320(%rdi)
1595 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1596 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1597 ; SSE2-NEXT: movdqa %xmm8, 304(%rdi)
1598 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1599 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1600 ; SSE2-NEXT: movdqa %xmm8, 288(%rdi)
1601 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1602 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1603 ; SSE2-NEXT: movdqa %xmm8, 272(%rdi)
1604 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1605 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1606 ; SSE2-NEXT: movdqa %xmm8, 256(%rdi)
1607 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1608 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1609 ; SSE2-NEXT: movdqa %xmm8, 240(%rdi)
1610 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1611 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1612 ; SSE2-NEXT: movdqa %xmm8, 224(%rdi)
1613 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1614 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1615 ; SSE2-NEXT: movdqa %xmm8, 208(%rdi)
1616 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1617 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1618 ; SSE2-NEXT: movdqa %xmm8, 192(%rdi)
1619 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1620 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1621 ; SSE2-NEXT: movdqa %xmm8, 176(%rdi)
1622 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1623 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1624 ; SSE2-NEXT: movdqa %xmm8, 160(%rdi)
1625 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1626 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1627 ; SSE2-NEXT: movdqa %xmm8, 144(%rdi)
1628 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1629 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1630 ; SSE2-NEXT: movdqa %xmm8, 128(%rdi)
1631 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm7
1632 ; SSE2-NEXT: movdqa %xmm7, 112(%rdi)
1633 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm6
1634 ; SSE2-NEXT: movdqa %xmm6, 96(%rdi)
1635 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm5
1636 ; SSE2-NEXT: movdqa %xmm5, 80(%rdi)
1637 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm4
1638 ; SSE2-NEXT: movdqa %xmm4, 64(%rdi)
1639 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm3
1640 ; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
1641 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm2
1642 ; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
1643 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm1
1644 ; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
1645 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm0
1646 ; SSE2-NEXT: movdqa %xmm0, (%rdi)
1649 ; AVX1-LABEL: avg_v512i8_3:
1651 ; AVX1-NEXT: pushq %rbp
1652 ; AVX1-NEXT: movq %rsp, %rbp
1653 ; AVX1-NEXT: andq $-32, %rsp
1654 ; AVX1-NEXT: subq $32, %rsp
1655 ; AVX1-NEXT: movq %rdi, %rax
1656 ; AVX1-NEXT: vmovdqa 256(%rbp), %xmm8
1657 ; AVX1-NEXT: vpavgb 768(%rbp), %xmm8, %xmm8
1658 ; AVX1-NEXT: vmovdqa %xmm8, 496(%rdi)
1659 ; AVX1-NEXT: vmovdqa 240(%rbp), %xmm8
1660 ; AVX1-NEXT: vpavgb 752(%rbp), %xmm8, %xmm8
1661 ; AVX1-NEXT: vmovdqa %xmm8, 480(%rdi)
1662 ; AVX1-NEXT: vmovdqa 224(%rbp), %xmm8
1663 ; AVX1-NEXT: vpavgb 736(%rbp), %xmm8, %xmm8
1664 ; AVX1-NEXT: vmovdqa %xmm8, 464(%rdi)
1665 ; AVX1-NEXT: vmovdqa 208(%rbp), %xmm8
1666 ; AVX1-NEXT: vpavgb 720(%rbp), %xmm8, %xmm8
1667 ; AVX1-NEXT: vmovdqa %xmm8, 448(%rdi)
1668 ; AVX1-NEXT: vmovdqa 192(%rbp), %xmm8
1669 ; AVX1-NEXT: vpavgb 704(%rbp), %xmm8, %xmm8
1670 ; AVX1-NEXT: vmovdqa %xmm8, 432(%rdi)
1671 ; AVX1-NEXT: vmovdqa 176(%rbp), %xmm8
1672 ; AVX1-NEXT: vpavgb 688(%rbp), %xmm8, %xmm8
1673 ; AVX1-NEXT: vmovdqa %xmm8, 416(%rdi)
1674 ; AVX1-NEXT: vmovdqa 160(%rbp), %xmm8
1675 ; AVX1-NEXT: vpavgb 672(%rbp), %xmm8, %xmm8
1676 ; AVX1-NEXT: vmovdqa %xmm8, 400(%rdi)
1677 ; AVX1-NEXT: vmovdqa 144(%rbp), %xmm8
1678 ; AVX1-NEXT: vpavgb 656(%rbp), %xmm8, %xmm8
1679 ; AVX1-NEXT: vmovdqa %xmm8, 384(%rdi)
1680 ; AVX1-NEXT: vmovdqa 128(%rbp), %xmm8
1681 ; AVX1-NEXT: vpavgb 640(%rbp), %xmm8, %xmm8
1682 ; AVX1-NEXT: vmovdqa %xmm8, 368(%rdi)
1683 ; AVX1-NEXT: vmovdqa 112(%rbp), %xmm8
1684 ; AVX1-NEXT: vpavgb 624(%rbp), %xmm8, %xmm8
1685 ; AVX1-NEXT: vmovdqa %xmm8, 352(%rdi)
1686 ; AVX1-NEXT: vmovdqa 96(%rbp), %xmm8
1687 ; AVX1-NEXT: vpavgb 608(%rbp), %xmm8, %xmm8
1688 ; AVX1-NEXT: vmovdqa %xmm8, 336(%rdi)
1689 ; AVX1-NEXT: vmovdqa 80(%rbp), %xmm8
1690 ; AVX1-NEXT: vpavgb 592(%rbp), %xmm8, %xmm8
1691 ; AVX1-NEXT: vmovdqa %xmm8, 320(%rdi)
1692 ; AVX1-NEXT: vmovdqa 64(%rbp), %xmm8
1693 ; AVX1-NEXT: vpavgb 576(%rbp), %xmm8, %xmm8
1694 ; AVX1-NEXT: vmovdqa %xmm8, 304(%rdi)
1695 ; AVX1-NEXT: vmovdqa 48(%rbp), %xmm8
1696 ; AVX1-NEXT: vpavgb 560(%rbp), %xmm8, %xmm8
1697 ; AVX1-NEXT: vmovdqa %xmm8, 288(%rdi)
1698 ; AVX1-NEXT: vmovdqa 32(%rbp), %xmm8
1699 ; AVX1-NEXT: vpavgb 544(%rbp), %xmm8, %xmm8
1700 ; AVX1-NEXT: vmovdqa %xmm8, 272(%rdi)
1701 ; AVX1-NEXT: vmovdqa 16(%rbp), %xmm8
1702 ; AVX1-NEXT: vpavgb 528(%rbp), %xmm8, %xmm8
1703 ; AVX1-NEXT: vmovdqa %xmm8, 256(%rdi)
1704 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
1705 ; AVX1-NEXT: vpavgb 512(%rbp), %xmm8, %xmm8
1706 ; AVX1-NEXT: vmovdqa %xmm8, 240(%rdi)
1707 ; AVX1-NEXT: vpavgb 496(%rbp), %xmm7, %xmm7
1708 ; AVX1-NEXT: vmovdqa %xmm7, 224(%rdi)
1709 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
1710 ; AVX1-NEXT: vpavgb 480(%rbp), %xmm7, %xmm7
1711 ; AVX1-NEXT: vmovdqa %xmm7, 208(%rdi)
1712 ; AVX1-NEXT: vpavgb 464(%rbp), %xmm6, %xmm6
1713 ; AVX1-NEXT: vmovdqa %xmm6, 192(%rdi)
1714 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
1715 ; AVX1-NEXT: vpavgb 448(%rbp), %xmm6, %xmm6
1716 ; AVX1-NEXT: vmovdqa %xmm6, 176(%rdi)
1717 ; AVX1-NEXT: vpavgb 432(%rbp), %xmm5, %xmm5
1718 ; AVX1-NEXT: vmovdqa %xmm5, 160(%rdi)
1719 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
1720 ; AVX1-NEXT: vpavgb 416(%rbp), %xmm5, %xmm5
1721 ; AVX1-NEXT: vmovdqa %xmm5, 144(%rdi)
1722 ; AVX1-NEXT: vpavgb 400(%rbp), %xmm4, %xmm4
1723 ; AVX1-NEXT: vmovdqa %xmm4, 128(%rdi)
1724 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
1725 ; AVX1-NEXT: vpavgb 384(%rbp), %xmm4, %xmm4
1726 ; AVX1-NEXT: vmovdqa %xmm4, 112(%rdi)
1727 ; AVX1-NEXT: vpavgb 368(%rbp), %xmm3, %xmm3
1728 ; AVX1-NEXT: vmovdqa %xmm3, 96(%rdi)
1729 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1730 ; AVX1-NEXT: vpavgb 352(%rbp), %xmm3, %xmm3
1731 ; AVX1-NEXT: vmovdqa %xmm3, 80(%rdi)
1732 ; AVX1-NEXT: vpavgb 336(%rbp), %xmm2, %xmm2
1733 ; AVX1-NEXT: vmovdqa %xmm2, 64(%rdi)
1734 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1735 ; AVX1-NEXT: vpavgb 320(%rbp), %xmm2, %xmm2
1736 ; AVX1-NEXT: vmovdqa %xmm2, 48(%rdi)
1737 ; AVX1-NEXT: vpavgb 304(%rbp), %xmm1, %xmm1
1738 ; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi)
1739 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1740 ; AVX1-NEXT: vpavgb 288(%rbp), %xmm1, %xmm1
1741 ; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi)
1742 ; AVX1-NEXT: vpavgb 272(%rbp), %xmm0, %xmm0
1743 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
1744 ; AVX1-NEXT: movq %rbp, %rsp
1745 ; AVX1-NEXT: popq %rbp
1746 ; AVX1-NEXT: vzeroupper
1749 ; AVX2-LABEL: avg_v512i8_3:
1751 ; AVX2-NEXT: pushq %rbp
1752 ; AVX2-NEXT: movq %rsp, %rbp
1753 ; AVX2-NEXT: andq $-32, %rsp
1754 ; AVX2-NEXT: subq $32, %rsp
1755 ; AVX2-NEXT: movq %rdi, %rax
1756 ; AVX2-NEXT: vmovdqa 240(%rbp), %ymm8
1757 ; AVX2-NEXT: vmovdqa 208(%rbp), %ymm9
1758 ; AVX2-NEXT: vmovdqa 176(%rbp), %ymm10
1759 ; AVX2-NEXT: vmovdqa 144(%rbp), %ymm11
1760 ; AVX2-NEXT: vmovdqa 112(%rbp), %ymm12
1761 ; AVX2-NEXT: vmovdqa 80(%rbp), %ymm13
1762 ; AVX2-NEXT: vmovdqa 48(%rbp), %ymm14
1763 ; AVX2-NEXT: vmovdqa 16(%rbp), %ymm15
1764 ; AVX2-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0
1765 ; AVX2-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1
1766 ; AVX2-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2
1767 ; AVX2-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3
1768 ; AVX2-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4
1769 ; AVX2-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5
1770 ; AVX2-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6
1771 ; AVX2-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7
1772 ; AVX2-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15
1773 ; AVX2-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14
1774 ; AVX2-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13
1775 ; AVX2-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12
1776 ; AVX2-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11
1777 ; AVX2-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10
1778 ; AVX2-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9
1779 ; AVX2-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8
1780 ; AVX2-NEXT: vmovdqa %ymm8, 480(%rdi)
1781 ; AVX2-NEXT: vmovdqa %ymm9, 448(%rdi)
1782 ; AVX2-NEXT: vmovdqa %ymm10, 416(%rdi)
1783 ; AVX2-NEXT: vmovdqa %ymm11, 384(%rdi)
1784 ; AVX2-NEXT: vmovdqa %ymm12, 352(%rdi)
1785 ; AVX2-NEXT: vmovdqa %ymm13, 320(%rdi)
1786 ; AVX2-NEXT: vmovdqa %ymm14, 288(%rdi)
1787 ; AVX2-NEXT: vmovdqa %ymm15, 256(%rdi)
1788 ; AVX2-NEXT: vmovdqa %ymm7, 224(%rdi)
1789 ; AVX2-NEXT: vmovdqa %ymm6, 192(%rdi)
1790 ; AVX2-NEXT: vmovdqa %ymm5, 160(%rdi)
1791 ; AVX2-NEXT: vmovdqa %ymm4, 128(%rdi)
1792 ; AVX2-NEXT: vmovdqa %ymm3, 96(%rdi)
1793 ; AVX2-NEXT: vmovdqa %ymm2, 64(%rdi)
1794 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdi)
1795 ; AVX2-NEXT: vmovdqa %ymm0, (%rdi)
1796 ; AVX2-NEXT: movq %rbp, %rsp
1797 ; AVX2-NEXT: popq %rbp
1798 ; AVX2-NEXT: vzeroupper
1801 ; AVX512F-LABEL: avg_v512i8_3:
1803 ; AVX512F-NEXT: pushq %rbp
1804 ; AVX512F-NEXT: movq %rsp, %rbp
1805 ; AVX512F-NEXT: andq $-32, %rsp
1806 ; AVX512F-NEXT: subq $32, %rsp
1807 ; AVX512F-NEXT: movq %rdi, %rax
1808 ; AVX512F-NEXT: vmovdqa 240(%rbp), %ymm8
1809 ; AVX512F-NEXT: vmovdqa 208(%rbp), %ymm9
1810 ; AVX512F-NEXT: vmovdqa 176(%rbp), %ymm10
1811 ; AVX512F-NEXT: vmovdqa 144(%rbp), %ymm11
1812 ; AVX512F-NEXT: vmovdqa 112(%rbp), %ymm12
1813 ; AVX512F-NEXT: vmovdqa 80(%rbp), %ymm13
1814 ; AVX512F-NEXT: vmovdqa 48(%rbp), %ymm14
1815 ; AVX512F-NEXT: vmovdqa 16(%rbp), %ymm15
1816 ; AVX512F-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0
1817 ; AVX512F-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1
1818 ; AVX512F-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2
1819 ; AVX512F-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3
1820 ; AVX512F-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4
1821 ; AVX512F-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5
1822 ; AVX512F-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6
1823 ; AVX512F-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7
1824 ; AVX512F-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15
1825 ; AVX512F-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14
1826 ; AVX512F-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13
1827 ; AVX512F-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12
1828 ; AVX512F-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11
1829 ; AVX512F-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10
1830 ; AVX512F-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9
1831 ; AVX512F-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8
1832 ; AVX512F-NEXT: vmovdqa %ymm8, 480(%rdi)
1833 ; AVX512F-NEXT: vmovdqa %ymm9, 448(%rdi)
1834 ; AVX512F-NEXT: vmovdqa %ymm10, 416(%rdi)
1835 ; AVX512F-NEXT: vmovdqa %ymm11, 384(%rdi)
1836 ; AVX512F-NEXT: vmovdqa %ymm12, 352(%rdi)
1837 ; AVX512F-NEXT: vmovdqa %ymm13, 320(%rdi)
1838 ; AVX512F-NEXT: vmovdqa %ymm14, 288(%rdi)
1839 ; AVX512F-NEXT: vmovdqa %ymm15, 256(%rdi)
1840 ; AVX512F-NEXT: vmovdqa %ymm7, 224(%rdi)
1841 ; AVX512F-NEXT: vmovdqa %ymm6, 192(%rdi)
1842 ; AVX512F-NEXT: vmovdqa %ymm5, 160(%rdi)
1843 ; AVX512F-NEXT: vmovdqa %ymm4, 128(%rdi)
1844 ; AVX512F-NEXT: vmovdqa %ymm3, 96(%rdi)
1845 ; AVX512F-NEXT: vmovdqa %ymm2, 64(%rdi)
1846 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdi)
1847 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdi)
1848 ; AVX512F-NEXT: movq %rbp, %rsp
1849 ; AVX512F-NEXT: popq %rbp
1850 ; AVX512F-NEXT: vzeroupper
1851 ; AVX512F-NEXT: retq
1853 ; AVX512BW-LABEL: avg_v512i8_3:
1854 ; AVX512BW: # %bb.0:
1855 ; AVX512BW-NEXT: pushq %rbp
1856 ; AVX512BW-NEXT: movq %rsp, %rbp
1857 ; AVX512BW-NEXT: andq $-64, %rsp
1858 ; AVX512BW-NEXT: subq $64, %rsp
1859 ; AVX512BW-NEXT: movq %rdi, %rax
1860 ; AVX512BW-NEXT: vpavgb 16(%rbp), %zmm0, %zmm0
1861 ; AVX512BW-NEXT: vpavgb 80(%rbp), %zmm1, %zmm1
1862 ; AVX512BW-NEXT: vpavgb 144(%rbp), %zmm2, %zmm2
1863 ; AVX512BW-NEXT: vpavgb 208(%rbp), %zmm3, %zmm3
1864 ; AVX512BW-NEXT: vpavgb 272(%rbp), %zmm4, %zmm4
1865 ; AVX512BW-NEXT: vpavgb 336(%rbp), %zmm5, %zmm5
1866 ; AVX512BW-NEXT: vpavgb 400(%rbp), %zmm6, %zmm6
1867 ; AVX512BW-NEXT: vpavgb 464(%rbp), %zmm7, %zmm7
1868 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdi)
1869 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdi)
1870 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdi)
1871 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdi)
1872 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdi)
1873 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdi)
1874 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdi)
1875 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdi)
1876 ; AVX512BW-NEXT: movq %rbp, %rsp
1877 ; AVX512BW-NEXT: popq %rbp
1878 ; AVX512BW-NEXT: vzeroupper
1879 ; AVX512BW-NEXT: retq
1880 %za = zext <512 x i8> %a to <512 x i16>
1881 %zb = zext <512 x i8> %b to <512 x i16>
1882 %add = add nuw nsw <512 x i16> %za, %zb
1883 %add1 = add nuw nsw <512 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 
1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1884 %lshr = lshr <512 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, 
i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1885 %res = trunc <512 x i16> %lshr to <512 x i8>
1889 ; This is not an avg, but it's structurally similar and previously caused a crash
1890 ; because the constants can't be read with APInt::getZExtValue.
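; For reference, pavgb rounds up: it computes (x + y + 1) >> 1 in each lane. The
; scalarized code below instead builds (x + y - 1) >> 1 (note the 'leal -1(...)'
; sequences), so folding this pattern into pavgb would be incorrect.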
1891 define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind {
1892 ; SSE2-LABEL: not_avg_v16i8_wide_constants:
1894 ; SSE2-NEXT: pushq %rbp
1895 ; SSE2-NEXT: pushq %r15
1896 ; SSE2-NEXT: pushq %r14
1897 ; SSE2-NEXT: pushq %r13
1898 ; SSE2-NEXT: pushq %r12
1899 ; SSE2-NEXT: pushq %rbx
1900 ; SSE2-NEXT: movaps (%rdi), %xmm0
1901 ; SSE2-NEXT: movaps (%rsi), %xmm1
1902 ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1903 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1904 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1905 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
1906 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1907 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1908 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1909 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1910 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
1911 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
1912 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
1913 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
1914 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
1915 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
1916 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
1917 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1918 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
1919 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
1920 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1921 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
1922 ; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
1923 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
1924 ; SSE2-NEXT: leal -1(%rdx,%rsi), %edx
1925 ; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1926 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
1927 ; SSE2-NEXT: leal -1(%rbx,%rdx), %edx
1928 ; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1929 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
1930 ; SSE2-NEXT: leal -1(%rbp,%rdx), %edx
1931 ; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1932 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
1933 ; SSE2-NEXT: leal -1(%rdi,%rdx), %r8d
1934 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
1935 ; SSE2-NEXT: leal -1(%rax,%rdx), %edi
1936 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1937 ; SSE2-NEXT: leal -1(%rcx,%rax), %edx
1938 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1939 ; SSE2-NEXT: leal -1(%r9,%rax), %ecx
1940 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
1941 ; SSE2-NEXT: leal -1(%r10,%rsi), %eax
1942 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
1943 ; SSE2-NEXT: leaq -1(%r11,%rsi), %rsi
1944 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1945 ; SSE2-NEXT: leaq -1(%r12,%rbx), %r12
1946 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1947 ; SSE2-NEXT: leaq -1(%r15,%rbx), %r15
1948 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1949 ; SSE2-NEXT: leaq -1(%r14,%rbx), %r14
1950 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1951 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
1952 ; SSE2-NEXT: leaq -1(%rbp,%rbx), %r11
1953 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1954 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
1955 ; SSE2-NEXT: leaq -1(%rbp,%rbx), %r10
1956 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1957 ; SSE2-NEXT: leaq -1(%r13,%rbx), %r9
1958 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1959 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
1960 ; SSE2-NEXT: leaq -1(%r13,%rbx), %rbx
1961 ; SSE2-NEXT: shrl %eax
1962 ; SSE2-NEXT: movd %eax, %xmm8
1963 ; SSE2-NEXT: shrl %ecx
1964 ; SSE2-NEXT: movd %ecx, %xmm15
1965 ; SSE2-NEXT: shrl %edx
1966 ; SSE2-NEXT: movd %edx, %xmm9
1967 ; SSE2-NEXT: shrl %edi
1968 ; SSE2-NEXT: movd %edi, %xmm2
1969 ; SSE2-NEXT: shrl %r8d
1970 ; SSE2-NEXT: movd %r8d, %xmm10
1971 ; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1972 ; SSE2-NEXT: shrl %eax
1973 ; SSE2-NEXT: movd %eax, %xmm6
1974 ; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1975 ; SSE2-NEXT: shrl %eax
1976 ; SSE2-NEXT: movd %eax, %xmm11
1977 ; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1978 ; SSE2-NEXT: shrl %eax
1979 ; SSE2-NEXT: movd %eax, %xmm4
1980 ; SSE2-NEXT: shrq %rsi
1981 ; SSE2-NEXT: movd %esi, %xmm12
1982 ; SSE2-NEXT: shrq %r12
1983 ; SSE2-NEXT: movd %r12d, %xmm3
1984 ; SSE2-NEXT: shrq %r15
1985 ; SSE2-NEXT: movd %r15d, %xmm13
1986 ; SSE2-NEXT: shrq %r14
1987 ; SSE2-NEXT: movd %r14d, %xmm7
1988 ; SSE2-NEXT: shrq %r11
1989 ; SSE2-NEXT: movd %r11d, %xmm14
1990 ; SSE2-NEXT: shrq %r10
1991 ; SSE2-NEXT: movd %r10d, %xmm5
1992 ; SSE2-NEXT: shrq %r9
1993 ; SSE2-NEXT: movd %r9d, %xmm0
1994 ; SSE2-NEXT: shrq %rbx
1995 ; SSE2-NEXT: movd %ebx, %xmm1
1996 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
1997 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
1998 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
1999 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
2000 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
2001 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
2002 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
2003 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
2004 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
2005 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
2006 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
2007 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2008 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
2009 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
2010 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
2011 ; SSE2-NEXT: movdqu %xmm4, (%rax)
2012 ; SSE2-NEXT: popq %rbx
2013 ; SSE2-NEXT: popq %r12
2014 ; SSE2-NEXT: popq %r13
2015 ; SSE2-NEXT: popq %r14
2016 ; SSE2-NEXT: popq %r15
2017 ; SSE2-NEXT: popq %rbp
2020 ; AVX1-LABEL: not_avg_v16i8_wide_constants:
2022 ; AVX1-NEXT: pushq %rbp
2023 ; AVX1-NEXT: pushq %r15
2024 ; AVX1-NEXT: pushq %r14
2025 ; AVX1-NEXT: pushq %r13
2026 ; AVX1-NEXT: pushq %r12
2027 ; AVX1-NEXT: pushq %rbx
2028 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2029 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2030 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2031 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2032 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
2033 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2034 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2035 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
2036 ; AVX1-NEXT: vpextrq $1, %xmm7, %r15
2037 ; AVX1-NEXT: vmovq %xmm7, %r14
2038 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
2039 ; AVX1-NEXT: vpextrq $1, %xmm4, %r11
2040 ; AVX1-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2041 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2042 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
2043 ; AVX1-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2044 ; AVX1-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2045 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
2046 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
2047 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
2048 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
2049 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2050 ; AVX1-NEXT: vmovd %xmm6, %ecx
2051 ; AVX1-NEXT: vpextrd $1, %xmm6, %edx
2052 ; AVX1-NEXT: vpextrd $2, %xmm6, %r13d
2053 ; AVX1-NEXT: vpextrd $3, %xmm6, %r12d
2054 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
2055 ; AVX1-NEXT: vmovd %xmm1, %ebx
2056 ; AVX1-NEXT: vpextrd $1, %xmm1, %ebp
2057 ; AVX1-NEXT: vpextrd $2, %xmm1, %esi
2058 ; AVX1-NEXT: vpextrd $3, %xmm1, %edi
2059 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
2060 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
2061 ; AVX1-NEXT: vmovd %xmm7, %r8d
2062 ; AVX1-NEXT: leal -1(%r12,%rdi), %eax
2063 ; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2064 ; AVX1-NEXT: vpextrd $2, %xmm7, %eax
2065 ; AVX1-NEXT: leal -1(%r13,%rsi), %esi
2066 ; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2067 ; AVX1-NEXT: vpextrd $2, %xmm4, %edi
2068 ; AVX1-NEXT: leal -1(%rdx,%rbp), %edx
2069 ; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2070 ; AVX1-NEXT: vpextrd $3, %xmm4, %edx
2071 ; AVX1-NEXT: leal -1(%rcx,%rbx), %r10d
2072 ; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
2073 ; AVX1-NEXT: leal -1(%rdx,%rcx), %r9d
2074 ; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
2075 ; AVX1-NEXT: leal -1(%rdi,%rcx), %edi
2076 ; AVX1-NEXT: vpextrd $2, %xmm5, %ecx
2077 ; AVX1-NEXT: leal -1(%rax,%rcx), %eax
2078 ; AVX1-NEXT: vmovd %xmm5, %ecx
2079 ; AVX1-NEXT: leal -1(%r8,%rcx), %r8d
2080 ; AVX1-NEXT: vpextrq $1, %xmm6, %rdx
2081 ; AVX1-NEXT: leal -1(%r15,%rdx), %r15d
2082 ; AVX1-NEXT: vmovq %xmm6, %rdx
2083 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
2084 ; AVX1-NEXT: leal -1(%r14,%rdx), %r14d
2085 ; AVX1-NEXT: vpextrq $1, %xmm1, %rdx
2086 ; AVX1-NEXT: leal -1(%r11,%rdx), %edx
2087 ; AVX1-NEXT: vmovq %xmm1, %rcx
2088 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2089 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2090 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
2091 ; AVX1-NEXT: leal -1(%rsi,%rcx), %ecx
2092 ; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
2093 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
2094 ; AVX1-NEXT: leal -1(%rbp,%rsi), %esi
2095 ; AVX1-NEXT: vmovq %xmm1, %rbx
2096 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
2097 ; AVX1-NEXT: leal -1(%rbp,%rbx), %ebx
2098 ; AVX1-NEXT: vpextrq $1, %xmm8, %r11
2099 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2100 ; AVX1-NEXT: vpextrq $1, %xmm0, %r12
2101 ; AVX1-NEXT: leal -1(%r11,%r12), %r11d
2102 ; AVX1-NEXT: vmovq %xmm8, %r12
2103 ; AVX1-NEXT: vmovq %xmm0, %r13
2104 ; AVX1-NEXT: leal -1(%r12,%r13), %ebp
2105 ; AVX1-NEXT: shrl %ebp
2106 ; AVX1-NEXT: vmovd %ebp, %xmm0
2107 ; AVX1-NEXT: shrl %r11d
2108 ; AVX1-NEXT: vpinsrb $1, %r11d, %xmm0, %xmm0
2109 ; AVX1-NEXT: shrl %ebx
2110 ; AVX1-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
2111 ; AVX1-NEXT: shrl %esi
2112 ; AVX1-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
2113 ; AVX1-NEXT: shrl %ecx
2114 ; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
2115 ; AVX1-NEXT: shrl %edx
2116 ; AVX1-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
2117 ; AVX1-NEXT: shrl %r14d
2118 ; AVX1-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
2119 ; AVX1-NEXT: shrl %r15d
2120 ; AVX1-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0
2121 ; AVX1-NEXT: shrl %r8d
2122 ; AVX1-NEXT: vpinsrb $8, %r8d, %xmm0, %xmm0
2123 ; AVX1-NEXT: shrl %eax
2124 ; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
2125 ; AVX1-NEXT: shrl %edi
2126 ; AVX1-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
2127 ; AVX1-NEXT: shrl %r9d
2128 ; AVX1-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0
2129 ; AVX1-NEXT: shrl %r10d
2130 ; AVX1-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
2131 ; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2132 ; AVX1-NEXT: shrl %eax
2133 ; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
2134 ; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2135 ; AVX1-NEXT: shrl %eax
2136 ; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
2137 ; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2138 ; AVX1-NEXT: shrl %eax
2139 ; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
2140 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
2141 ; AVX1-NEXT: popq %rbx
2142 ; AVX1-NEXT: popq %r12
2143 ; AVX1-NEXT: popq %r13
2144 ; AVX1-NEXT: popq %r14
2145 ; AVX1-NEXT: popq %r15
2146 ; AVX1-NEXT: popq %rbp
2149 ; AVX2-LABEL: not_avg_v16i8_wide_constants:
2151 ; AVX2-NEXT: pushq %rbp
2152 ; AVX2-NEXT: pushq %r15
2153 ; AVX2-NEXT: pushq %r14
2154 ; AVX2-NEXT: pushq %r13
2155 ; AVX2-NEXT: pushq %r12
2156 ; AVX2-NEXT: pushq %rbx
2157 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
2158 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
2159 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2160 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2161 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
2162 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2163 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
2164 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
2165 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm4
2166 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2167 ; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm7
2168 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
2169 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2170 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2171 ; AVX2-NEXT: vpextrq $1, %xmm2, %r15
2172 ; AVX2-NEXT: vmovq %xmm2, %r14
2173 ; AVX2-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2174 ; AVX2-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2175 ; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm1
2176 ; AVX2-NEXT: vpextrq $1, %xmm1, %r13
2177 ; AVX2-NEXT: vmovq %xmm1, %r11
2178 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2179 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm11 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2180 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
2181 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2182 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2183 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2184 ; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm1
2185 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2186 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6
2187 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
2188 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2189 ; AVX2-NEXT: vmovd %xmm9, %r12d
2190 ; AVX2-NEXT: vpextrd $2, %xmm9, %r9d
2191 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
2192 ; AVX2-NEXT: vmovd %xmm7, %ecx
2193 ; AVX2-NEXT: vpextrd $2, %xmm7, %edi
2194 ; AVX2-NEXT: vmovd %xmm5, %ebx
2195 ; AVX2-NEXT: vpextrd $2, %xmm5, %esi
2196 ; AVX2-NEXT: vmovd %xmm4, %edx
2197 ; AVX2-NEXT: vpextrd $2, %xmm4, %ebp
2198 ; AVX2-NEXT: vpextrd $2, %xmm1, %eax
2199 ; AVX2-NEXT: leal -1(%rbp,%rax), %eax
2200 ; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2201 ; AVX2-NEXT: vmovd %xmm1, %eax
2202 ; AVX2-NEXT: leal -1(%rdx,%rax), %eax
2203 ; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2204 ; AVX2-NEXT: vpextrd $2, %xmm8, %eax
2205 ; AVX2-NEXT: leal -1(%rsi,%rax), %eax
2206 ; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2207 ; AVX2-NEXT: vmovd %xmm8, %eax
2208 ; AVX2-NEXT: leal -1(%rbx,%rax), %r10d
2209 ; AVX2-NEXT: vpextrd $2, %xmm6, %eax
2210 ; AVX2-NEXT: leal -1(%rdi,%rax), %r8d
2211 ; AVX2-NEXT: vmovd %xmm6, %eax
2212 ; AVX2-NEXT: leal -1(%rcx,%rax), %edi
2213 ; AVX2-NEXT: vpextrd $2, %xmm3, %eax
2214 ; AVX2-NEXT: leal -1(%r9,%rax), %r9d
2215 ; AVX2-NEXT: vmovd %xmm3, %ecx
2216 ; AVX2-NEXT: leal -1(%r12,%rcx), %r12d
2217 ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
2218 ; AVX2-NEXT: leal -1(%r15,%rcx), %r15d
2219 ; AVX2-NEXT: vmovq %xmm0, %rcx
2220 ; AVX2-NEXT: leal -1(%r14,%rcx), %r14d
2221 ; AVX2-NEXT: vpextrq $1, %xmm2, %rdx
2222 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2223 ; AVX2-NEXT: leal -1(%rax,%rdx), %edx
2224 ; AVX2-NEXT: vmovq %xmm2, %rax
2225 ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm0
2226 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
2227 ; AVX2-NEXT: leal -1(%rcx,%rax), %eax
2228 ; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
2229 ; AVX2-NEXT: leal -1(%r13,%rsi), %esi
2230 ; AVX2-NEXT: vmovq %xmm0, %rbx
2231 ; AVX2-NEXT: leal -1(%r11,%rbx), %ebx
2232 ; AVX2-NEXT: vpextrq $1, %xmm10, %rcx
2233 ; AVX2-NEXT: vpextrq $1, %xmm11, %r13
2234 ; AVX2-NEXT: leal -1(%rcx,%r13), %ecx
2235 ; AVX2-NEXT: vmovq %xmm10, %r13
2236 ; AVX2-NEXT: vmovq %xmm11, %r11
2237 ; AVX2-NEXT: leaq -1(%r13,%r11), %rbp
2238 ; AVX2-NEXT: shrq %rbp
2239 ; AVX2-NEXT: vmovd %ebp, %xmm0
2240 ; AVX2-NEXT: shrl %ecx
2241 ; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
2242 ; AVX2-NEXT: shrl %ebx
2243 ; AVX2-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
2244 ; AVX2-NEXT: shrl %esi
2245 ; AVX2-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
2246 ; AVX2-NEXT: shrl %eax
2247 ; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
2248 ; AVX2-NEXT: shrl %edx
2249 ; AVX2-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
2250 ; AVX2-NEXT: shrl %r14d
2251 ; AVX2-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
2252 ; AVX2-NEXT: shrl %r15d
2253 ; AVX2-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0
2254 ; AVX2-NEXT: shrl %r12d
2255 ; AVX2-NEXT: vpinsrb $8, %r12d, %xmm0, %xmm0
2256 ; AVX2-NEXT: shrl %r9d
2257 ; AVX2-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0
2258 ; AVX2-NEXT: shrl %edi
2259 ; AVX2-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
2260 ; AVX2-NEXT: shrl %r8d
2261 ; AVX2-NEXT: vpinsrb $11, %r8d, %xmm0, %xmm0
2262 ; AVX2-NEXT: shrl %r10d
2263 ; AVX2-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
2264 ; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2265 ; AVX2-NEXT: shrl %eax
2266 ; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
2267 ; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2268 ; AVX2-NEXT: shrl %eax
2269 ; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
2270 ; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2271 ; AVX2-NEXT: shrl %eax
2272 ; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
2273 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
2274 ; AVX2-NEXT: popq %rbx
2275 ; AVX2-NEXT: popq %r12
2276 ; AVX2-NEXT: popq %r13
2277 ; AVX2-NEXT: popq %r14
2278 ; AVX2-NEXT: popq %r15
2279 ; AVX2-NEXT: popq %rbp
2280 ; AVX2-NEXT: vzeroupper
2283 ; AVX512-LABEL: not_avg_v16i8_wide_constants:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: pushq %r15
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: pushq %r13
; AVX512-NEXT: pushq %r12
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm4
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm7
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm1
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpextrq $1, %xmm2, %r15
; AVX512-NEXT: vmovq %xmm2, %r14
; AVX512-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm1
; AVX512-NEXT: vpextrq $1, %xmm1, %r13
; AVX512-NEXT: vmovq %xmm1, %r11
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm11 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm1
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm6
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: vmovd %xmm9, %r12d
; AVX512-NEXT: vpextrd $2, %xmm9, %r9d
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0
; AVX512-NEXT: vmovd %xmm7, %ecx
; AVX512-NEXT: vpextrd $2, %xmm7, %edi
; AVX512-NEXT: vmovd %xmm5, %ebx
; AVX512-NEXT: vpextrd $2, %xmm5, %esi
; AVX512-NEXT: vmovd %xmm4, %edx
; AVX512-NEXT: vpextrd $2, %xmm4, %ebp
; AVX512-NEXT: vpextrd $2, %xmm1, %eax
; AVX512-NEXT: leal -1(%rbp,%rax), %eax
; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512-NEXT: vmovd %xmm1, %eax
; AVX512-NEXT: leal -1(%rdx,%rax), %eax
; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512-NEXT: vpextrd $2, %xmm8, %eax
; AVX512-NEXT: leal -1(%rsi,%rax), %eax
; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512-NEXT: vmovd %xmm8, %eax
; AVX512-NEXT: leal -1(%rbx,%rax), %r10d
; AVX512-NEXT: vpextrd $2, %xmm6, %eax
; AVX512-NEXT: leal -1(%rdi,%rax), %r8d
; AVX512-NEXT: vmovd %xmm6, %eax
; AVX512-NEXT: leal -1(%rcx,%rax), %edi
; AVX512-NEXT: vpextrd $2, %xmm3, %eax
; AVX512-NEXT: leal -1(%r9,%rax), %r9d
; AVX512-NEXT: vmovd %xmm3, %ecx
; AVX512-NEXT: leal -1(%r12,%rcx), %r12d
; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
; AVX512-NEXT: leal -1(%r15,%rcx), %r15d
; AVX512-NEXT: vmovq %xmm0, %rcx
; AVX512-NEXT: leal -1(%r14,%rcx), %r14d
; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: leal -1(%rax,%rdx), %edx
; AVX512-NEXT: vmovq %xmm2, %rax
; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm0
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX512-NEXT: leal -1(%rcx,%rax), %eax
; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
; AVX512-NEXT: leal -1(%r13,%rsi), %esi
; AVX512-NEXT: vmovq %xmm0, %rbx
; AVX512-NEXT: leal -1(%r11,%rbx), %ebx
; AVX512-NEXT: vpextrq $1, %xmm10, %rcx
; AVX512-NEXT: vpextrq $1, %xmm11, %r13
; AVX512-NEXT: leal -1(%rcx,%r13), %ecx
; AVX512-NEXT: vmovq %xmm10, %r13
; AVX512-NEXT: vmovq %xmm11, %r11
; AVX512-NEXT: leaq -1(%r13,%r11), %rbp
; AVX512-NEXT: shrq %rbp
; AVX512-NEXT: vmovd %ebp, %xmm0
; AVX512-NEXT: shrl %ecx
; AVX512-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; AVX512-NEXT: shrl %ebx
; AVX512-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
; AVX512-NEXT: shrl %esi
; AVX512-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
; AVX512-NEXT: shrl %eax
; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX512-NEXT: shrl %edx
; AVX512-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
; AVX512-NEXT: shrl %r14d
; AVX512-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
; AVX512-NEXT: shrl %r15d
; AVX512-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0
; AVX512-NEXT: shrl %r12d
; AVX512-NEXT: vpinsrb $8, %r12d, %xmm0, %xmm0
; AVX512-NEXT: shrl %r9d
; AVX512-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0
; AVX512-NEXT: shrl %edi
; AVX512-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; AVX512-NEXT: shrl %r8d
; AVX512-NEXT: vpinsrb $11, %r8d, %xmm0, %xmm0
; AVX512-NEXT: shrl %r10d
; AVX512-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; AVX512-NEXT: shrl %eax
; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; AVX512-NEXT: shrl %eax
; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; AVX512-NEXT: shrl %eax
; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512-NEXT: vmovdqu %xmm0, (%rax)
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
; AVX512-NEXT: popq %r13
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %r15
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a
%2 = load <16 x i8>, <16 x i8>* %b
%3 = zext <16 x i8> %1 to <16 x i128>
%4 = zext <16 x i8> %2 to <16 x i128>
%5 = add nuw nsw <16 x i128> %3, <i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1>
%6 = add nuw nsw <16 x i128> %5, %4
%7 = lshr <16 x i128> %6, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
%8 = trunc <16 x i128> %7 to <16 x i8>
store <16 x i8> %8, <16 x i8>* undef, align 4
ret void
}
; Make sure we don't fail on single element vectors.
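; For reference, the rounding-average computed by the single-element case below
; corresponds to the following scalar C (an illustrative sketch only, not code
; taken from this test):
;
;   unsigned char avg1(unsigned char x, unsigned char y) {
;     // Widen, add a +1 rounding bias, then shift right by one.
;     return (unsigned char)(((unsigned)x + (unsigned)y + 1) >> 1);
;   }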
define <1 x i8> @avg_v1i8(<1 x i8> %x, <1 x i8> %y) {
; SSE2-LABEL: avg_v1i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movzbl %dil, %eax
; SSE2-NEXT: movzbl %sil, %ecx
; SSE2-NEXT: leal 1(%rax,%rcx), %eax
; SSE2-NEXT: shrl %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v1i8:
; AVX: # %bb.0:
; AVX-NEXT: movzbl %dil, %eax
; AVX-NEXT: movzbl %sil, %ecx
; AVX-NEXT: leal 1(%rax,%rcx), %eax
; AVX-NEXT: shrl %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
%a = zext <1 x i8> %x to <1 x i16>
%b = zext <1 x i8> %y to <1 x i16>
%c = add <1 x i16> %a, %b
%d = add <1 x i16> %c, <i16 1>
%e = lshr <1 x i16> %d, <i16 1>
%f = trunc <1 x i16> %e to <1 x i8>
ret <1 x i8> %f
}
; _mm_avg_epu16( _mm_slli_epi16(a, 2), _mm_slli_epi16(b, 2))
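; A C reproducer along the lines of the expression above might look like this
; (a sketch assuming the SSE2 intrinsics from <emmintrin.h>; the function name
; is illustrative, not taken from the PR):
;
;   #include <emmintrin.h>
;
;   __m128i pr41316(__m128i a, __m128i b) {
;     // Shift both inputs left by 2, then take the unsigned rounding word average.
;     return _mm_avg_epu16(_mm_slli_epi16(a, 2), _mm_slli_epi16(b, 2));
;   }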
define <2 x i64> @PR41316(<2 x i64>, <2 x i64>) {
; SSE2-LABEL: PR41316:
; SSE2: # %bb.0:
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pavgw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: PR41316:
; AVX: # %bb.0:
; AVX-NEXT: vpsllw $2, %xmm0, %xmm0
; AVX-NEXT: vpsllw $2, %xmm1, %xmm1
; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%3 = bitcast <2 x i64> %0 to <8 x i16>
%4 = shl <8 x i16> %3, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
%5 = bitcast <2 x i64> %1 to <8 x i16>
%6 = shl <8 x i16> %5, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
%7 = zext <8 x i16> %6 to <8 x i32>
%8 = or <8 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%9 = zext <8 x i16> %8 to <8 x i32>
%10 = add nuw nsw <8 x i32> %9, %7
%11 = lshr <8 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%12 = trunc <8 x i32> %11 to <8 x i16>
%13 = bitcast <8 x i16> %12 to <2 x i64>