1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
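; The functions below exercise lowering of the unsigned-average idiom -- zero
; extend to i32, add the two operands plus a +1 rounding bias, lshr by 1, then
; truncate back -- which the backend matches to pavgb/pavgw (vpavgb/vpavgw)
; where the vector type allows it.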
8 define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) nounwind {
9 ; SSE2-LABEL: avg_v4i8:
11 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
12 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
13 ; SSE2-NEXT: pavgb %xmm0, %xmm1
14 ; SSE2-NEXT: movd %xmm1, (%rax)
17 ; AVX-LABEL: avg_v4i8:
19 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
20 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
21 ; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0
22 ; AVX-NEXT: vmovd %xmm0, (%rax)
24 %1 = load <4 x i8>, <4 x i8>* %a
25 %2 = load <4 x i8>, <4 x i8>* %b
26 %3 = zext <4 x i8> %1 to <4 x i32>
27 %4 = zext <4 x i8> %2 to <4 x i32>
28 %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
29 %6 = add nuw nsw <4 x i32> %5, %4
30 %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
31 %8 = trunc <4 x i32> %7 to <4 x i8>
32 store <4 x i8> %8, <4 x i8>* undef, align 4
33 ret void
34 }
36 define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) nounwind {
37 ; SSE2-LABEL: avg_v8i8:
39 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
40 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
41 ; SSE2-NEXT: pavgb %xmm0, %xmm1
42 ; SSE2-NEXT: movq %xmm1, (%rax)
45 ; AVX-LABEL: avg_v8i8:
47 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
48 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
49 ; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0
50 ; AVX-NEXT: vmovq %xmm0, (%rax)
52 %1 = load <8 x i8>, <8 x i8>* %a
53 %2 = load <8 x i8>, <8 x i8>* %b
54 %3 = zext <8 x i8> %1 to <8 x i32>
55 %4 = zext <8 x i8> %2 to <8 x i32>
56 %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
57 %6 = add nuw nsw <8 x i32> %5, %4
58 %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
59 %8 = trunc <8 x i32> %7 to <8 x i8>
60 store <8 x i8> %8, <8 x i8>* undef, align 4
61 ret void
62 }
64 define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind {
65 ; SSE2-LABEL: avg_v16i8:
67 ; SSE2-NEXT: movdqa (%rsi), %xmm0
68 ; SSE2-NEXT: pavgb (%rdi), %xmm0
69 ; SSE2-NEXT: movdqu %xmm0, (%rax)
72 ; AVX-LABEL: avg_v16i8:
74 ; AVX-NEXT: vmovdqa (%rsi), %xmm0
75 ; AVX-NEXT: vpavgb (%rdi), %xmm0, %xmm0
76 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
78 %1 = load <16 x i8>, <16 x i8>* %a
79 %2 = load <16 x i8>, <16 x i8>* %b
80 %3 = zext <16 x i8> %1 to <16 x i32>
81 %4 = zext <16 x i8> %2 to <16 x i32>
82 %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
83 %6 = add nuw nsw <16 x i32> %5, %4
84 %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
85 %8 = trunc <16 x i32> %7 to <16 x i8>
86 store <16 x i8> %8, <16 x i8>* undef, align 4
87 ret void
88 }
90 define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
91 ; SSE2-LABEL: avg_v32i8:
93 ; SSE2-NEXT: movdqa (%rsi), %xmm0
94 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
95 ; SSE2-NEXT: pavgb (%rdi), %xmm0
96 ; SSE2-NEXT: pavgb 16(%rdi), %xmm1
97 ; SSE2-NEXT: movdqu %xmm1, (%rax)
98 ; SSE2-NEXT: movdqu %xmm0, (%rax)
101 ; AVX1-LABEL: avg_v32i8:
103 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
104 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
105 ; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1
106 ; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0
107 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
108 ; AVX1-NEXT: vmovups %ymm0, (%rax)
109 ; AVX1-NEXT: vzeroupper
112 ; AVX2-LABEL: avg_v32i8:
114 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
115 ; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
116 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
117 ; AVX2-NEXT: vzeroupper
120 ; AVX512-LABEL: avg_v32i8:
122 ; AVX512-NEXT: vmovdqa (%rsi), %ymm0
123 ; AVX512-NEXT: vpavgb (%rdi), %ymm0, %ymm0
124 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
125 ; AVX512-NEXT: vzeroupper
127 %1 = load <32 x i8>, <32 x i8>* %a
128 %2 = load <32 x i8>, <32 x i8>* %b
129 %3 = zext <32 x i8> %1 to <32 x i32>
130 %4 = zext <32 x i8> %2 to <32 x i32>
131 %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
132 %6 = add nuw nsw <32 x i32> %5, %4
133 %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
134 %8 = trunc <32 x i32> %7 to <32 x i8>
135 store <32 x i8> %8, <32 x i8>* undef, align 4
136 ret void
137 }
139 define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
140 ; SSE2-LABEL: avg_v48i8:
142 ; SSE2-NEXT: movdqa (%rdi), %xmm1
143 ; SSE2-NEXT: movdqa 16(%rdi), %xmm6
144 ; SSE2-NEXT: movdqa 32(%rdi), %xmm11
145 ; SSE2-NEXT: movdqa (%rsi), %xmm12
146 ; SSE2-NEXT: movdqa 16(%rsi), %xmm13
147 ; SSE2-NEXT: movdqa 32(%rsi), %xmm0
148 ; SSE2-NEXT: pxor %xmm7, %xmm7
149 ; SSE2-NEXT: movdqa %xmm1, %xmm4
150 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
151 ; SSE2-NEXT: movdqa %xmm4, %xmm2
152 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
153 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
154 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
155 ; SSE2-NEXT: movdqa %xmm1, %xmm10
156 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
157 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
158 ; SSE2-NEXT: movdqa %xmm6, %xmm5
159 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
160 ; SSE2-NEXT: movdqa %xmm5, %xmm15
161 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
162 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
163 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
164 ; SSE2-NEXT: movdqa %xmm6, %xmm14
165 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
166 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
167 ; SSE2-NEXT: movdqa %xmm12, %xmm3
168 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15]
169 ; SSE2-NEXT: movdqa %xmm3, %xmm8
170 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
171 ; SSE2-NEXT: paddd %xmm2, %xmm8
172 ; SSE2-NEXT: movdqa %xmm11, %xmm2
173 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
174 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
175 ; SSE2-NEXT: paddd %xmm4, %xmm3
176 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
177 ; SSE2-NEXT: movdqa %xmm12, %xmm9
178 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
179 ; SSE2-NEXT: paddd %xmm10, %xmm9
180 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
181 ; SSE2-NEXT: paddd %xmm1, %xmm12
182 ; SSE2-NEXT: movdqa %xmm13, %xmm4
183 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
184 ; SSE2-NEXT: movdqa %xmm4, %xmm10
185 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
186 ; SSE2-NEXT: paddd %xmm15, %xmm10
187 ; SSE2-NEXT: movdqa %xmm2, %xmm15
188 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
189 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
190 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
191 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
192 ; SSE2-NEXT: paddd %xmm5, %xmm4
193 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
194 ; SSE2-NEXT: movdqa %xmm13, %xmm1
195 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
196 ; SSE2-NEXT: paddd %xmm14, %xmm1
197 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3]
198 ; SSE2-NEXT: paddd %xmm6, %xmm13
199 ; SSE2-NEXT: movdqa %xmm0, %xmm6
200 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
201 ; SSE2-NEXT: movdqa %xmm6, %xmm14
202 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
203 ; SSE2-NEXT: paddd %xmm15, %xmm14
204 ; SSE2-NEXT: movdqa %xmm11, %xmm5
205 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
206 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
207 ; SSE2-NEXT: paddd %xmm2, %xmm6
208 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
209 ; SSE2-NEXT: movdqa %xmm0, %xmm2
210 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
211 ; SSE2-NEXT: paddd %xmm5, %xmm2
212 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
213 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
214 ; SSE2-NEXT: paddd %xmm11, %xmm0
215 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
216 ; SSE2-NEXT: psubd %xmm5, %xmm8
217 ; SSE2-NEXT: psubd %xmm5, %xmm3
218 ; SSE2-NEXT: psubd %xmm5, %xmm9
219 ; SSE2-NEXT: psubd %xmm5, %xmm12
220 ; SSE2-NEXT: psubd %xmm5, %xmm10
221 ; SSE2-NEXT: psubd %xmm5, %xmm4
222 ; SSE2-NEXT: psubd %xmm5, %xmm1
223 ; SSE2-NEXT: psubd %xmm5, %xmm13
224 ; SSE2-NEXT: psubd %xmm5, %xmm14
225 ; SSE2-NEXT: psubd %xmm5, %xmm6
226 ; SSE2-NEXT: psubd %xmm5, %xmm2
227 ; SSE2-NEXT: psubd %xmm5, %xmm0
228 ; SSE2-NEXT: psrld $1, %xmm3
229 ; SSE2-NEXT: psrld $1, %xmm8
230 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255]
231 ; SSE2-NEXT: pand %xmm7, %xmm8
232 ; SSE2-NEXT: pand %xmm7, %xmm3
233 ; SSE2-NEXT: packuswb %xmm8, %xmm3
234 ; SSE2-NEXT: psrld $1, %xmm12
235 ; SSE2-NEXT: psrld $1, %xmm9
236 ; SSE2-NEXT: pand %xmm7, %xmm9
237 ; SSE2-NEXT: pand %xmm7, %xmm12
238 ; SSE2-NEXT: packuswb %xmm9, %xmm12
239 ; SSE2-NEXT: packuswb %xmm3, %xmm12
240 ; SSE2-NEXT: psrld $1, %xmm4
241 ; SSE2-NEXT: psrld $1, %xmm10
242 ; SSE2-NEXT: pand %xmm7, %xmm10
243 ; SSE2-NEXT: pand %xmm7, %xmm4
244 ; SSE2-NEXT: packuswb %xmm10, %xmm4
245 ; SSE2-NEXT: psrld $1, %xmm13
246 ; SSE2-NEXT: psrld $1, %xmm1
247 ; SSE2-NEXT: pand %xmm7, %xmm1
248 ; SSE2-NEXT: pand %xmm7, %xmm13
249 ; SSE2-NEXT: packuswb %xmm1, %xmm13
250 ; SSE2-NEXT: packuswb %xmm4, %xmm13
251 ; SSE2-NEXT: psrld $1, %xmm6
252 ; SSE2-NEXT: psrld $1, %xmm14
253 ; SSE2-NEXT: pand %xmm7, %xmm14
254 ; SSE2-NEXT: pand %xmm7, %xmm6
255 ; SSE2-NEXT: packuswb %xmm14, %xmm6
256 ; SSE2-NEXT: psrld $1, %xmm0
257 ; SSE2-NEXT: psrld $1, %xmm2
258 ; SSE2-NEXT: pand %xmm7, %xmm2
259 ; SSE2-NEXT: pand %xmm7, %xmm0
260 ; SSE2-NEXT: packuswb %xmm2, %xmm0
261 ; SSE2-NEXT: packuswb %xmm6, %xmm0
262 ; SSE2-NEXT: movdqu %xmm0, (%rax)
263 ; SSE2-NEXT: movdqu %xmm13, (%rax)
264 ; SSE2-NEXT: movdqu %xmm12, (%rax)
267 ; AVX1-LABEL: avg_v48i8:
269 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
270 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
271 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm4
272 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
273 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
274 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1]
275 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
276 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
277 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
278 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
279 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
280 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
281 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1]
282 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
283 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
284 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
285 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
286 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
287 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
288 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
289 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,0,1]
290 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
291 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
292 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
293 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
294 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
295 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
296 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
297 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
298 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4
299 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3
300 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
301 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
302 ; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm13
303 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,3,0,1]
304 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
305 ; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm11
306 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,3]
307 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
308 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm9
309 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
310 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm8
311 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
312 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
313 ; AVX1-NEXT: vpaddd %xmm4, %xmm15, %xmm15
314 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,0,1]
315 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
316 ; AVX1-NEXT: vpaddd %xmm7, %xmm10, %xmm7
317 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
318 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
319 ; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm14
320 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
321 ; AVX1-NEXT: vpaddd %xmm0, %xmm12, %xmm12
322 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
323 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
324 ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
325 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,3,0,1]
326 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
327 ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
328 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,2,3]
329 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
330 ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
331 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
332 ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
333 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
334 ; AVX1-NEXT: vpsubd %xmm4, %xmm13, %xmm10
335 ; AVX1-NEXT: vpsubd %xmm4, %xmm11, %xmm11
336 ; AVX1-NEXT: vpsubd %xmm4, %xmm9, %xmm9
337 ; AVX1-NEXT: vpsubd %xmm4, %xmm8, %xmm8
338 ; AVX1-NEXT: vpsubd %xmm4, %xmm15, %xmm13
339 ; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm7
340 ; AVX1-NEXT: vpsubd %xmm4, %xmm14, %xmm0
341 ; AVX1-NEXT: vpsubd %xmm4, %xmm12, %xmm2
342 ; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm5
343 ; AVX1-NEXT: vpsubd %xmm4, %xmm6, %xmm6
344 ; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1
345 ; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3
346 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
347 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
348 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
349 ; AVX1-NEXT: vpsrld $1, %xmm6, %xmm3
350 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm4
351 ; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
352 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
353 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
354 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
355 ; AVX1-NEXT: vpsrld $1, %xmm7, %xmm2
356 ; AVX1-NEXT: vpsrld $1, %xmm13, %xmm4
357 ; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
358 ; AVX1-NEXT: vpsrld $1, %xmm8, %xmm4
359 ; AVX1-NEXT: vpsrld $1, %xmm9, %xmm5
360 ; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
361 ; AVX1-NEXT: vpsrld $1, %xmm11, %xmm5
362 ; AVX1-NEXT: vpsrld $1, %xmm10, %xmm6
363 ; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
364 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
365 ; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
366 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
367 ; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
368 ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
369 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
370 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
371 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
372 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm2
373 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
374 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
375 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
376 ; AVX1-NEXT: vmovups %ymm0, (%rax)
377 ; AVX1-NEXT: vzeroupper
380 ; AVX2-LABEL: avg_v48i8:
382 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
383 ; AVX2-NEXT: vpbroadcastq 24(%rdi), %ymm1
384 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
385 ; AVX2-NEXT: vmovdqa (%rdi), %xmm2
386 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3
387 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
388 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
389 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
390 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
391 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
392 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
393 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
394 ; AVX2-NEXT: vpaddd %ymm6, %ymm0, %ymm0
395 ; AVX2-NEXT: vpbroadcastq 24(%rsi), %ymm6
396 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
397 ; AVX2-NEXT: vpaddd %ymm6, %ymm1, %ymm1
398 ; AVX2-NEXT: vmovdqa (%rsi), %xmm6
399 ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm7
400 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
401 ; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
402 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
403 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
404 ; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
405 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero
406 ; AVX2-NEXT: vpaddd %ymm6, %ymm5, %ymm5
407 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[2,3,0,1]
408 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
409 ; AVX2-NEXT: vpaddd %ymm6, %ymm3, %ymm3
410 ; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
411 ; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm7
412 ; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1
413 ; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4
414 ; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2
415 ; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5
416 ; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm0
417 ; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
418 ; AVX2-NEXT: vpsrld $1, %ymm5, %ymm3
419 ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
420 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
421 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
422 ; AVX2-NEXT: vpsrld $1, %ymm7, %ymm5
423 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
424 ; AVX2-NEXT: vpackusdw %xmm6, %xmm5, %xmm5
425 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
426 ; AVX2-NEXT: vpand %xmm6, %xmm5, %xmm5
427 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
428 ; AVX2-NEXT: vpackusdw %xmm7, %xmm1, %xmm1
429 ; AVX2-NEXT: vpand %xmm6, %xmm1, %xmm1
430 ; AVX2-NEXT: vpackuswb %xmm1, %xmm5, %xmm1
431 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
432 ; AVX2-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
433 ; AVX2-NEXT: vpand %xmm6, %xmm4, %xmm4
434 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
435 ; AVX2-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
436 ; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2
437 ; AVX2-NEXT: vpackuswb %xmm2, %xmm4, %xmm2
438 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
439 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2
440 ; AVX2-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
441 ; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2
442 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
443 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
444 ; AVX2-NEXT: vpand %xmm6, %xmm0, %xmm0
445 ; AVX2-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
446 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
447 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
448 ; AVX2-NEXT: vzeroupper
451 ; AVX512F-LABEL: avg_v48i8:
453 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
454 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
455 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
456 ; AVX512F-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
457 ; AVX512F-NEXT: vpavgb (%rsi), %xmm0, %xmm0
458 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
459 ; AVX512F-NEXT: vpavgb 32(%rsi), %xmm2, %xmm1
460 ; AVX512F-NEXT: vmovdqu %xmm1, (%rax)
461 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
462 ; AVX512F-NEXT: vzeroupper
465 ; AVX512BW-LABEL: avg_v48i8:
467 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
468 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
469 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
470 ; AVX512BW-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
471 ; AVX512BW-NEXT: vpavgb (%rsi), %xmm0, %xmm0
472 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
473 ; AVX512BW-NEXT: vpavgb 32(%rsi), %xmm2, %xmm1
474 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
475 ; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
476 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, (%rax)
477 ; AVX512BW-NEXT: vzeroupper
478 ; AVX512BW-NEXT: retq
479 %1 = load <48 x i8>, <48 x i8>* %a
480 %2 = load <48 x i8>, <48 x i8>* %b
481 %3 = zext <48 x i8> %1 to <48 x i32>
482 %4 = zext <48 x i8> %2 to <48 x i32>
483 %5 = add nuw nsw <48 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
484 %6 = add nuw nsw <48 x i32> %5, %4
485 %7 = lshr <48 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
486 %8 = trunc <48 x i32> %7 to <48 x i8>
487 store <48 x i8> %8, <48 x i8>* undef, align 4
488 ret void
489 }
491 define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
492 ; SSE2-LABEL: avg_v64i8:
494 ; SSE2-NEXT: movdqa (%rsi), %xmm0
495 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
496 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
497 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
498 ; SSE2-NEXT: pavgb (%rdi), %xmm0
499 ; SSE2-NEXT: pavgb 16(%rdi), %xmm1
500 ; SSE2-NEXT: pavgb 32(%rdi), %xmm2
501 ; SSE2-NEXT: pavgb 48(%rdi), %xmm3
502 ; SSE2-NEXT: movdqu %xmm3, (%rax)
503 ; SSE2-NEXT: movdqu %xmm2, (%rax)
504 ; SSE2-NEXT: movdqu %xmm1, (%rax)
505 ; SSE2-NEXT: movdqu %xmm0, (%rax)
508 ; AVX1-LABEL: avg_v64i8:
510 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
511 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
512 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2
513 ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3
514 ; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1
515 ; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0
516 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
517 ; AVX1-NEXT: vpavgb 48(%rdi), %xmm3, %xmm1
518 ; AVX1-NEXT: vpavgb 32(%rdi), %xmm2, %xmm2
519 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
520 ; AVX1-NEXT: vmovups %ymm1, (%rax)
521 ; AVX1-NEXT: vmovups %ymm0, (%rax)
522 ; AVX1-NEXT: vzeroupper
525 ; AVX2-LABEL: avg_v64i8:
527 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
528 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
529 ; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
530 ; AVX2-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
531 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
532 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
533 ; AVX2-NEXT: vzeroupper
536 ; AVX512F-LABEL: avg_v64i8:
538 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
539 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
540 ; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0
541 ; AVX512F-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
542 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
543 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
544 ; AVX512F-NEXT: vzeroupper
547 ; AVX512BW-LABEL: avg_v64i8:
549 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
550 ; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
551 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
552 ; AVX512BW-NEXT: vzeroupper
553 ; AVX512BW-NEXT: retq
554 %1 = load <64 x i8>, <64 x i8>* %a
555 %2 = load <64 x i8>, <64 x i8>* %b
556 %3 = zext <64 x i8> %1 to <64 x i32>
557 %4 = zext <64 x i8> %2 to <64 x i32>
558 %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
559 %6 = add nuw nsw <64 x i32> %5, %4
560 %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
561 %8 = trunc <64 x i32> %7 to <64 x i8>
562 store <64 x i8> %8, <64 x i8>* undef, align 4
563 ret void
564 }
566 define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) nounwind {
567 ; SSE2-LABEL: avg_v4i16:
569 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
570 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
571 ; SSE2-NEXT: pavgw %xmm0, %xmm1
572 ; SSE2-NEXT: movq %xmm1, (%rax)
575 ; AVX-LABEL: avg_v4i16:
577 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
578 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
579 ; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0
580 ; AVX-NEXT: vmovq %xmm0, (%rax)
582 %1 = load <4 x i16>, <4 x i16>* %a
583 %2 = load <4 x i16>, <4 x i16>* %b
584 %3 = zext <4 x i16> %1 to <4 x i32>
585 %4 = zext <4 x i16> %2 to <4 x i32>
586 %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
587 %6 = add nuw nsw <4 x i32> %5, %4
588 %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
589 %8 = trunc <4 x i32> %7 to <4 x i16>
590 store <4 x i16> %8, <4 x i16>* undef, align 4
591 ret void
592 }
594 define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind {
595 ; SSE2-LABEL: avg_v8i16:
597 ; SSE2-NEXT: movdqa (%rsi), %xmm0
598 ; SSE2-NEXT: pavgw (%rdi), %xmm0
599 ; SSE2-NEXT: movdqu %xmm0, (%rax)
602 ; AVX-LABEL: avg_v8i16:
604 ; AVX-NEXT: vmovdqa (%rsi), %xmm0
605 ; AVX-NEXT: vpavgw (%rdi), %xmm0, %xmm0
606 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
608 %1 = load <8 x i16>, <8 x i16>* %a
609 %2 = load <8 x i16>, <8 x i16>* %b
610 %3 = zext <8 x i16> %1 to <8 x i32>
611 %4 = zext <8 x i16> %2 to <8 x i32>
612 %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
613 %6 = add nuw nsw <8 x i32> %5, %4
614 %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
615 %8 = trunc <8 x i32> %7 to <8 x i16>
616 store <8 x i16> %8, <8 x i16>* undef, align 4
617 ret void
618 }
620 define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
621 ; SSE2-LABEL: avg_v16i16:
623 ; SSE2-NEXT: movdqa (%rsi), %xmm0
624 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
625 ; SSE2-NEXT: pavgw (%rdi), %xmm0
626 ; SSE2-NEXT: pavgw 16(%rdi), %xmm1
627 ; SSE2-NEXT: movdqu %xmm1, (%rax)
628 ; SSE2-NEXT: movdqu %xmm0, (%rax)
631 ; AVX1-LABEL: avg_v16i16:
633 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
634 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
635 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1
636 ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0
637 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
638 ; AVX1-NEXT: vmovups %ymm0, (%rax)
639 ; AVX1-NEXT: vzeroupper
642 ; AVX2-LABEL: avg_v16i16:
644 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
645 ; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
646 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
647 ; AVX2-NEXT: vzeroupper
650 ; AVX512-LABEL: avg_v16i16:
652 ; AVX512-NEXT: vmovdqa (%rsi), %ymm0
653 ; AVX512-NEXT: vpavgw (%rdi), %ymm0, %ymm0
654 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
655 ; AVX512-NEXT: vzeroupper
657 %1 = load <16 x i16>, <16 x i16>* %a
658 %2 = load <16 x i16>, <16 x i16>* %b
659 %3 = zext <16 x i16> %1 to <16 x i32>
660 %4 = zext <16 x i16> %2 to <16 x i32>
661 %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
662 %6 = add nuw nsw <16 x i32> %5, %4
663 %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
664 %8 = trunc <16 x i32> %7 to <16 x i16>
665 store <16 x i16> %8, <16 x i16>* undef, align 4
666 ret void
667 }
669 define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
670 ; SSE2-LABEL: avg_v32i16:
672 ; SSE2-NEXT: movdqa (%rsi), %xmm0
673 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
674 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
675 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
676 ; SSE2-NEXT: pavgw (%rdi), %xmm0
677 ; SSE2-NEXT: pavgw 16(%rdi), %xmm1
678 ; SSE2-NEXT: pavgw 32(%rdi), %xmm2
679 ; SSE2-NEXT: pavgw 48(%rdi), %xmm3
680 ; SSE2-NEXT: movdqu %xmm3, (%rax)
681 ; SSE2-NEXT: movdqu %xmm2, (%rax)
682 ; SSE2-NEXT: movdqu %xmm1, (%rax)
683 ; SSE2-NEXT: movdqu %xmm0, (%rax)
686 ; AVX1-LABEL: avg_v32i16:
688 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
689 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
690 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2
691 ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3
692 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1
693 ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0
694 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
695 ; AVX1-NEXT: vpavgw 48(%rdi), %xmm3, %xmm1
696 ; AVX1-NEXT: vpavgw 32(%rdi), %xmm2, %xmm2
697 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
698 ; AVX1-NEXT: vmovups %ymm1, (%rax)
699 ; AVX1-NEXT: vmovups %ymm0, (%rax)
700 ; AVX1-NEXT: vzeroupper
703 ; AVX2-LABEL: avg_v32i16:
705 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
706 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
707 ; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
708 ; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
709 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
710 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
711 ; AVX2-NEXT: vzeroupper
714 ; AVX512F-LABEL: avg_v32i16:
716 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
717 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
718 ; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0
719 ; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
720 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
721 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
722 ; AVX512F-NEXT: vzeroupper
725 ; AVX512BW-LABEL: avg_v32i16:
727 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
728 ; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
729 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
730 ; AVX512BW-NEXT: vzeroupper
731 ; AVX512BW-NEXT: retq
732 %1 = load <32 x i16>, <32 x i16>* %a
733 %2 = load <32 x i16>, <32 x i16>* %b
734 %3 = zext <32 x i16> %1 to <32 x i32>
735 %4 = zext <32 x i16> %2 to <32 x i32>
736 %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
737 %6 = add nuw nsw <32 x i32> %5, %4
738 %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
739 %8 = trunc <32 x i32> %7 to <32 x i16>
740 store <32 x i16> %8, <32 x i16>* undef, align 4
741 ret void
742 }
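; The *_2 variants below commute the pattern: the two zero-extended operands
; are added first and the +1 rounding bias is added afterwards. The same
; pavg lowering is expected.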
744 define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) nounwind {
745 ; SSE2-LABEL: avg_v4i8_2:
747 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
748 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
749 ; SSE2-NEXT: pavgb %xmm0, %xmm1
750 ; SSE2-NEXT: movd %xmm1, (%rax)
753 ; AVX-LABEL: avg_v4i8_2:
755 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
756 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
757 ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
758 ; AVX-NEXT: vmovd %xmm0, (%rax)
760 %1 = load <4 x i8>, <4 x i8>* %a
761 %2 = load <4 x i8>, <4 x i8>* %b
762 %3 = zext <4 x i8> %1 to <4 x i32>
763 %4 = zext <4 x i8> %2 to <4 x i32>
764 %5 = add nuw nsw <4 x i32> %3, %4
765 %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
766 %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
767 %8 = trunc <4 x i32> %7 to <4 x i8>
768 store <4 x i8> %8, <4 x i8>* undef, align 4
769 ret void
770 }
772 define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) nounwind {
773 ; SSE2-LABEL: avg_v8i8_2:
775 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
776 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
777 ; SSE2-NEXT: pavgb %xmm0, %xmm1
778 ; SSE2-NEXT: movq %xmm1, (%rax)
781 ; AVX-LABEL: avg_v8i8_2:
783 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
784 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
785 ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
786 ; AVX-NEXT: vmovq %xmm0, (%rax)
788 %1 = load <8 x i8>, <8 x i8>* %a
789 %2 = load <8 x i8>, <8 x i8>* %b
790 %3 = zext <8 x i8> %1 to <8 x i32>
791 %4 = zext <8 x i8> %2 to <8 x i32>
792 %5 = add nuw nsw <8 x i32> %3, %4
793 %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
794 %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
795 %8 = trunc <8 x i32> %7 to <8 x i8>
796 store <8 x i8> %8, <8 x i8>* undef, align 4
797 ret void
798 }
800 define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) nounwind {
801 ; SSE2-LABEL: avg_v16i8_2:
803 ; SSE2-NEXT: movdqa (%rdi), %xmm0
804 ; SSE2-NEXT: pavgb (%rsi), %xmm0
805 ; SSE2-NEXT: movdqu %xmm0, (%rax)
808 ; AVX-LABEL: avg_v16i8_2:
810 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
811 ; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0
812 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
814 %1 = load <16 x i8>, <16 x i8>* %a
815 %2 = load <16 x i8>, <16 x i8>* %b
816 %3 = zext <16 x i8> %1 to <16 x i32>
817 %4 = zext <16 x i8> %2 to <16 x i32>
818 %5 = add nuw nsw <16 x i32> %3, %4
819 %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
820 %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
821 %8 = trunc <16 x i32> %7 to <16 x i8>
822 store <16 x i8> %8, <16 x i8>* undef, align 4
823 ret void
824 }
826 define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {
827 ; SSE2-LABEL: avg_v32i8_2:
829 ; SSE2-NEXT: movdqa (%rdi), %xmm0
830 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
831 ; SSE2-NEXT: pavgb (%rsi), %xmm0
832 ; SSE2-NEXT: pavgb 16(%rsi), %xmm1
833 ; SSE2-NEXT: movdqu %xmm1, (%rax)
834 ; SSE2-NEXT: movdqu %xmm0, (%rax)
837 ; AVX1-LABEL: avg_v32i8_2:
839 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
840 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
841 ; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
842 ; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0
843 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
844 ; AVX1-NEXT: vmovups %ymm0, (%rax)
845 ; AVX1-NEXT: vzeroupper
848 ; AVX2-LABEL: avg_v32i8_2:
850 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
851 ; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
852 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
853 ; AVX2-NEXT: vzeroupper
856 ; AVX512-LABEL: avg_v32i8_2:
858 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
859 ; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0
860 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
861 ; AVX512-NEXT: vzeroupper
863 %1 = load <32 x i8>, <32 x i8>* %a
864 %2 = load <32 x i8>, <32 x i8>* %b
865 %3 = zext <32 x i8> %1 to <32 x i32>
866 %4 = zext <32 x i8> %2 to <32 x i32>
867 %5 = add nuw nsw <32 x i32> %3, %4
868 %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
869 %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
870 %8 = trunc <32 x i32> %7 to <32 x i8>
871 store <32 x i8> %8, <32 x i8>* undef, align 4
872 ret void
873 }
875 define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
876 ; SSE2-LABEL: avg_v64i8_2:
878 ; SSE2-NEXT: movdqa (%rsi), %xmm0
879 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
880 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
881 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
882 ; SSE2-NEXT: pavgb %xmm0, %xmm0
883 ; SSE2-NEXT: pavgb %xmm1, %xmm1
884 ; SSE2-NEXT: pavgb %xmm2, %xmm2
885 ; SSE2-NEXT: pavgb %xmm3, %xmm3
886 ; SSE2-NEXT: movdqu %xmm3, (%rax)
887 ; SSE2-NEXT: movdqu %xmm2, (%rax)
888 ; SSE2-NEXT: movdqu %xmm1, (%rax)
889 ; SSE2-NEXT: movdqu %xmm0, (%rax)
892 ; AVX1-LABEL: avg_v64i8_2:
894 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
895 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
896 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2
897 ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3
898 ; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm0
899 ; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm1
900 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
901 ; AVX1-NEXT: vpavgb %xmm2, %xmm2, %xmm1
902 ; AVX1-NEXT: vpavgb %xmm3, %xmm3, %xmm2
903 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
904 ; AVX1-NEXT: vmovups %ymm1, (%rax)
905 ; AVX1-NEXT: vmovups %ymm0, (%rax)
906 ; AVX1-NEXT: vzeroupper
909 ; AVX2-LABEL: avg_v64i8_2:
911 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
912 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
913 ; AVX2-NEXT: vpavgb %ymm0, %ymm0, %ymm0
914 ; AVX2-NEXT: vpavgb %ymm1, %ymm1, %ymm1
915 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
916 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
917 ; AVX2-NEXT: vzeroupper
920 ; AVX512F-LABEL: avg_v64i8_2:
922 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
923 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
924 ; AVX512F-NEXT: vpavgb %ymm0, %ymm0, %ymm0
925 ; AVX512F-NEXT: vpavgb %ymm1, %ymm1, %ymm1
926 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
927 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
928 ; AVX512F-NEXT: vzeroupper
931 ; AVX512BW-LABEL: avg_v64i8_2:
933 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
934 ; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
935 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
936 ; AVX512BW-NEXT: vzeroupper
937 ; AVX512BW-NEXT: retq
938 %1 = load <64 x i8>, <64 x i8>* %a
939 %2 = load <64 x i8>, <64 x i8>* %b
940 %3 = zext <64 x i8> %1 to <64 x i32>
941 %4 = zext <64 x i8> %2 to <64 x i32>
942 %5 = add nuw nsw <64 x i32> %4, %4
943 %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
944 %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
945 %8 = trunc <64 x i32> %7 to <64 x i8>
946 store <64 x i8> %8, <64 x i8>* undef, align 4
947 ret void
948 }
951 define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) nounwind {
952 ; SSE2-LABEL: avg_v4i16_2:
954 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
955 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
956 ; SSE2-NEXT: pavgw %xmm0, %xmm1
957 ; SSE2-NEXT: movq %xmm1, (%rax)
960 ; AVX-LABEL: avg_v4i16_2:
962 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
963 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
964 ; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0
965 ; AVX-NEXT: vmovq %xmm0, (%rax)
967 %1 = load <4 x i16>, <4 x i16>* %a
968 %2 = load <4 x i16>, <4 x i16>* %b
969 %3 = zext <4 x i16> %1 to <4 x i32>
970 %4 = zext <4 x i16> %2 to <4 x i32>
971 %5 = add nuw nsw <4 x i32> %3, %4
972 %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
973 %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
974 %8 = trunc <4 x i32> %7 to <4 x i16>
975 store <4 x i16> %8, <4 x i16>* undef, align 4
976 ret void
977 }
979 define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) nounwind {
980 ; SSE2-LABEL: avg_v8i16_2:
982 ; SSE2-NEXT: movdqa (%rdi), %xmm0
983 ; SSE2-NEXT: pavgw (%rsi), %xmm0
984 ; SSE2-NEXT: movdqu %xmm0, (%rax)
987 ; AVX-LABEL: avg_v8i16_2:
989 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
990 ; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0
991 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
993 %1 = load <8 x i16>, <8 x i16>* %a
994 %2 = load <8 x i16>, <8 x i16>* %b
995 %3 = zext <8 x i16> %1 to <8 x i32>
996 %4 = zext <8 x i16> %2 to <8 x i32>
997 %5 = add nuw nsw <8 x i32> %3, %4
998 %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
999 %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1000 %8 = trunc <8 x i32> %7 to <8 x i16>
1001 store <8 x i16> %8, <8 x i16>* undef, align 4
1002 ret void
1003 }
1005 define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {
1006 ; SSE2-LABEL: avg_v16i16_2:
1008 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1009 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
1010 ; SSE2-NEXT: pavgw (%rsi), %xmm0
1011 ; SSE2-NEXT: pavgw 16(%rsi), %xmm1
1012 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1013 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1016 ; AVX1-LABEL: avg_v16i16_2:
1018 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
1019 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
1020 ; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1
1021 ; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0
1022 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1023 ; AVX1-NEXT: vmovups %ymm0, (%rax)
1024 ; AVX1-NEXT: vzeroupper
1027 ; AVX2-LABEL: avg_v16i16_2:
1029 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1030 ; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
1031 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1032 ; AVX2-NEXT: vzeroupper
1035 ; AVX512-LABEL: avg_v16i16_2:
1037 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
1038 ; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0
1039 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
1040 ; AVX512-NEXT: vzeroupper
1042 %1 = load <16 x i16>, <16 x i16>* %a
1043 %2 = load <16 x i16>, <16 x i16>* %b
1044 %3 = zext <16 x i16> %1 to <16 x i32>
1045 %4 = zext <16 x i16> %2 to <16 x i32>
1046 %5 = add nuw nsw <16 x i32> %3, %4
1047 %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1048 %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1049 %8 = trunc <16 x i32> %7 to <16 x i16>
1050 store <16 x i16> %8, <16 x i16>* undef, align 4
1051 ret void
1052 }
1054 define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
1055 ; SSE2-LABEL: avg_v32i16_2:
1057 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1058 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
1059 ; SSE2-NEXT: movdqa 32(%rdi), %xmm2
1060 ; SSE2-NEXT: movdqa 48(%rdi), %xmm3
1061 ; SSE2-NEXT: pavgw (%rsi), %xmm0
1062 ; SSE2-NEXT: pavgw 16(%rsi), %xmm1
1063 ; SSE2-NEXT: pavgw 32(%rsi), %xmm2
1064 ; SSE2-NEXT: pavgw 48(%rsi), %xmm3
1065 ; SSE2-NEXT: movdqu %xmm3, (%rax)
1066 ; SSE2-NEXT: movdqu %xmm2, (%rax)
1067 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1068 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1071 ; AVX1-LABEL: avg_v32i16_2:
1073 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
1074 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
1075 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
1076 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
1077 ; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1
1078 ; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0
1079 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1080 ; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm1
1081 ; AVX1-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2
1082 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1083 ; AVX1-NEXT: vmovups %ymm1, (%rax)
1084 ; AVX1-NEXT: vmovups %ymm0, (%rax)
1085 ; AVX1-NEXT: vzeroupper
1088 ; AVX2-LABEL: avg_v32i16_2:
1090 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1091 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
1092 ; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
1093 ; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
1094 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
1095 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1096 ; AVX2-NEXT: vzeroupper
1099 ; AVX512F-LABEL: avg_v32i16_2:
1101 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
1102 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
1103 ; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
1104 ; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
1105 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
1106 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
1107 ; AVX512F-NEXT: vzeroupper
1108 ; AVX512F-NEXT: retq
1110 ; AVX512BW-LABEL: avg_v32i16_2:
1111 ; AVX512BW: # %bb.0:
1112 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1113 ; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
1114 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
1115 ; AVX512BW-NEXT: vzeroupper
1116 ; AVX512BW-NEXT: retq
1117 %1 = load <32 x i16>, <32 x i16>* %a
1118 %2 = load <32 x i16>, <32 x i16>* %b
1119 %3 = zext <32 x i16> %1 to <32 x i32>
1120 %4 = zext <32 x i16> %2 to <32 x i32>
1121 %5 = add nuw nsw <32 x i32> %3, %4
1122 %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1123 %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1124 %8 = trunc <32 x i32> %7 to <32 x i16>
1125 store <32 x i16> %8, <32 x i16>* undef, align 4
1126 ret void
1127 }
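; The *_const variants below average a loaded vector with a constant taken
; from the constant pool; note the folded pavg constant is the IR addend
; minus one, since pavg supplies its own +1 rounding bias.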
1129 define void @avg_v4i8_const(<4 x i8>* %a) nounwind {
1130 ; SSE2-LABEL: avg_v4i8_const:
1132 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1133 ; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
1134 ; SSE2-NEXT: movd %xmm0, (%rax)
1137 ; AVX-LABEL: avg_v4i8_const:
1139 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1140 ; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
1141 ; AVX-NEXT: vmovd %xmm0, (%rax)
1143 %1 = load <4 x i8>, <4 x i8>* %a
1144 %2 = zext <4 x i8> %1 to <4 x i32>
1145 %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
1146 %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
1147 %5 = trunc <4 x i32> %4 to <4 x i8>
1148 store <4 x i8> %5, <4 x i8>* undef, align 4
1149 ret void
1150 }
1152 define void @avg_v8i8_const(<8 x i8>* %a) nounwind {
1153 ; SSE2-LABEL: avg_v8i8_const:
1155 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1156 ; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
1157 ; SSE2-NEXT: movq %xmm0, (%rax)
1160 ; AVX-LABEL: avg_v8i8_const:
1162 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1163 ; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
1164 ; AVX-NEXT: vmovq %xmm0, (%rax)
1166 %1 = load <8 x i8>, <8 x i8>* %a
1167 %2 = zext <8 x i8> %1 to <8 x i32>
1168 %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1169 %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1170 %5 = trunc <8 x i32> %4 to <8 x i8>
1171 store <8 x i8> %5, <8 x i8>* undef, align 4
1172 ret void
1173 }
1175 define void @avg_v16i8_const(<16 x i8>* %a) nounwind {
1176 ; SSE2-LABEL: avg_v16i8_const:
1178 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1179 ; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
1180 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1183 ; AVX-LABEL: avg_v16i8_const:
1185 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1186 ; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
1187 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
1189 %1 = load <16 x i8>, <16 x i8>* %a
1190 %2 = zext <16 x i8> %1 to <16 x i32>
1191 %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1192 %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1193 %5 = trunc <16 x i32> %4 to <16 x i8>
1194 store <16 x i8> %5, <16 x i8>* undef, align 4
1195 ret void
1196 }
1198 define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
1199 ; SSE2-LABEL: avg_v32i8_const:
1201 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1202 ; SSE2-NEXT: movdqa (%rdi), %xmm1
1203 ; SSE2-NEXT: pavgb %xmm0, %xmm1
1204 ; SSE2-NEXT: pavgb 16(%rdi), %xmm0
1205 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1206 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1209 ; AVX1-LABEL: avg_v32i8_const:
1211 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [7.9499288951273625E-275,7.9499288951273625E-275]
1212 ; AVX1-NEXT: # xmm0 = mem[0,0]
1213 ; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm1
1214 ; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0
1215 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1216 ; AVX1-NEXT: vmovups %ymm0, (%rax)
1217 ; AVX1-NEXT: vzeroupper
1220 ; AVX2-LABEL: avg_v32i8_const:
1222 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1223 ; AVX2-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
1224 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1225 ; AVX2-NEXT: vzeroupper
1228 ; AVX512-LABEL: avg_v32i8_const:
1230 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
1231 ; AVX512-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
1232 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
1233 ; AVX512-NEXT: vzeroupper
1235 %1 = load <32 x i8>, <32 x i8>* %a
1236 %2 = zext <32 x i8> %1 to <32 x i32>
1237 %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1238 %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1239 %5 = trunc <32 x i32> %4 to <32 x i8>
1240 store <32 x i8> %5, <32 x i8>* undef, align 4
1241 ret void
1242 }
1244 define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
1245 ; SSE2-LABEL: avg_v64i8_const:
1247 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1248 ; SSE2-NEXT: movdqa (%rdi), %xmm1
1249 ; SSE2-NEXT: pavgb %xmm0, %xmm1
1250 ; SSE2-NEXT: movdqa 16(%rdi), %xmm2
1251 ; SSE2-NEXT: pavgb %xmm0, %xmm2
1252 ; SSE2-NEXT: movdqa 32(%rdi), %xmm3
1253 ; SSE2-NEXT: pavgb %xmm0, %xmm3
1254 ; SSE2-NEXT: pavgb 48(%rdi), %xmm0
1255 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1256 ; SSE2-NEXT: movdqu %xmm3, (%rax)
1257 ; SSE2-NEXT: movdqu %xmm2, (%rax)
1258 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1261 ; AVX1-LABEL: avg_v64i8_const:
1263 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [7.9499288951273625E-275,7.9499288951273625E-275]
1264 ; AVX1-NEXT: # xmm0 = mem[0,0]
1265 ; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm1
1266 ; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm2
1267 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1268 ; AVX1-NEXT: vpavgb 48(%rdi), %xmm0, %xmm2
1269 ; AVX1-NEXT: vpavgb 32(%rdi), %xmm0, %xmm0
1270 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1271 ; AVX1-NEXT: vmovups %ymm0, (%rax)
1272 ; AVX1-NEXT: vmovups %ymm1, (%rax)
1273 ; AVX1-NEXT: vzeroupper
1276 ; AVX2-LABEL: avg_v64i8_const:
1278 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
1279 ; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm1
1280 ; AVX2-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
1281 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1282 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
1283 ; AVX2-NEXT: vzeroupper
1286 ; AVX512F-LABEL: avg_v64i8_const:
1288 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
1289 ; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm1
1290 ; AVX512F-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
1291 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
1292 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
1293 ; AVX512F-NEXT: vzeroupper
1294 ; AVX512F-NEXT: retq
1296 ; AVX512BW-LABEL: avg_v64i8_const:
1297 ; AVX512BW: # %bb.0:
1298 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1299 ; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0
1300 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
1301 ; AVX512BW-NEXT: vzeroupper
1302 ; AVX512BW-NEXT: retq
1303 %1 = load <64 x i8>, <64 x i8>* %a
1304 %2 = zext <64 x i8> %1 to <64 x i32>
1305 %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1306 %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1307 %5 = trunc <64 x i32> %4 to <64 x i8>
1308 store <64 x i8> %5, <64 x i8>* undef, align 4
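; Word-element constant averages follow the same pattern using PAVGW; the <4 x i16>
; case only needs a 64-bit load and store (movq).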
1312 define void @avg_v4i16_const(<4 x i16>* %a) nounwind {
1313 ; SSE2-LABEL: avg_v4i16_const:
1315 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1316 ; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0
1317 ; SSE2-NEXT: movq %xmm0, (%rax)
1320 ; AVX-LABEL: avg_v4i16_const:
1322 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1323 ; AVX-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
1324 ; AVX-NEXT: vmovq %xmm0, (%rax)
1326 %1 = load <4 x i16>, <4 x i16>* %a
1327 %2 = zext <4 x i16> %1 to <4 x i32>
1328 %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
1329 %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
1330 %5 = trunc <4 x i32> %4 to <4 x i16>
1331 store <4 x i16> %5, <4 x i16>* undef, align 4
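; Full 128-bit PAVGW with the constant taken straight from the constant pool.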
1335 define void @avg_v8i16_const(<8 x i16>* %a) nounwind {
1336 ; SSE2-LABEL: avg_v8i16_const:
1338 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1339 ; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0
1340 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1343 ; AVX-LABEL: avg_v8i16_const:
1345 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1346 ; AVX-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
1347 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
1349 %1 = load <8 x i16>, <8 x i16>* %a
1350 %2 = zext <8 x i16> %1 to <8 x i32>
1351 %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1352 %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1353 %5 = trunc <8 x i32> %4 to <8 x i16>
1354 store <8 x i16> %5, <8 x i16>* undef, align 4
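; 256-bit word case: SSE2 and AVX1 split into two 128-bit PAVGWs sharing one constant,
; while AVX2/AVX512 use a single 256-bit PAVGW.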
1358 define void @avg_v16i16_const(<16 x i16>* %a) nounwind {
1359 ; SSE2-LABEL: avg_v16i16_const:
1361 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1362 ; SSE2-NEXT: movdqa (%rdi), %xmm1
1363 ; SSE2-NEXT: pavgw %xmm0, %xmm1
1364 ; SSE2-NEXT: pavgw 16(%rdi), %xmm0
1365 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1366 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1369 ; AVX1-LABEL: avg_v16i16_const:
1371 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1372 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm1
1373 ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0
1374 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1375 ; AVX1-NEXT: vmovups %ymm0, (%rax)
1376 ; AVX1-NEXT: vzeroupper
1379 ; AVX2-LABEL: avg_v16i16_const:
1381 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1382 ; AVX2-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
1383 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1384 ; AVX2-NEXT: vzeroupper
1387 ; AVX512-LABEL: avg_v16i16_const:
1389 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
1390 ; AVX512-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
1391 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
1392 ; AVX512-NEXT: vzeroupper
1394 %1 = load <16 x i16>, <16 x i16>* %a
1395 %2 = zext <16 x i16> %1 to <16 x i32>
1396 %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1397 %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1398 %5 = trunc <16 x i32> %4 to <16 x i16>
1399 store <16 x i16> %5, <16 x i16>* undef, align 4
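; 512-bit word case: SSE2 uses four PAVGWs, AVX1 four plus vinsertf128 reassembly,
; AVX2/AVX512F two 256-bit PAVGWs with a broadcast constant, and AVX512BW one
; 512-bit PAVGW.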
1403 define void @avg_v32i16_const(<32 x i16>* %a) nounwind {
1404 ; SSE2-LABEL: avg_v32i16_const:
1406 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1407 ; SSE2-NEXT: movdqa (%rdi), %xmm1
1408 ; SSE2-NEXT: pavgw %xmm0, %xmm1
1409 ; SSE2-NEXT: movdqa 16(%rdi), %xmm2
1410 ; SSE2-NEXT: pavgw %xmm0, %xmm2
1411 ; SSE2-NEXT: movdqa 32(%rdi), %xmm3
1412 ; SSE2-NEXT: pavgw %xmm0, %xmm3
1413 ; SSE2-NEXT: pavgw 48(%rdi), %xmm0
1414 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1415 ; SSE2-NEXT: movdqu %xmm3, (%rax)
1416 ; SSE2-NEXT: movdqu %xmm2, (%rax)
1417 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1420 ; AVX1-LABEL: avg_v32i16_const:
1422 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1423 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm1
1424 ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm2
1425 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1426 ; AVX1-NEXT: vpavgw 48(%rdi), %xmm0, %xmm2
1427 ; AVX1-NEXT: vpavgw 32(%rdi), %xmm0, %xmm0
1428 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1429 ; AVX1-NEXT: vmovups %ymm0, (%rax)
1430 ; AVX1-NEXT: vmovups %ymm1, (%rax)
1431 ; AVX1-NEXT: vzeroupper
1434 ; AVX2-LABEL: avg_v32i16_const:
1436 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1437 ; AVX2-NEXT: # ymm0 = mem[0,1,0,1]
1438 ; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm1
1439 ; AVX2-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
1440 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1441 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
1442 ; AVX2-NEXT: vzeroupper
1445 ; AVX512F-LABEL: avg_v32i16_const:
1447 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1448 ; AVX512F-NEXT: # ymm0 = mem[0,1,0,1]
1449 ; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm1
1450 ; AVX512F-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
1451 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
1452 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
1453 ; AVX512F-NEXT: vzeroupper
1454 ; AVX512F-NEXT: retq
1456 ; AVX512BW-LABEL: avg_v32i16_const:
1457 ; AVX512BW: # %bb.0:
1458 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1459 ; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0
1460 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
1461 ; AVX512BW-NEXT: vzeroupper
1462 ; AVX512BW-NEXT: retq
1463 %1 = load <32 x i16>, <32 x i16>* %a
1464 %2 = zext <32 x i16> %1 to <32 x i32>
1465 %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1466 %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1467 %5 = trunc <32 x i32> %4 to <32 x i16>
1468 store <32 x i16> %5, <32 x i16>* undef, align 4
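; The _3 variants zero-extend only to i16 (rather than i32) before the add/shift and
; take and return the vectors by value; the pattern still matches a single PAVGB on
; register operands.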
1472 define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind {
1473 ; SSE2-LABEL: avg_v16i8_3:
1475 ; SSE2-NEXT: pavgb %xmm1, %xmm0
1478 ; AVX-LABEL: avg_v16i8_3:
1480 ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
1482 %za = zext <16 x i8> %a to <16 x i16>
1483 %zb = zext <16 x i8> %b to <16 x i16>
1484 %add = add nuw nsw <16 x i16> %za, %zb
1485 %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1486 %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1487 %res = trunc <16 x i16> %lshr to <16 x i8>
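; 256-bit version of the i16-extended pattern: AVX1 splits into two 128-bit PAVGBs,
; AVX2/AVX512 use one 256-bit PAVGB.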
1491 define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {
1492 ; SSE2-LABEL: avg_v32i8_3:
1494 ; SSE2-NEXT: pavgb %xmm2, %xmm0
1495 ; SSE2-NEXT: pavgb %xmm3, %xmm1
1498 ; AVX1-LABEL: avg_v32i8_3:
1500 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1501 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1502 ; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
1503 ; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0
1504 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1507 ; AVX2-LABEL: avg_v32i8_3:
1509 ; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0
1512 ; AVX512-LABEL: avg_v32i8_3:
1514 ; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
1516 %za = zext <32 x i8> %a to <32 x i16>
1517 %zb = zext <32 x i8> %b to <32 x i16>
1518 %add = add nuw nsw <32 x i16> %za, %zb
1519 %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1520 %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1521 %res = trunc <32 x i16> %lshr to <32 x i8>
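; 512-bit version: SSE2 needs four PAVGBs, AVX1 four with reassembly, AVX2/AVX512F
; two 256-bit PAVGBs, and AVX512BW a single 512-bit PAVGB.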
1525 define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind {
1526 ; SSE2-LABEL: avg_v64i8_3:
1528 ; SSE2-NEXT: pavgb %xmm4, %xmm0
1529 ; SSE2-NEXT: pavgb %xmm5, %xmm1
1530 ; SSE2-NEXT: pavgb %xmm6, %xmm2
1531 ; SSE2-NEXT: pavgb %xmm7, %xmm3
1534 ; AVX1-LABEL: avg_v64i8_3:
1536 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
1537 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
1538 ; AVX1-NEXT: vpavgb %xmm4, %xmm5, %xmm4
1539 ; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0
1540 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1541 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
1542 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1543 ; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2
1544 ; AVX1-NEXT: vpavgb %xmm3, %xmm1, %xmm1
1545 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1548 ; AVX2-LABEL: avg_v64i8_3:
1550 ; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0
1551 ; AVX2-NEXT: vpavgb %ymm3, %ymm1, %ymm1
1554 ; AVX512F-LABEL: avg_v64i8_3:
1556 ; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
1557 ; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
1558 ; AVX512F-NEXT: retq
1560 ; AVX512BW-LABEL: avg_v64i8_3:
1561 ; AVX512BW: # %bb.0:
1562 ; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0
1563 ; AVX512BW-NEXT: retq
1564 %za = zext <64 x i8> %a to <64 x i16>
1565 %zb = zext <64 x i8> %b to <64 x i16>
1566 %add = add nuw nsw <64 x i16> %za, %zb
1567 %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1568 %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1569 %res = trunc <64 x i16> %lshr to <64 x i8>
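; A deliberately huge <512 x i8> average: the operands beyond the register-passed
; arguments arrive on the stack, and the result is returned indirectly through the
; pointer in %rdi, so every target simply repeats its PAVGB pattern over stack slots
; and stores the pieces through that pointer.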
1573 define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind {
1574 ; SSE2-LABEL: avg_v512i8_3:
1576 ; SSE2-NEXT: movq %rdi, %rax
1577 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1578 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1579 ; SSE2-NEXT: movdqa %xmm8, 496(%rdi)
1580 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1581 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1582 ; SSE2-NEXT: movdqa %xmm8, 480(%rdi)
1583 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1584 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1585 ; SSE2-NEXT: movdqa %xmm8, 464(%rdi)
1586 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1587 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1588 ; SSE2-NEXT: movdqa %xmm8, 448(%rdi)
1589 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1590 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1591 ; SSE2-NEXT: movdqa %xmm8, 432(%rdi)
1592 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1593 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1594 ; SSE2-NEXT: movdqa %xmm8, 416(%rdi)
1595 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1596 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1597 ; SSE2-NEXT: movdqa %xmm8, 400(%rdi)
1598 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1599 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1600 ; SSE2-NEXT: movdqa %xmm8, 384(%rdi)
1601 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1602 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1603 ; SSE2-NEXT: movdqa %xmm8, 368(%rdi)
1604 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1605 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1606 ; SSE2-NEXT: movdqa %xmm8, 352(%rdi)
1607 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1608 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1609 ; SSE2-NEXT: movdqa %xmm8, 336(%rdi)
1610 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1611 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1612 ; SSE2-NEXT: movdqa %xmm8, 320(%rdi)
1613 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1614 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1615 ; SSE2-NEXT: movdqa %xmm8, 304(%rdi)
1616 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1617 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1618 ; SSE2-NEXT: movdqa %xmm8, 288(%rdi)
1619 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1620 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1621 ; SSE2-NEXT: movdqa %xmm8, 272(%rdi)
1622 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1623 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1624 ; SSE2-NEXT: movdqa %xmm8, 256(%rdi)
1625 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1626 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1627 ; SSE2-NEXT: movdqa %xmm8, 240(%rdi)
1628 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1629 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1630 ; SSE2-NEXT: movdqa %xmm8, 224(%rdi)
1631 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1632 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1633 ; SSE2-NEXT: movdqa %xmm8, 208(%rdi)
1634 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1635 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1636 ; SSE2-NEXT: movdqa %xmm8, 192(%rdi)
1637 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1638 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1639 ; SSE2-NEXT: movdqa %xmm8, 176(%rdi)
1640 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1641 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1642 ; SSE2-NEXT: movdqa %xmm8, 160(%rdi)
1643 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1644 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1645 ; SSE2-NEXT: movdqa %xmm8, 144(%rdi)
1646 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1647 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1648 ; SSE2-NEXT: movdqa %xmm8, 128(%rdi)
1649 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm7
1650 ; SSE2-NEXT: movdqa %xmm7, 112(%rdi)
1651 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm6
1652 ; SSE2-NEXT: movdqa %xmm6, 96(%rdi)
1653 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm5
1654 ; SSE2-NEXT: movdqa %xmm5, 80(%rdi)
1655 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm4
1656 ; SSE2-NEXT: movdqa %xmm4, 64(%rdi)
1657 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm3
1658 ; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
1659 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm2
1660 ; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
1661 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm1
1662 ; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
1663 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm0
1664 ; SSE2-NEXT: movdqa %xmm0, (%rdi)
1667 ; AVX1-LABEL: avg_v512i8_3:
1669 ; AVX1-NEXT: pushq %rbp
1670 ; AVX1-NEXT: movq %rsp, %rbp
1671 ; AVX1-NEXT: andq $-32, %rsp
1672 ; AVX1-NEXT: subq $96, %rsp
1673 ; AVX1-NEXT: movq %rdi, %rax
1674 ; AVX1-NEXT: vpavgb 272(%rbp), %xmm0, %xmm8
1675 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1676 ; AVX1-NEXT: vpavgb 288(%rbp), %xmm0, %xmm0
1677 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0
1678 ; AVX1-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1679 ; AVX1-NEXT: vpavgb 304(%rbp), %xmm1, %xmm8
1680 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1681 ; AVX1-NEXT: vpavgb 320(%rbp), %xmm1, %xmm1
1682 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm0
1683 ; AVX1-NEXT: vmovaps %ymm0, (%rsp) # 32-byte Spill
1684 ; AVX1-NEXT: vpavgb 336(%rbp), %xmm2, %xmm8
1685 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1686 ; AVX1-NEXT: vpavgb 352(%rbp), %xmm2, %xmm2
1687 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm13
1688 ; AVX1-NEXT: vpavgb 368(%rbp), %xmm3, %xmm8
1689 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1690 ; AVX1-NEXT: vpavgb 384(%rbp), %xmm3, %xmm3
1691 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm14
1692 ; AVX1-NEXT: vpavgb 400(%rbp), %xmm4, %xmm8
1693 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
1694 ; AVX1-NEXT: vpavgb 416(%rbp), %xmm4, %xmm4
1695 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm15
1696 ; AVX1-NEXT: vpavgb 432(%rbp), %xmm5, %xmm8
1697 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
1698 ; AVX1-NEXT: vpavgb 448(%rbp), %xmm5, %xmm5
1699 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm12
1700 ; AVX1-NEXT: vpavgb 464(%rbp), %xmm6, %xmm8
1701 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
1702 ; AVX1-NEXT: vpavgb 480(%rbp), %xmm6, %xmm6
1703 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6
1704 ; AVX1-NEXT: vpavgb 496(%rbp), %xmm7, %xmm8
1705 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
1706 ; AVX1-NEXT: vpavgb 512(%rbp), %xmm7, %xmm7
1707 ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7
1708 ; AVX1-NEXT: vmovdqa 16(%rbp), %xmm0
1709 ; AVX1-NEXT: vmovdqa 32(%rbp), %xmm1
1710 ; AVX1-NEXT: vpavgb 528(%rbp), %xmm0, %xmm0
1711 ; AVX1-NEXT: vpavgb 544(%rbp), %xmm1, %xmm1
1712 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8
1713 ; AVX1-NEXT: vmovdqa 48(%rbp), %xmm0
1714 ; AVX1-NEXT: vmovdqa 64(%rbp), %xmm1
1715 ; AVX1-NEXT: vpavgb 560(%rbp), %xmm0, %xmm0
1716 ; AVX1-NEXT: vpavgb 576(%rbp), %xmm1, %xmm1
1717 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9
1718 ; AVX1-NEXT: vmovdqa 80(%rbp), %xmm0
1719 ; AVX1-NEXT: vmovdqa 96(%rbp), %xmm1
1720 ; AVX1-NEXT: vpavgb 592(%rbp), %xmm0, %xmm0
1721 ; AVX1-NEXT: vpavgb 608(%rbp), %xmm1, %xmm1
1722 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm10
1723 ; AVX1-NEXT: vmovdqa 112(%rbp), %xmm0
1724 ; AVX1-NEXT: vmovdqa 128(%rbp), %xmm1
1725 ; AVX1-NEXT: vpavgb 624(%rbp), %xmm0, %xmm0
1726 ; AVX1-NEXT: vpavgb 640(%rbp), %xmm1, %xmm1
1727 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1728 ; AVX1-NEXT: vmovdqa 144(%rbp), %xmm1
1729 ; AVX1-NEXT: vmovdqa 160(%rbp), %xmm2
1730 ; AVX1-NEXT: vpavgb 656(%rbp), %xmm1, %xmm1
1731 ; AVX1-NEXT: vpavgb 672(%rbp), %xmm2, %xmm2
1732 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1733 ; AVX1-NEXT: vmovdqa 176(%rbp), %xmm2
1734 ; AVX1-NEXT: vmovdqa 192(%rbp), %xmm3
1735 ; AVX1-NEXT: vpavgb 688(%rbp), %xmm2, %xmm2
1736 ; AVX1-NEXT: vpavgb 704(%rbp), %xmm3, %xmm3
1737 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1738 ; AVX1-NEXT: vmovdqa 208(%rbp), %xmm3
1739 ; AVX1-NEXT: vmovdqa 224(%rbp), %xmm4
1740 ; AVX1-NEXT: vpavgb 720(%rbp), %xmm3, %xmm3
1741 ; AVX1-NEXT: vpavgb 736(%rbp), %xmm4, %xmm4
1742 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
1743 ; AVX1-NEXT: vmovdqa 240(%rbp), %xmm4
1744 ; AVX1-NEXT: vpavgb 752(%rbp), %xmm4, %xmm4
1745 ; AVX1-NEXT: vmovdqa 256(%rbp), %xmm11
1746 ; AVX1-NEXT: vpavgb 768(%rbp), %xmm11, %xmm5
1747 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
1748 ; AVX1-NEXT: vmovaps %ymm4, 480(%rdi)
1749 ; AVX1-NEXT: vmovaps %ymm3, 448(%rdi)
1750 ; AVX1-NEXT: vmovaps %ymm2, 416(%rdi)
1751 ; AVX1-NEXT: vmovaps %ymm1, 384(%rdi)
1752 ; AVX1-NEXT: vmovaps %ymm0, 352(%rdi)
1753 ; AVX1-NEXT: vmovaps %ymm10, 320(%rdi)
1754 ; AVX1-NEXT: vmovaps %ymm9, 288(%rdi)
1755 ; AVX1-NEXT: vmovaps %ymm8, 256(%rdi)
1756 ; AVX1-NEXT: vmovaps %ymm7, 224(%rdi)
1757 ; AVX1-NEXT: vmovaps %ymm6, 192(%rdi)
1758 ; AVX1-NEXT: vmovaps %ymm12, 160(%rdi)
1759 ; AVX1-NEXT: vmovaps %ymm15, 128(%rdi)
1760 ; AVX1-NEXT: vmovaps %ymm14, 96(%rdi)
1761 ; AVX1-NEXT: vmovaps %ymm13, 64(%rdi)
1762 ; AVX1-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload
1763 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdi)
1764 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1765 ; AVX1-NEXT: vmovaps %ymm0, (%rdi)
1766 ; AVX1-NEXT: movq %rbp, %rsp
1767 ; AVX1-NEXT: popq %rbp
1768 ; AVX1-NEXT: vzeroupper
1771 ; AVX2-LABEL: avg_v512i8_3:
1773 ; AVX2-NEXT: pushq %rbp
1774 ; AVX2-NEXT: movq %rsp, %rbp
1775 ; AVX2-NEXT: andq $-32, %rsp
1776 ; AVX2-NEXT: subq $32, %rsp
1777 ; AVX2-NEXT: movq %rdi, %rax
1778 ; AVX2-NEXT: vmovdqa 240(%rbp), %ymm8
1779 ; AVX2-NEXT: vmovdqa 208(%rbp), %ymm9
1780 ; AVX2-NEXT: vmovdqa 176(%rbp), %ymm10
1781 ; AVX2-NEXT: vmovdqa 144(%rbp), %ymm11
1782 ; AVX2-NEXT: vmovdqa 112(%rbp), %ymm12
1783 ; AVX2-NEXT: vmovdqa 80(%rbp), %ymm13
1784 ; AVX2-NEXT: vmovdqa 48(%rbp), %ymm14
1785 ; AVX2-NEXT: vmovdqa 16(%rbp), %ymm15
1786 ; AVX2-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0
1787 ; AVX2-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1
1788 ; AVX2-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2
1789 ; AVX2-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3
1790 ; AVX2-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4
1791 ; AVX2-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5
1792 ; AVX2-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6
1793 ; AVX2-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7
1794 ; AVX2-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15
1795 ; AVX2-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14
1796 ; AVX2-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13
1797 ; AVX2-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12
1798 ; AVX2-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11
1799 ; AVX2-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10
1800 ; AVX2-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9
1801 ; AVX2-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8
1802 ; AVX2-NEXT: vmovdqa %ymm8, 480(%rdi)
1803 ; AVX2-NEXT: vmovdqa %ymm9, 448(%rdi)
1804 ; AVX2-NEXT: vmovdqa %ymm10, 416(%rdi)
1805 ; AVX2-NEXT: vmovdqa %ymm11, 384(%rdi)
1806 ; AVX2-NEXT: vmovdqa %ymm12, 352(%rdi)
1807 ; AVX2-NEXT: vmovdqa %ymm13, 320(%rdi)
1808 ; AVX2-NEXT: vmovdqa %ymm14, 288(%rdi)
1809 ; AVX2-NEXT: vmovdqa %ymm15, 256(%rdi)
1810 ; AVX2-NEXT: vmovdqa %ymm7, 224(%rdi)
1811 ; AVX2-NEXT: vmovdqa %ymm6, 192(%rdi)
1812 ; AVX2-NEXT: vmovdqa %ymm5, 160(%rdi)
1813 ; AVX2-NEXT: vmovdqa %ymm4, 128(%rdi)
1814 ; AVX2-NEXT: vmovdqa %ymm3, 96(%rdi)
1815 ; AVX2-NEXT: vmovdqa %ymm2, 64(%rdi)
1816 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdi)
1817 ; AVX2-NEXT: vmovdqa %ymm0, (%rdi)
1818 ; AVX2-NEXT: movq %rbp, %rsp
1819 ; AVX2-NEXT: popq %rbp
1820 ; AVX2-NEXT: vzeroupper
1823 ; AVX512F-LABEL: avg_v512i8_3:
1825 ; AVX512F-NEXT: pushq %rbp
1826 ; AVX512F-NEXT: movq %rsp, %rbp
1827 ; AVX512F-NEXT: andq $-32, %rsp
1828 ; AVX512F-NEXT: subq $32, %rsp
1829 ; AVX512F-NEXT: movq %rdi, %rax
1830 ; AVX512F-NEXT: vmovdqa 240(%rbp), %ymm8
1831 ; AVX512F-NEXT: vmovdqa 208(%rbp), %ymm9
1832 ; AVX512F-NEXT: vmovdqa 176(%rbp), %ymm10
1833 ; AVX512F-NEXT: vmovdqa 144(%rbp), %ymm11
1834 ; AVX512F-NEXT: vmovdqa 112(%rbp), %ymm12
1835 ; AVX512F-NEXT: vmovdqa 80(%rbp), %ymm13
1836 ; AVX512F-NEXT: vmovdqa 48(%rbp), %ymm14
1837 ; AVX512F-NEXT: vmovdqa 16(%rbp), %ymm15
1838 ; AVX512F-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0
1839 ; AVX512F-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1
1840 ; AVX512F-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2
1841 ; AVX512F-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3
1842 ; AVX512F-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4
1843 ; AVX512F-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5
1844 ; AVX512F-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6
1845 ; AVX512F-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7
1846 ; AVX512F-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15
1847 ; AVX512F-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14
1848 ; AVX512F-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13
1849 ; AVX512F-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12
1850 ; AVX512F-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11
1851 ; AVX512F-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10
1852 ; AVX512F-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9
1853 ; AVX512F-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8
1854 ; AVX512F-NEXT: vmovdqa %ymm8, 480(%rdi)
1855 ; AVX512F-NEXT: vmovdqa %ymm9, 448(%rdi)
1856 ; AVX512F-NEXT: vmovdqa %ymm10, 416(%rdi)
1857 ; AVX512F-NEXT: vmovdqa %ymm11, 384(%rdi)
1858 ; AVX512F-NEXT: vmovdqa %ymm12, 352(%rdi)
1859 ; AVX512F-NEXT: vmovdqa %ymm13, 320(%rdi)
1860 ; AVX512F-NEXT: vmovdqa %ymm14, 288(%rdi)
1861 ; AVX512F-NEXT: vmovdqa %ymm15, 256(%rdi)
1862 ; AVX512F-NEXT: vmovdqa %ymm7, 224(%rdi)
1863 ; AVX512F-NEXT: vmovdqa %ymm6, 192(%rdi)
1864 ; AVX512F-NEXT: vmovdqa %ymm5, 160(%rdi)
1865 ; AVX512F-NEXT: vmovdqa %ymm4, 128(%rdi)
1866 ; AVX512F-NEXT: vmovdqa %ymm3, 96(%rdi)
1867 ; AVX512F-NEXT: vmovdqa %ymm2, 64(%rdi)
1868 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdi)
1869 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdi)
1870 ; AVX512F-NEXT: movq %rbp, %rsp
1871 ; AVX512F-NEXT: popq %rbp
1872 ; AVX512F-NEXT: vzeroupper
1873 ; AVX512F-NEXT: retq
1875 ; AVX512BW-LABEL: avg_v512i8_3:
1876 ; AVX512BW: # %bb.0:
1877 ; AVX512BW-NEXT: pushq %rbp
1878 ; AVX512BW-NEXT: movq %rsp, %rbp
1879 ; AVX512BW-NEXT: andq $-64, %rsp
1880 ; AVX512BW-NEXT: subq $64, %rsp
1881 ; AVX512BW-NEXT: movq %rdi, %rax
1882 ; AVX512BW-NEXT: vpavgb 16(%rbp), %zmm0, %zmm0
1883 ; AVX512BW-NEXT: vpavgb 80(%rbp), %zmm1, %zmm1
1884 ; AVX512BW-NEXT: vpavgb 144(%rbp), %zmm2, %zmm2
1885 ; AVX512BW-NEXT: vpavgb 208(%rbp), %zmm3, %zmm3
1886 ; AVX512BW-NEXT: vpavgb 272(%rbp), %zmm4, %zmm4
1887 ; AVX512BW-NEXT: vpavgb 336(%rbp), %zmm5, %zmm5
1888 ; AVX512BW-NEXT: vpavgb 400(%rbp), %zmm6, %zmm6
1889 ; AVX512BW-NEXT: vpavgb 464(%rbp), %zmm7, %zmm7
1890 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdi)
1891 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdi)
1892 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdi)
1893 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdi)
1894 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdi)
1895 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdi)
1896 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdi)
1897 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdi)
1898 ; AVX512BW-NEXT: movq %rbp, %rsp
1899 ; AVX512BW-NEXT: popq %rbp
1900 ; AVX512BW-NEXT: vzeroupper
1901 ; AVX512BW-NEXT: retq
1902 %za = zext <512 x i8> %a to <512 x i16>
1903 %zb = zext <512 x i8> %b to <512 x i16>
1904 %add = add nuw nsw <512 x i16> %za, %zb
1905 %add1 = add nuw nsw <512 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 
1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1906 %lshr = lshr <512 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, 
i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1907 %res = trunc <512 x i16> %lshr to <512 x i8>
1911 ; This is not an avg, but it's structurally similar and previously caused a crash
1912 ; because the constants can't be read with APInt::getZExtValue.
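; No PAVG is formed here, so the expansion below is fully scalar: each lane is
; computed with lea -1(a,b) followed by a shift, then the bytes are reassembled
; with punpck* (SSE2) or vpinsrb (AVX).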
1913 define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind {
1914 ; SSE2-LABEL: not_avg_v16i8_wide_constants:
1916 ; SSE2-NEXT: pushq %rbp
1917 ; SSE2-NEXT: pushq %r15
1918 ; SSE2-NEXT: pushq %r14
1919 ; SSE2-NEXT: pushq %r13
1920 ; SSE2-NEXT: pushq %r12
1921 ; SSE2-NEXT: pushq %rbx
1922 ; SSE2-NEXT: movaps (%rdi), %xmm0
1923 ; SSE2-NEXT: movaps (%rsi), %xmm1
1924 ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1925 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1926 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1927 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
1928 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1929 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1930 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1931 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1932 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
1933 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
1934 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
1935 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
1936 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
1937 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
1938 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
1939 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1940 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
1941 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
1942 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1943 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
1944 ; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
1945 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
1946 ; SSE2-NEXT: leal -1(%rdx,%rsi), %edx
1947 ; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1948 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
1949 ; SSE2-NEXT: leal -1(%rbx,%rdx), %edx
1950 ; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1951 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
1952 ; SSE2-NEXT: leal -1(%rbp,%rdx), %edx
1953 ; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1954 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
1955 ; SSE2-NEXT: leal -1(%rdi,%rdx), %r8d
1956 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
1957 ; SSE2-NEXT: leal -1(%rax,%rdx), %edi
1958 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1959 ; SSE2-NEXT: leal -1(%rcx,%rax), %edx
1960 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1961 ; SSE2-NEXT: leal -1(%r9,%rax), %ecx
1962 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
1963 ; SSE2-NEXT: leal -1(%r10,%rsi), %eax
1964 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
1965 ; SSE2-NEXT: leaq -1(%r11,%rsi), %rsi
1966 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1967 ; SSE2-NEXT: leaq -1(%r12,%rbx), %r12
1968 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1969 ; SSE2-NEXT: leaq -1(%r15,%rbx), %r15
1970 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1971 ; SSE2-NEXT: leaq -1(%r14,%rbx), %r14
1972 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1973 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
1974 ; SSE2-NEXT: leaq -1(%rbp,%rbx), %r11
1975 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1976 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
1977 ; SSE2-NEXT: leaq -1(%rbp,%rbx), %r10
1978 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1979 ; SSE2-NEXT: leaq -1(%r13,%rbx), %r9
1980 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1981 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
1982 ; SSE2-NEXT: leaq -1(%r13,%rbx), %rbx
1983 ; SSE2-NEXT: shrl %eax
1984 ; SSE2-NEXT: movd %eax, %xmm8
1985 ; SSE2-NEXT: shrl %ecx
1986 ; SSE2-NEXT: movd %ecx, %xmm15
1987 ; SSE2-NEXT: shrl %edx
1988 ; SSE2-NEXT: movd %edx, %xmm9
1989 ; SSE2-NEXT: shrl %edi
1990 ; SSE2-NEXT: movd %edi, %xmm2
1991 ; SSE2-NEXT: shrl %r8d
1992 ; SSE2-NEXT: movd %r8d, %xmm10
1993 ; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1994 ; SSE2-NEXT: shrl %eax
1995 ; SSE2-NEXT: movd %eax, %xmm6
1996 ; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1997 ; SSE2-NEXT: shrl %eax
1998 ; SSE2-NEXT: movd %eax, %xmm11
1999 ; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2000 ; SSE2-NEXT: shrl %eax
2001 ; SSE2-NEXT: movd %eax, %xmm4
2002 ; SSE2-NEXT: shrq %rsi
2003 ; SSE2-NEXT: movd %esi, %xmm12
2004 ; SSE2-NEXT: shrq %r12
2005 ; SSE2-NEXT: movd %r12d, %xmm3
2006 ; SSE2-NEXT: shrq %r15
2007 ; SSE2-NEXT: movd %r15d, %xmm13
2008 ; SSE2-NEXT: shrq %r14
2009 ; SSE2-NEXT: movd %r14d, %xmm7
2010 ; SSE2-NEXT: shrq %r11
2011 ; SSE2-NEXT: movd %r11d, %xmm14
2012 ; SSE2-NEXT: shrq %r10
2013 ; SSE2-NEXT: movd %r10d, %xmm5
2014 ; SSE2-NEXT: shrq %r9
2015 ; SSE2-NEXT: movd %r9d, %xmm0
2016 ; SSE2-NEXT: shrq %rbx
2017 ; SSE2-NEXT: movd %ebx, %xmm1
2018 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
2019 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
2020 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
2021 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
2022 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
2023 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
2024 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
2025 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
2026 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
2027 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
2028 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
2029 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2030 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
2031 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
2032 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
2033 ; SSE2-NEXT: movdqu %xmm4, (%rax)
2034 ; SSE2-NEXT: popq %rbx
2035 ; SSE2-NEXT: popq %r12
2036 ; SSE2-NEXT: popq %r13
2037 ; SSE2-NEXT: popq %r14
2038 ; SSE2-NEXT: popq %r15
2039 ; SSE2-NEXT: popq %rbp
2042 ; AVX1-LABEL: not_avg_v16i8_wide_constants:
2044 ; AVX1-NEXT: pushq %rbp
2045 ; AVX1-NEXT: pushq %r15
2046 ; AVX1-NEXT: pushq %r14
2047 ; AVX1-NEXT: pushq %r13
2048 ; AVX1-NEXT: pushq %r12
2049 ; AVX1-NEXT: pushq %rbx
2050 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2051 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2052 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2053 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2054 ; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
2055 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
2056 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
2057 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
2058 ; AVX1-NEXT: vpextrq $1, %xmm5, %r15
2059 ; AVX1-NEXT: vmovq %xmm5, %r12
2060 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
2061 ; AVX1-NEXT: vpextrq $1, %xmm2, %r11
2062 ; AVX1-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2063 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2064 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm7[2],xmm0[3],xmm7[3]
2065 ; AVX1-NEXT: vpextrq $1, %xmm2, %r13
2066 ; AVX1-NEXT: vmovq %xmm2, %r14
2067 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2068 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
2069 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
2070 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
2071 ; AVX1-NEXT: vpextrq $1, %xmm5, %rbx
2072 ; AVX1-NEXT: vmovq %xmm5, %rdx
2073 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
2074 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
2075 ; AVX1-NEXT: vpextrq $1, %xmm1, %r9
2076 ; AVX1-NEXT: vmovq %xmm1, %r10
2077 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
2078 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
2079 ; AVX1-NEXT: vmovd %xmm6, %esi
2080 ; AVX1-NEXT: vpextrd $1, %xmm6, %edi
2081 ; AVX1-NEXT: vpextrd $2, %xmm6, %eax
2082 ; AVX1-NEXT: vpextrd $3, %xmm6, %ebp
2083 ; AVX1-NEXT: vpextrd $3, %xmm5, %ecx
2084 ; AVX1-NEXT: leal -1(%rbp,%rcx), %ecx
2085 ; AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2086 ; AVX1-NEXT: vpextrd $2, %xmm5, %ecx
2087 ; AVX1-NEXT: leal -1(%rax,%rcx), %eax
2088 ; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2089 ; AVX1-NEXT: vpextrd $1, %xmm5, %ecx
2090 ; AVX1-NEXT: leal -1(%rdi,%rcx), %eax
2091 ; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2092 ; AVX1-NEXT: vmovd %xmm5, %ecx
2093 ; AVX1-NEXT: leal -1(%rsi,%rcx), %r8d
2094 ; AVX1-NEXT: vpextrq $1, %xmm4, %rcx
2095 ; AVX1-NEXT: leal -1(%r15,%rbx), %r15d
2096 ; AVX1-NEXT: vmovq %xmm4, %rsi
2097 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
2098 ; AVX1-NEXT: leal -1(%r12,%rdx), %edx
2099 ; AVX1-NEXT: vmovd %xmm2, %r12d
2100 ; AVX1-NEXT: leal -1(%r11,%r9), %r11d
2101 ; AVX1-NEXT: vpextrd $1, %xmm2, %edi
2102 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2103 ; AVX1-NEXT: leal -1(%rax,%r10), %r10d
2104 ; AVX1-NEXT: vpextrd $2, %xmm2, %ebx
2105 ; AVX1-NEXT: leal -1(%r13,%rcx), %r9d
2106 ; AVX1-NEXT: vpextrd $3, %xmm2, %ecx
2107 ; AVX1-NEXT: leal -1(%r14,%rsi), %esi
2108 ; AVX1-NEXT: vpextrd $3, %xmm3, %eax
2109 ; AVX1-NEXT: leal -1(%rcx,%rax), %ecx
2110 ; AVX1-NEXT: vpextrd $2, %xmm3, %eax
2111 ; AVX1-NEXT: leal -1(%rbx,%rax), %ebx
2112 ; AVX1-NEXT: vpextrd $1, %xmm3, %eax
2113 ; AVX1-NEXT: leal -1(%rdi,%rax), %eax
2114 ; AVX1-NEXT: vmovd %xmm3, %edi
2115 ; AVX1-NEXT: leal -1(%r12,%rdi), %edi
2116 ; AVX1-NEXT: vpextrq $1, %xmm0, %r12
2117 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
2118 ; AVX1-NEXT: vpextrq $1, %xmm1, %r13
2119 ; AVX1-NEXT: leal -1(%r12,%r13), %r12d
2120 ; AVX1-NEXT: vmovq %xmm0, %r13
2121 ; AVX1-NEXT: vmovq %xmm1, %r14
2122 ; AVX1-NEXT: leal -1(%r13,%r14), %ebp
2123 ; AVX1-NEXT: shrl %ebp
2124 ; AVX1-NEXT: vmovd %ebp, %xmm0
2125 ; AVX1-NEXT: shrl %r12d
2126 ; AVX1-NEXT: vpinsrb $1, %r12d, %xmm0, %xmm0
2127 ; AVX1-NEXT: shrl %esi
2128 ; AVX1-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0
2129 ; AVX1-NEXT: shrl %r9d
2130 ; AVX1-NEXT: vpinsrb $3, %r9d, %xmm0, %xmm0
2131 ; AVX1-NEXT: shrl %r10d
2132 ; AVX1-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
2133 ; AVX1-NEXT: shrl %r11d
2134 ; AVX1-NEXT: vpinsrb $5, %r11d, %xmm0, %xmm0
2135 ; AVX1-NEXT: shrl %edx
2136 ; AVX1-NEXT: vpinsrb $6, %edx, %xmm0, %xmm0
2137 ; AVX1-NEXT: shrl %r15d
2138 ; AVX1-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0
2139 ; AVX1-NEXT: shrl %edi
2140 ; AVX1-NEXT: vpinsrb $8, %edi, %xmm0, %xmm0
2141 ; AVX1-NEXT: shrl %eax
2142 ; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
2143 ; AVX1-NEXT: shrl %ebx
2144 ; AVX1-NEXT: vpinsrb $10, %ebx, %xmm0, %xmm0
2145 ; AVX1-NEXT: shrl %ecx
2146 ; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
2147 ; AVX1-NEXT: shrl %r8d
2148 ; AVX1-NEXT: vpinsrb $12, %r8d, %xmm0, %xmm0
2149 ; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2150 ; AVX1-NEXT: shrl %eax
2151 ; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
2152 ; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2153 ; AVX1-NEXT: shrl %eax
2154 ; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
2155 ; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2156 ; AVX1-NEXT: shrl %eax
2157 ; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
2158 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
2159 ; AVX1-NEXT: popq %rbx
2160 ; AVX1-NEXT: popq %r12
2161 ; AVX1-NEXT: popq %r13
2162 ; AVX1-NEXT: popq %r14
2163 ; AVX1-NEXT: popq %r15
2164 ; AVX1-NEXT: popq %rbp
2167 ; AVX2-LABEL: not_avg_v16i8_wide_constants:
2169 ; AVX2-NEXT: pushq %rbp
2170 ; AVX2-NEXT: pushq %r15
2171 ; AVX2-NEXT: pushq %r14
2172 ; AVX2-NEXT: pushq %r13
2173 ; AVX2-NEXT: pushq %r12
2174 ; AVX2-NEXT: pushq %rbx
2175 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
2176 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
2177 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2178 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
2179 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2180 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm0
2181 ; AVX2-NEXT: vpextrq $1, %xmm4, %r14
2182 ; AVX2-NEXT: vmovq %xmm4, %r13
2183 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2184 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
2185 ; AVX2-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2186 ; AVX2-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2187 ; AVX2-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2188 ; AVX2-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2189 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
2190 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2191 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2192 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2193 ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm5
2194 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2195 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7
2196 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2197 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
2198 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2199 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2200 ; AVX2-NEXT: vmovd %xmm4, %r12d
2201 ; AVX2-NEXT: vpextrd $2, %xmm4, %r15d
2202 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
2203 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
2204 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2205 ; AVX2-NEXT: vmovd %xmm7, %ecx
2206 ; AVX2-NEXT: vpextrd $2, %xmm7, %edi
2207 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm7
2208 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
2209 ; AVX2-NEXT: vmovd %xmm6, %ebx
2210 ; AVX2-NEXT: vpextrd $2, %xmm6, %esi
2211 ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm6
2212 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
2213 ; AVX2-NEXT: vmovd %xmm5, %edx
2214 ; AVX2-NEXT: vpextrd $2, %xmm5, %ebp
2215 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
2216 ; AVX2-NEXT: vpextrd $2, %xmm6, %eax
2217 ; AVX2-NEXT: leal -1(%rbp,%rax), %eax
2218 ; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2219 ; AVX2-NEXT: vmovd %xmm6, %eax
2220 ; AVX2-NEXT: leal -1(%rdx,%rax), %eax
2221 ; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2222 ; AVX2-NEXT: vpextrd $2, %xmm7, %eax
2223 ; AVX2-NEXT: leal -1(%rsi,%rax), %r11d
2224 ; AVX2-NEXT: vmovd %xmm7, %eax
2225 ; AVX2-NEXT: leal -1(%rbx,%rax), %r10d
2226 ; AVX2-NEXT: vpextrd $2, %xmm5, %eax
2227 ; AVX2-NEXT: leal -1(%rdi,%rax), %r9d
2228 ; AVX2-NEXT: vmovd %xmm5, %eax
2229 ; AVX2-NEXT: leal -1(%rcx,%rax), %r8d
2230 ; AVX2-NEXT: vpextrd $2, %xmm3, %eax
2231 ; AVX2-NEXT: leal -1(%r15,%rax), %r15d
2232 ; AVX2-NEXT: vmovd %xmm3, %ecx
2233 ; AVX2-NEXT: leal -1(%r12,%rcx), %r12d
2234 ; AVX2-NEXT: vpextrq $1, %xmm2, %rdx
2235 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2236 ; AVX2-NEXT: leaq -1(%rax,%rdx), %rdx
2237 ; AVX2-NEXT: vmovq %xmm2, %rsi
2238 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2239 ; AVX2-NEXT: leaq -1(%rax,%rsi), %rsi
2240 ; AVX2-NEXT: vmovq %xmm4, %rbx
2241 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2242 ; AVX2-NEXT: leaq -1(%rax,%rbx), %rbx
2243 ; AVX2-NEXT: vpextrq $1, %xmm4, %rbp
2244 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2245 ; AVX2-NEXT: leaq -1(%rax,%rbp), %rbp
2246 ; AVX2-NEXT: vmovq %xmm1, %rdi
2247 ; AVX2-NEXT: leaq -1(%r13,%rdi), %rdi
2248 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
2249 ; AVX2-NEXT: leaq -1(%r14,%rax), %rax
2250 ; AVX2-NEXT: vmovq %xmm0, %rcx
2251 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
2252 ; AVX2-NEXT: vmovq %xmm1, %r13
2253 ; AVX2-NEXT: leaq -1(%rcx,%r13), %r13
2254 ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
2255 ; AVX2-NEXT: vpextrq $1, %xmm1, %r14
2256 ; AVX2-NEXT: leaq -1(%rcx,%r14), %rcx
2257 ; AVX2-NEXT: shrq %rsi
2258 ; AVX2-NEXT: vmovd %esi, %xmm0
2259 ; AVX2-NEXT: shrq %rdx
2260 ; AVX2-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
2261 ; AVX2-NEXT: shrq %rbx
2262 ; AVX2-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
2263 ; AVX2-NEXT: shrq %rbp
2264 ; AVX2-NEXT: vpinsrb $3, %ebp, %xmm0, %xmm0
2265 ; AVX2-NEXT: shrq %rdi
2266 ; AVX2-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
2267 ; AVX2-NEXT: shrq %rax
2268 ; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
2269 ; AVX2-NEXT: shrq %r13
2270 ; AVX2-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
2271 ; AVX2-NEXT: shrq %rcx
2272 ; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
2273 ; AVX2-NEXT: shrl %r12d
2274 ; AVX2-NEXT: vpinsrb $8, %r12d, %xmm0, %xmm0
2275 ; AVX2-NEXT: shrl %r15d
2276 ; AVX2-NEXT: vpinsrb $9, %r15d, %xmm0, %xmm0
2277 ; AVX2-NEXT: shrl %r8d
2278 ; AVX2-NEXT: vpinsrb $10, %r8d, %xmm0, %xmm0
2279 ; AVX2-NEXT: shrl %r9d
2280 ; AVX2-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0
2281 ; AVX2-NEXT: shrl %r10d
2282 ; AVX2-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
2283 ; AVX2-NEXT: shrl %r11d
2284 ; AVX2-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0
2285 ; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2286 ; AVX2-NEXT: shrl %eax
2287 ; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
2288 ; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2289 ; AVX2-NEXT: shrl %eax
2290 ; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
2291 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
2292 ; AVX2-NEXT: popq %rbx
2293 ; AVX2-NEXT: popq %r12
2294 ; AVX2-NEXT: popq %r13
2295 ; AVX2-NEXT: popq %r14
2296 ; AVX2-NEXT: popq %r15
2297 ; AVX2-NEXT: popq %rbp
2298 ; AVX2-NEXT: vzeroupper
; AVX512-LABEL: not_avg_v16i8_wide_constants:
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: pushq %r15
; AVX512-NEXT: pushq %r14
; AVX512-NEXT: pushq %r13
; AVX512-NEXT: pushq %r12
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm0
; AVX512-NEXT: vpextrq $1, %xmm4, %r14
; AVX512-NEXT: vmovq %xmm4, %r13
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX512-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm5
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm7
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm1
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-NEXT: vmovd %xmm4, %r12d
; AVX512-NEXT: vpextrd $2, %xmm4, %r15d
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX512-NEXT: vmovd %xmm7, %ecx
; AVX512-NEXT: vpextrd $2, %xmm7, %edi
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
; AVX512-NEXT: vmovd %xmm6, %ebx
; AVX512-NEXT: vpextrd $2, %xmm6, %esi
; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm6
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; AVX512-NEXT: vmovd %xmm5, %edx
; AVX512-NEXT: vpextrd $2, %xmm5, %ebp
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5
; AVX512-NEXT: vpextrd $2, %xmm6, %eax
; AVX512-NEXT: leal -1(%rbp,%rax), %eax
; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512-NEXT: vmovd %xmm6, %eax
; AVX512-NEXT: leal -1(%rdx,%rax), %eax
; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512-NEXT: vpextrd $2, %xmm7, %eax
; AVX512-NEXT: leal -1(%rsi,%rax), %r11d
; AVX512-NEXT: vmovd %xmm7, %eax
; AVX512-NEXT: leal -1(%rbx,%rax), %r10d
; AVX512-NEXT: vpextrd $2, %xmm5, %eax
; AVX512-NEXT: leal -1(%rdi,%rax), %r9d
; AVX512-NEXT: vmovd %xmm5, %eax
; AVX512-NEXT: leal -1(%rcx,%rax), %r8d
; AVX512-NEXT: vpextrd $2, %xmm3, %eax
; AVX512-NEXT: leal -1(%r15,%rax), %r15d
; AVX512-NEXT: vmovd %xmm3, %ecx
; AVX512-NEXT: leal -1(%r12,%rcx), %r12d
; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: leaq -1(%rax,%rdx), %rdx
; AVX512-NEXT: vmovq %xmm2, %rsi
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: leaq -1(%rax,%rsi), %rsi
; AVX512-NEXT: vmovq %xmm4, %rbx
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: leaq -1(%rax,%rbx), %rbx
; AVX512-NEXT: vpextrq $1, %xmm4, %rbp
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: leaq -1(%rax,%rbp), %rbp
; AVX512-NEXT: vmovq %xmm1, %rdi
; AVX512-NEXT: leaq -1(%r13,%rdi), %rdi
; AVX512-NEXT: vpextrq $1, %xmm1, %rax
; AVX512-NEXT: leaq -1(%r14,%rax), %rax
; AVX512-NEXT: vmovq %xmm0, %rcx
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512-NEXT: vmovq %xmm1, %r13
; AVX512-NEXT: leaq -1(%rcx,%r13), %r13
; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
; AVX512-NEXT: vpextrq $1, %xmm1, %r14
; AVX512-NEXT: leaq -1(%rcx,%r14), %rcx
; AVX512-NEXT: shrq %rsi
; AVX512-NEXT: vmovd %esi, %xmm0
; AVX512-NEXT: shrq %rdx
; AVX512-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
; AVX512-NEXT: shrq %rbx
; AVX512-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
; AVX512-NEXT: shrq %rbp
; AVX512-NEXT: vpinsrb $3, %ebp, %xmm0, %xmm0
; AVX512-NEXT: shrq %rdi
; AVX512-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
; AVX512-NEXT: shrq %rax
; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX512-NEXT: shrq %r13
; AVX512-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
; AVX512-NEXT: shrq %rcx
; AVX512-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
; AVX512-NEXT: shrl %r12d
; AVX512-NEXT: vpinsrb $8, %r12d, %xmm0, %xmm0
; AVX512-NEXT: shrl %r15d
; AVX512-NEXT: vpinsrb $9, %r15d, %xmm0, %xmm0
; AVX512-NEXT: shrl %r8d
; AVX512-NEXT: vpinsrb $10, %r8d, %xmm0, %xmm0
; AVX512-NEXT: shrl %r9d
; AVX512-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0
; AVX512-NEXT: shrl %r10d
; AVX512-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
; AVX512-NEXT: shrl %r11d
; AVX512-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0
; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; AVX512-NEXT: shrl %eax
; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; AVX512-NEXT: shrl %eax
; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512-NEXT: vmovdqu %xmm0, (%rax)
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
; AVX512-NEXT: popq %r13
; AVX512-NEXT: popq %r14
; AVX512-NEXT: popq %r15
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
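; NOTE (editorial, inferred from the checks above and the function name): no (v)pavgb is
; expected here. The IR below adds -1 rather than +1 before the shift, so it is not the
; (a + b + 1) >> 1 rounding-average idiom that the pavgb lowering matches, and each lane
; is instead computed with scalar lea/shr code as asserted above.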
%1 = load <16 x i8>, <16 x i8>* %a
%2 = load <16 x i8>, <16 x i8>* %b
%3 = zext <16 x i8> %1 to <16 x i128>
%4 = zext <16 x i8> %2 to <16 x i128>
%5 = add nuw nsw <16 x i128> %3, <i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1>
%6 = add nuw nsw <16 x i128> %5, %4
%7 = lshr <16 x i128> %6, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
%8 = trunc <16 x i128> %7 to <16 x i8>
store <16 x i8> %8, <16 x i8>* undef, align 4
ret void
}

; Make sure we don't fail on single element vectors.
define <1 x i8> @avg_v1i8(<1 x i8> %x, <1 x i8> %y) {
; SSE2-LABEL: avg_v1i8:
; SSE2-NEXT: movzbl %dil, %eax
; SSE2-NEXT: movzbl %sil, %ecx
; SSE2-NEXT: leal 1(%rax,%rcx), %eax
; SSE2-NEXT: shrl %eax
; SSE2-NEXT: # kill: def $al killed $al killed $eax
; SSE2-NEXT: retq
; AVX-LABEL: avg_v1i8:
; AVX-NEXT: movzbl %dil, %eax
; AVX-NEXT: movzbl %sil, %ecx
; AVX-NEXT: leal 1(%rax,%rcx), %eax
; AVX-NEXT: shrl %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
; AVX-NEXT: retq
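; NOTE (editorial, inferred from the checks above): a <1 x i8> is passed and returned in
; GPRs rather than an XMM register, so both prefixes compute the average with scalar code:
; zero-extend both inputs, leal 1(%rax,%rcx) for x + y + 1, then shrl for the divide by 2.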
%a = zext <1 x i8> %x to <1 x i16>
%b = zext <1 x i8> %y to <1 x i16>
%c = add <1 x i16> %a, %b
%d = add <1 x i16> %c, <i16 1>
%e = lshr <1 x i16> %d, <i16 1>
%f = trunc <1 x i16> %e to <1 x i8>