; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
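;
; These tests exercise lowering of the vector "average with rounding" idiom
; (zext the i8/i16 elements to i32, add the operands, add a +1 rounding bias,
; lshr by 1, then trunc back), which is generally matched to the X86
; PAVGB/PAVGW instructions and their VEX/EVEX forms.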
define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = load <4 x i8>, <4 x i8>* %b
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, <4 x i8>* undef, align 4
  ret void
}
define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = load <8 x i8>, <8 x i8>* %b
  %3 = zext <8 x i8> %1 to <8 x i32>
  %4 = zext <8 x i8> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, <8 x i8>* undef, align 4
  ret void
}
define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: pavgb (%rdi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rsi), %xmm0
; AVX-NEXT: vpavgb (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
; AVX-NEXT: retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = load <16 x i8>, <16 x i8>* %b
  %3 = zext <16 x i8> %1 to <16 x i32>
  %4 = zext <16 x i8> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <16 x i32> %5, %4
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, <16 x i8>* undef, align 4
  ret void
}
define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: pavgb (%rdi), %xmm0
; SSE2-NEXT: pavgb 16(%rdi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rsi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rsi), %ymm0
; AVX512-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = load <32 x i8>, <32 x i8>* %b
  %3 = zext <32 x i8> %1 to <32 x i32>
  %4 = zext <32 x i8> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <32 x i32> %5, %4
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, <32 x i8>* undef, align 4
  ret void
}
138 define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
139 ; SSE2-LABEL: avg_v48i8:
141 ; SSE2-NEXT: movdqa (%rdi), %xmm1
142 ; SSE2-NEXT: movdqa 16(%rdi), %xmm6
143 ; SSE2-NEXT: movdqa 32(%rdi), %xmm11
144 ; SSE2-NEXT: movdqa (%rsi), %xmm12
145 ; SSE2-NEXT: movdqa 16(%rsi), %xmm13
146 ; SSE2-NEXT: movdqa 32(%rsi), %xmm0
147 ; SSE2-NEXT: pxor %xmm7, %xmm7
148 ; SSE2-NEXT: movdqa %xmm1, %xmm4
149 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
150 ; SSE2-NEXT: movdqa %xmm4, %xmm2
151 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
152 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
153 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
154 ; SSE2-NEXT: movdqa %xmm1, %xmm10
155 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
156 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
157 ; SSE2-NEXT: movdqa %xmm6, %xmm5
158 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
159 ; SSE2-NEXT: movdqa %xmm5, %xmm15
160 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
161 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
162 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
163 ; SSE2-NEXT: movdqa %xmm6, %xmm14
164 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
165 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
166 ; SSE2-NEXT: movdqa %xmm12, %xmm3
167 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15]
168 ; SSE2-NEXT: movdqa %xmm3, %xmm8
169 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
170 ; SSE2-NEXT: paddd %xmm2, %xmm8
171 ; SSE2-NEXT: movdqa %xmm11, %xmm2
172 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
173 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
174 ; SSE2-NEXT: paddd %xmm4, %xmm3
175 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
176 ; SSE2-NEXT: movdqa %xmm12, %xmm9
177 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
178 ; SSE2-NEXT: paddd %xmm10, %xmm9
179 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
180 ; SSE2-NEXT: paddd %xmm1, %xmm12
181 ; SSE2-NEXT: movdqa %xmm13, %xmm4
182 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
183 ; SSE2-NEXT: movdqa %xmm4, %xmm10
184 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
185 ; SSE2-NEXT: paddd %xmm15, %xmm10
186 ; SSE2-NEXT: movdqa %xmm2, %xmm15
187 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
188 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
189 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
190 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
191 ; SSE2-NEXT: paddd %xmm5, %xmm4
192 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
193 ; SSE2-NEXT: movdqa %xmm13, %xmm1
194 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
195 ; SSE2-NEXT: paddd %xmm14, %xmm1
196 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3]
197 ; SSE2-NEXT: paddd %xmm6, %xmm13
198 ; SSE2-NEXT: movdqa %xmm0, %xmm6
199 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
200 ; SSE2-NEXT: movdqa %xmm6, %xmm14
201 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
202 ; SSE2-NEXT: paddd %xmm15, %xmm14
203 ; SSE2-NEXT: movdqa %xmm11, %xmm5
204 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
205 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
206 ; SSE2-NEXT: paddd %xmm2, %xmm6
207 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
208 ; SSE2-NEXT: movdqa %xmm0, %xmm2
209 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
210 ; SSE2-NEXT: paddd %xmm5, %xmm2
211 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
212 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
213 ; SSE2-NEXT: paddd %xmm11, %xmm0
214 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
215 ; SSE2-NEXT: psubd %xmm5, %xmm8
216 ; SSE2-NEXT: psubd %xmm5, %xmm3
217 ; SSE2-NEXT: psubd %xmm5, %xmm9
218 ; SSE2-NEXT: psubd %xmm5, %xmm12
219 ; SSE2-NEXT: psubd %xmm5, %xmm10
220 ; SSE2-NEXT: psubd %xmm5, %xmm4
221 ; SSE2-NEXT: psubd %xmm5, %xmm1
222 ; SSE2-NEXT: psubd %xmm5, %xmm13
223 ; SSE2-NEXT: psubd %xmm5, %xmm14
224 ; SSE2-NEXT: psubd %xmm5, %xmm6
225 ; SSE2-NEXT: psubd %xmm5, %xmm2
226 ; SSE2-NEXT: psubd %xmm5, %xmm0
227 ; SSE2-NEXT: psrld $1, %xmm3
228 ; SSE2-NEXT: psrld $1, %xmm8
229 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
230 ; SSE2-NEXT: pand %xmm7, %xmm8
231 ; SSE2-NEXT: pand %xmm7, %xmm3
232 ; SSE2-NEXT: packuswb %xmm8, %xmm3
233 ; SSE2-NEXT: psrld $1, %xmm12
234 ; SSE2-NEXT: psrld $1, %xmm9
235 ; SSE2-NEXT: pand %xmm7, %xmm9
236 ; SSE2-NEXT: pand %xmm7, %xmm12
237 ; SSE2-NEXT: packuswb %xmm9, %xmm12
238 ; SSE2-NEXT: packuswb %xmm3, %xmm12
239 ; SSE2-NEXT: psrld $1, %xmm4
240 ; SSE2-NEXT: psrld $1, %xmm10
241 ; SSE2-NEXT: pand %xmm7, %xmm10
242 ; SSE2-NEXT: pand %xmm7, %xmm4
243 ; SSE2-NEXT: packuswb %xmm10, %xmm4
244 ; SSE2-NEXT: psrld $1, %xmm13
245 ; SSE2-NEXT: psrld $1, %xmm1
246 ; SSE2-NEXT: pand %xmm7, %xmm1
247 ; SSE2-NEXT: pand %xmm7, %xmm13
248 ; SSE2-NEXT: packuswb %xmm1, %xmm13
249 ; SSE2-NEXT: packuswb %xmm4, %xmm13
250 ; SSE2-NEXT: psrld $1, %xmm6
251 ; SSE2-NEXT: psrld $1, %xmm14
252 ; SSE2-NEXT: pand %xmm7, %xmm14
253 ; SSE2-NEXT: pand %xmm7, %xmm6
254 ; SSE2-NEXT: packuswb %xmm14, %xmm6
255 ; SSE2-NEXT: psrld $1, %xmm0
256 ; SSE2-NEXT: psrld $1, %xmm2
257 ; SSE2-NEXT: pand %xmm7, %xmm2
258 ; SSE2-NEXT: pand %xmm7, %xmm0
259 ; SSE2-NEXT: packuswb %xmm2, %xmm0
260 ; SSE2-NEXT: packuswb %xmm6, %xmm0
261 ; SSE2-NEXT: movdqu %xmm0, (%rax)
262 ; SSE2-NEXT: movdqu %xmm13, (%rax)
263 ; SSE2-NEXT: movdqu %xmm12, (%rax)
266 ; AVX1-LABEL: avg_v48i8:
268 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
269 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4
270 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1
271 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
272 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
273 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1]
274 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
275 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
276 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
277 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
278 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
279 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
280 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1]
281 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
282 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
283 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
284 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
285 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
286 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
287 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
288 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,0,1]
289 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
290 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
291 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
292 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
293 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
294 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
295 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
296 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
297 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4
298 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3
299 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
300 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
301 ; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm12
302 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,0,1]
303 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
304 ; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm10
305 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3]
306 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
307 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm9
308 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
309 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm8
310 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
311 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
312 ; AVX1-NEXT: vpaddd %xmm3, %xmm15, %xmm15
313 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,0,1]
314 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
315 ; AVX1-NEXT: vpaddd %xmm7, %xmm11, %xmm7
316 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
317 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
318 ; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm14
319 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
320 ; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm13
321 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
322 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
323 ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
324 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,0,1]
325 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
326 ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
327 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,3]
328 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
329 ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
330 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
331 ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
332 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
333 ; AVX1-NEXT: vpsubd %xmm3, %xmm12, %xmm11
334 ; AVX1-NEXT: vpsubd %xmm3, %xmm10, %xmm10
335 ; AVX1-NEXT: vpsubd %xmm3, %xmm9, %xmm9
336 ; AVX1-NEXT: vpsubd %xmm3, %xmm8, %xmm8
337 ; AVX1-NEXT: vpsubd %xmm3, %xmm15, %xmm12
338 ; AVX1-NEXT: vpsubd %xmm3, %xmm7, %xmm7
339 ; AVX1-NEXT: vpsubd %xmm3, %xmm14, %xmm0
340 ; AVX1-NEXT: vpsubd %xmm3, %xmm13, %xmm2
341 ; AVX1-NEXT: vpsubd %xmm3, %xmm5, %xmm5
342 ; AVX1-NEXT: vpsubd %xmm3, %xmm6, %xmm6
343 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
344 ; AVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3
345 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
346 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
347 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
348 ; AVX1-NEXT: vpsrld $1, %xmm6, %xmm3
349 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm4
350 ; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
351 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
352 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
353 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
354 ; AVX1-NEXT: vpsrld $1, %xmm7, %xmm2
355 ; AVX1-NEXT: vpsrld $1, %xmm12, %xmm4
356 ; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
357 ; AVX1-NEXT: vpsrld $1, %xmm8, %xmm4
358 ; AVX1-NEXT: vpsrld $1, %xmm9, %xmm5
359 ; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
360 ; AVX1-NEXT: vpsrld $1, %xmm10, %xmm5
361 ; AVX1-NEXT: vpsrld $1, %xmm11, %xmm6
362 ; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
363 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
364 ; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
365 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
366 ; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
367 ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
368 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
369 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
370 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm2
371 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
372 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
373 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
374 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
375 ; AVX1-NEXT: vmovdqu %xmm4, (%rax)
378 ; AVX2-LABEL: avg_v48i8:
380 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
381 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
382 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
383 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
384 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
385 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
386 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
387 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
388 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
389 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
390 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
391 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
392 ; AVX2-NEXT: vmovdqa (%rsi), %xmm6
393 ; AVX2-NEXT: vmovdqa 16(%rsi), %xmm7
394 ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm2
395 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]
396 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
397 ; AVX2-NEXT: vpaddd %ymm5, %ymm3, %ymm3
398 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
399 ; AVX2-NEXT: vpaddd %ymm5, %ymm0, %ymm0
400 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,0,1]
401 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
402 ; AVX2-NEXT: vpaddd %ymm5, %ymm4, %ymm4
403 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero
404 ; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
405 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
406 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
407 ; AVX2-NEXT: vpaddd %ymm5, %ymm9, %ymm5
408 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
409 ; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2
410 ; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
411 ; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3
412 ; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0
413 ; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4
414 ; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1
415 ; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5
416 ; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2
417 ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
418 ; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
419 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
420 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
421 ; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
422 ; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
423 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3]
424 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
425 ; AVX2-NEXT: vpackusdw %ymm6, %ymm0, %ymm0
426 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
427 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
428 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],ymm4[2,3]
429 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
430 ; AVX2-NEXT: vpackusdw %ymm6, %ymm1, %ymm1
431 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
432 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4
433 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
434 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
435 ; AVX2-NEXT: vpackuswb %ymm0, %ymm4, %ymm0
436 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm5[2,3]
437 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
438 ; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1
439 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
440 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
441 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
442 ; AVX2-NEXT: vmovdqu %xmm1, (%rax)
443 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
444 ; AVX2-NEXT: vzeroupper
447 ; AVX512F-LABEL: avg_v48i8:
449 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
450 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
451 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
452 ; AVX512F-NEXT: vpavgb (%rsi), %xmm0, %xmm0
453 ; AVX512F-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
454 ; AVX512F-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2
455 ; AVX512F-NEXT: vmovdqu %xmm1, (%rax)
456 ; AVX512F-NEXT: vmovdqu %xmm0, (%rax)
457 ; AVX512F-NEXT: vmovdqu %xmm2, (%rax)
460 ; AVX512BW-LABEL: avg_v48i8:
462 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
463 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
464 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
465 ; AVX512BW-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
466 ; AVX512BW-NEXT: vpavgb (%rsi), %xmm0, %xmm0
467 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
468 ; AVX512BW-NEXT: vpavgb 32(%rsi), %xmm2, %xmm1
469 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
470 ; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
471 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, (%rax)
472 ; AVX512BW-NEXT: vzeroupper
473 ; AVX512BW-NEXT: retq
474 %1 = load <48 x i8>, <48 x i8>* %a
475 %2 = load <48 x i8>, <48 x i8>* %b
476 %3 = zext <48 x i8> %1 to <48 x i32>
477 %4 = zext <48 x i8> %2 to <48 x i32>
478 %5 = add nuw nsw <48 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
479 %6 = add nuw nsw <48 x i32> %5, %4
480 %7 = lshr <48 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
481 %8 = trunc <48 x i32> %7 to <48 x i8>
482 store <48 x i8> %8, <48 x i8>* undef, align 4
486 define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
487 ; SSE2-LABEL: avg_v64i8:
489 ; SSE2-NEXT: movdqa (%rsi), %xmm0
490 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
491 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
492 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
493 ; SSE2-NEXT: pavgb (%rdi), %xmm0
494 ; SSE2-NEXT: pavgb 16(%rdi), %xmm1
495 ; SSE2-NEXT: pavgb 32(%rdi), %xmm2
496 ; SSE2-NEXT: pavgb 48(%rdi), %xmm3
497 ; SSE2-NEXT: movdqu %xmm3, (%rax)
498 ; SSE2-NEXT: movdqu %xmm2, (%rax)
499 ; SSE2-NEXT: movdqu %xmm1, (%rax)
500 ; SSE2-NEXT: movdqu %xmm0, (%rax)
503 ; AVX1-LABEL: avg_v64i8:
505 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
506 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
507 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2
508 ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3
509 ; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0
510 ; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1
511 ; AVX1-NEXT: vpavgb 32(%rdi), %xmm2, %xmm2
512 ; AVX1-NEXT: vpavgb 48(%rdi), %xmm3, %xmm3
513 ; AVX1-NEXT: vmovdqu %xmm3, (%rax)
514 ; AVX1-NEXT: vmovdqu %xmm2, (%rax)
515 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
516 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
519 ; AVX2-LABEL: avg_v64i8:
521 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
522 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
523 ; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
524 ; AVX2-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
525 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
526 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
527 ; AVX2-NEXT: vzeroupper
530 ; AVX512F-LABEL: avg_v64i8:
532 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
533 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
534 ; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0
535 ; AVX512F-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
536 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
537 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
538 ; AVX512F-NEXT: vzeroupper
541 ; AVX512BW-LABEL: avg_v64i8:
543 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
544 ; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
545 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
546 ; AVX512BW-NEXT: vzeroupper
547 ; AVX512BW-NEXT: retq
548 %1 = load <64 x i8>, <64 x i8>* %a
549 %2 = load <64 x i8>, <64 x i8>* %b
550 %3 = zext <64 x i8> %1 to <64 x i32>
551 %4 = zext <64 x i8> %2 to <64 x i32>
552 %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
553 %6 = add nuw nsw <64 x i32> %5, %4
554 %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
555 %8 = trunc <64 x i32> %7 to <64 x i8>
556 store <64 x i8> %8, <64 x i8>* undef, align 4
560 define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) nounwind {
561 ; SSE2-LABEL: avg_v4i16:
563 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
564 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
565 ; SSE2-NEXT: pavgw %xmm0, %xmm1
566 ; SSE2-NEXT: movq %xmm1, (%rax)
569 ; AVX-LABEL: avg_v4i16:
571 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
572 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
573 ; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0
574 ; AVX-NEXT: vmovq %xmm0, (%rax)
576 %1 = load <4 x i16>, <4 x i16>* %a
577 %2 = load <4 x i16>, <4 x i16>* %b
578 %3 = zext <4 x i16> %1 to <4 x i32>
579 %4 = zext <4 x i16> %2 to <4 x i32>
580 %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
581 %6 = add nuw nsw <4 x i32> %5, %4
582 %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
583 %8 = trunc <4 x i32> %7 to <4 x i16>
584 store <4 x i16> %8, <4 x i16>* undef, align 4
588 define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind {
589 ; SSE2-LABEL: avg_v8i16:
591 ; SSE2-NEXT: movdqa (%rsi), %xmm0
592 ; SSE2-NEXT: pavgw (%rdi), %xmm0
593 ; SSE2-NEXT: movdqu %xmm0, (%rax)
596 ; AVX-LABEL: avg_v8i16:
598 ; AVX-NEXT: vmovdqa (%rsi), %xmm0
599 ; AVX-NEXT: vpavgw (%rdi), %xmm0, %xmm0
600 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
602 %1 = load <8 x i16>, <8 x i16>* %a
603 %2 = load <8 x i16>, <8 x i16>* %b
604 %3 = zext <8 x i16> %1 to <8 x i32>
605 %4 = zext <8 x i16> %2 to <8 x i32>
606 %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
607 %6 = add nuw nsw <8 x i32> %5, %4
608 %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
609 %8 = trunc <8 x i32> %7 to <8 x i16>
610 store <8 x i16> %8, <8 x i16>* undef, align 4
614 define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
615 ; SSE2-LABEL: avg_v16i16:
617 ; SSE2-NEXT: movdqa (%rsi), %xmm0
618 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
619 ; SSE2-NEXT: pavgw (%rdi), %xmm0
620 ; SSE2-NEXT: pavgw 16(%rdi), %xmm1
621 ; SSE2-NEXT: movdqu %xmm1, (%rax)
622 ; SSE2-NEXT: movdqu %xmm0, (%rax)
625 ; AVX1-LABEL: avg_v16i16:
627 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
628 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
629 ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0
630 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1
631 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
632 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
635 ; AVX2-LABEL: avg_v16i16:
637 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
638 ; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
639 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
640 ; AVX2-NEXT: vzeroupper
643 ; AVX512-LABEL: avg_v16i16:
645 ; AVX512-NEXT: vmovdqa (%rsi), %ymm0
646 ; AVX512-NEXT: vpavgw (%rdi), %ymm0, %ymm0
647 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
648 ; AVX512-NEXT: vzeroupper
650 %1 = load <16 x i16>, <16 x i16>* %a
651 %2 = load <16 x i16>, <16 x i16>* %b
652 %3 = zext <16 x i16> %1 to <16 x i32>
653 %4 = zext <16 x i16> %2 to <16 x i32>
654 %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
655 %6 = add nuw nsw <16 x i32> %5, %4
656 %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
657 %8 = trunc <16 x i32> %7 to <16 x i16>
658 store <16 x i16> %8, <16 x i16>* undef, align 4
662 define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
663 ; SSE2-LABEL: avg_v32i16:
665 ; SSE2-NEXT: movdqa (%rsi), %xmm0
666 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
667 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
668 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
669 ; SSE2-NEXT: pavgw (%rdi), %xmm0
670 ; SSE2-NEXT: pavgw 16(%rdi), %xmm1
671 ; SSE2-NEXT: pavgw 32(%rdi), %xmm2
672 ; SSE2-NEXT: pavgw 48(%rdi), %xmm3
673 ; SSE2-NEXT: movdqu %xmm3, (%rax)
674 ; SSE2-NEXT: movdqu %xmm2, (%rax)
675 ; SSE2-NEXT: movdqu %xmm1, (%rax)
676 ; SSE2-NEXT: movdqu %xmm0, (%rax)
679 ; AVX1-LABEL: avg_v32i16:
681 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
682 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
683 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2
684 ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3
685 ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0
686 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1
687 ; AVX1-NEXT: vpavgw 32(%rdi), %xmm2, %xmm2
688 ; AVX1-NEXT: vpavgw 48(%rdi), %xmm3, %xmm3
689 ; AVX1-NEXT: vmovdqu %xmm3, (%rax)
690 ; AVX1-NEXT: vmovdqu %xmm2, (%rax)
691 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
692 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
695 ; AVX2-LABEL: avg_v32i16:
697 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
698 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
699 ; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
700 ; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
701 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
702 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
703 ; AVX2-NEXT: vzeroupper
706 ; AVX512F-LABEL: avg_v32i16:
708 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
709 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
710 ; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0
711 ; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
712 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
713 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
714 ; AVX512F-NEXT: vzeroupper
717 ; AVX512BW-LABEL: avg_v32i16:
719 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
720 ; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
721 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
722 ; AVX512BW-NEXT: vzeroupper
723 ; AVX512BW-NEXT: retq
724 %1 = load <32 x i16>, <32 x i16>* %a
725 %2 = load <32 x i16>, <32 x i16>* %b
726 %3 = zext <32 x i16> %1 to <32 x i32>
727 %4 = zext <32 x i16> %2 to <32 x i32>
728 %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
729 %6 = add nuw nsw <32 x i32> %5, %4
730 %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
731 %8 = trunc <32 x i32> %7 to <32 x i16>
732 store <32 x i16> %8, <32 x i16>* undef, align 4
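;
; The *_2 variants below use the same idiom but perform the operand add before
; adding the +1 rounding bias; they are expected to select the same PAVG forms.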
736 define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) nounwind {
737 ; SSE2-LABEL: avg_v4i8_2:
739 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
740 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
741 ; SSE2-NEXT: pavgb %xmm0, %xmm1
742 ; SSE2-NEXT: movd %xmm1, (%rax)
745 ; AVX-LABEL: avg_v4i8_2:
747 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
748 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
749 ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
750 ; AVX-NEXT: vmovd %xmm0, (%rax)
752 %1 = load <4 x i8>, <4 x i8>* %a
753 %2 = load <4 x i8>, <4 x i8>* %b
754 %3 = zext <4 x i8> %1 to <4 x i32>
755 %4 = zext <4 x i8> %2 to <4 x i32>
756 %5 = add nuw nsw <4 x i32> %3, %4
757 %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
758 %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
759 %8 = trunc <4 x i32> %7 to <4 x i8>
760 store <4 x i8> %8, <4 x i8>* undef, align 4
764 define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) nounwind {
765 ; SSE2-LABEL: avg_v8i8_2:
767 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
768 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
769 ; SSE2-NEXT: pavgb %xmm0, %xmm1
770 ; SSE2-NEXT: movq %xmm1, (%rax)
773 ; AVX-LABEL: avg_v8i8_2:
775 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
776 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
777 ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
778 ; AVX-NEXT: vmovq %xmm0, (%rax)
780 %1 = load <8 x i8>, <8 x i8>* %a
781 %2 = load <8 x i8>, <8 x i8>* %b
782 %3 = zext <8 x i8> %1 to <8 x i32>
783 %4 = zext <8 x i8> %2 to <8 x i32>
784 %5 = add nuw nsw <8 x i32> %3, %4
785 %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
786 %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
787 %8 = trunc <8 x i32> %7 to <8 x i8>
788 store <8 x i8> %8, <8 x i8>* undef, align 4
792 define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) nounwind {
793 ; SSE2-LABEL: avg_v16i8_2:
795 ; SSE2-NEXT: movdqa (%rdi), %xmm0
796 ; SSE2-NEXT: pavgb (%rsi), %xmm0
797 ; SSE2-NEXT: movdqu %xmm0, (%rax)
800 ; AVX-LABEL: avg_v16i8_2:
802 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
803 ; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0
804 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
806 %1 = load <16 x i8>, <16 x i8>* %a
807 %2 = load <16 x i8>, <16 x i8>* %b
808 %3 = zext <16 x i8> %1 to <16 x i32>
809 %4 = zext <16 x i8> %2 to <16 x i32>
810 %5 = add nuw nsw <16 x i32> %3, %4
811 %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
812 %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
813 %8 = trunc <16 x i32> %7 to <16 x i8>
814 store <16 x i8> %8, <16 x i8>* undef, align 4
818 define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {
819 ; SSE2-LABEL: avg_v32i8_2:
821 ; SSE2-NEXT: movdqa (%rdi), %xmm0
822 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
823 ; SSE2-NEXT: pavgb (%rsi), %xmm0
824 ; SSE2-NEXT: pavgb 16(%rsi), %xmm1
825 ; SSE2-NEXT: movdqu %xmm1, (%rax)
826 ; SSE2-NEXT: movdqu %xmm0, (%rax)
829 ; AVX1-LABEL: avg_v32i8_2:
831 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
832 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
833 ; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0
834 ; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
835 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
836 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
839 ; AVX2-LABEL: avg_v32i8_2:
841 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
842 ; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
843 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
844 ; AVX2-NEXT: vzeroupper
847 ; AVX512-LABEL: avg_v32i8_2:
849 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
850 ; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0
851 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
852 ; AVX512-NEXT: vzeroupper
854 %1 = load <32 x i8>, <32 x i8>* %a
855 %2 = load <32 x i8>, <32 x i8>* %b
856 %3 = zext <32 x i8> %1 to <32 x i32>
857 %4 = zext <32 x i8> %2 to <32 x i32>
858 %5 = add nuw nsw <32 x i32> %3, %4
859 %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
860 %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
861 %8 = trunc <32 x i32> %7 to <32 x i8>
862 store <32 x i8> %8, <32 x i8>* undef, align 4
866 define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
867 ; SSE2-LABEL: avg_v64i8_2:
869 ; SSE2-NEXT: movdqa (%rsi), %xmm0
870 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
871 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
872 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
873 ; SSE2-NEXT: pavgb %xmm0, %xmm0
874 ; SSE2-NEXT: pavgb %xmm1, %xmm1
875 ; SSE2-NEXT: pavgb %xmm2, %xmm2
876 ; SSE2-NEXT: pavgb %xmm3, %xmm3
877 ; SSE2-NEXT: movdqu %xmm3, (%rax)
878 ; SSE2-NEXT: movdqu %xmm2, (%rax)
879 ; SSE2-NEXT: movdqu %xmm1, (%rax)
880 ; SSE2-NEXT: movdqu %xmm0, (%rax)
883 ; AVX1-LABEL: avg_v64i8_2:
885 ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
886 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
887 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2
888 ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3
889 ; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm0
890 ; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm1
891 ; AVX1-NEXT: vpavgb %xmm2, %xmm2, %xmm2
892 ; AVX1-NEXT: vpavgb %xmm3, %xmm3, %xmm3
893 ; AVX1-NEXT: vmovdqu %xmm3, (%rax)
894 ; AVX1-NEXT: vmovdqu %xmm2, (%rax)
895 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
896 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
899 ; AVX2-LABEL: avg_v64i8_2:
901 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
902 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
903 ; AVX2-NEXT: vpavgb %ymm0, %ymm0, %ymm0
904 ; AVX2-NEXT: vpavgb %ymm1, %ymm1, %ymm1
905 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
906 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
907 ; AVX2-NEXT: vzeroupper
910 ; AVX512F-LABEL: avg_v64i8_2:
912 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
913 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
914 ; AVX512F-NEXT: vpavgb %ymm0, %ymm0, %ymm0
915 ; AVX512F-NEXT: vpavgb %ymm1, %ymm1, %ymm1
916 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
917 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
918 ; AVX512F-NEXT: vzeroupper
921 ; AVX512BW-LABEL: avg_v64i8_2:
923 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
924 ; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
925 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
926 ; AVX512BW-NEXT: vzeroupper
927 ; AVX512BW-NEXT: retq
928 %1 = load <64 x i8>, <64 x i8>* %a
929 %2 = load <64 x i8>, <64 x i8>* %b
930 %3 = zext <64 x i8> %1 to <64 x i32>
931 %4 = zext <64 x i8> %2 to <64 x i32>
932 %5 = add nuw nsw <64 x i32> %4, %4
933 %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
934 %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
935 %8 = trunc <64 x i32> %7 to <64 x i8>
936 store <64 x i8> %8, <64 x i8>* undef, align 4
941 define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) nounwind {
942 ; SSE2-LABEL: avg_v4i16_2:
944 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
945 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
946 ; SSE2-NEXT: pavgw %xmm0, %xmm1
947 ; SSE2-NEXT: movq %xmm1, (%rax)
950 ; AVX-LABEL: avg_v4i16_2:
952 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
953 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
954 ; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0
955 ; AVX-NEXT: vmovq %xmm0, (%rax)
957 %1 = load <4 x i16>, <4 x i16>* %a
958 %2 = load <4 x i16>, <4 x i16>* %b
959 %3 = zext <4 x i16> %1 to <4 x i32>
960 %4 = zext <4 x i16> %2 to <4 x i32>
961 %5 = add nuw nsw <4 x i32> %3, %4
962 %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
963 %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
964 %8 = trunc <4 x i32> %7 to <4 x i16>
965 store <4 x i16> %8, <4 x i16>* undef, align 4
969 define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) nounwind {
970 ; SSE2-LABEL: avg_v8i16_2:
972 ; SSE2-NEXT: movdqa (%rdi), %xmm0
973 ; SSE2-NEXT: pavgw (%rsi), %xmm0
974 ; SSE2-NEXT: movdqu %xmm0, (%rax)
977 ; AVX-LABEL: avg_v8i16_2:
979 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
980 ; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0
981 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
983 %1 = load <8 x i16>, <8 x i16>* %a
984 %2 = load <8 x i16>, <8 x i16>* %b
985 %3 = zext <8 x i16> %1 to <8 x i32>
986 %4 = zext <8 x i16> %2 to <8 x i32>
987 %5 = add nuw nsw <8 x i32> %3, %4
988 %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
989 %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
990 %8 = trunc <8 x i32> %7 to <8 x i16>
991 store <8 x i16> %8, <8 x i16>* undef, align 4
995 define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {
996 ; SSE2-LABEL: avg_v16i16_2:
998 ; SSE2-NEXT: movdqa (%rdi), %xmm0
999 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
1000 ; SSE2-NEXT: pavgw (%rsi), %xmm0
1001 ; SSE2-NEXT: pavgw 16(%rsi), %xmm1
1002 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1003 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1006 ; AVX1-LABEL: avg_v16i16_2:
1008 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
1009 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
1010 ; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0
1011 ; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1
1012 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
1013 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1016 ; AVX2-LABEL: avg_v16i16_2:
1018 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1019 ; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
1020 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1021 ; AVX2-NEXT: vzeroupper
1024 ; AVX512-LABEL: avg_v16i16_2:
1026 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
1027 ; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0
1028 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
1029 ; AVX512-NEXT: vzeroupper
1031 %1 = load <16 x i16>, <16 x i16>* %a
1032 %2 = load <16 x i16>, <16 x i16>* %b
1033 %3 = zext <16 x i16> %1 to <16 x i32>
1034 %4 = zext <16 x i16> %2 to <16 x i32>
1035 %5 = add nuw nsw <16 x i32> %3, %4
1036 %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1037 %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1038 %8 = trunc <16 x i32> %7 to <16 x i16>
1039 store <16 x i16> %8, <16 x i16>* undef, align 4
1043 define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
1044 ; SSE2-LABEL: avg_v32i16_2:
1046 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1047 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
1048 ; SSE2-NEXT: movdqa 32(%rdi), %xmm2
1049 ; SSE2-NEXT: movdqa 48(%rdi), %xmm3
1050 ; SSE2-NEXT: pavgw (%rsi), %xmm0
1051 ; SSE2-NEXT: pavgw 16(%rsi), %xmm1
1052 ; SSE2-NEXT: pavgw 32(%rsi), %xmm2
1053 ; SSE2-NEXT: pavgw 48(%rsi), %xmm3
1054 ; SSE2-NEXT: movdqu %xmm3, (%rax)
1055 ; SSE2-NEXT: movdqu %xmm2, (%rax)
1056 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1057 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1060 ; AVX1-LABEL: avg_v32i16_2:
1062 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
1063 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
1064 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
1065 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
1066 ; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0
1067 ; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1
1068 ; AVX1-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2
1069 ; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm3
1070 ; AVX1-NEXT: vmovdqu %xmm3, (%rax)
1071 ; AVX1-NEXT: vmovdqu %xmm2, (%rax)
1072 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
1073 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1076 ; AVX2-LABEL: avg_v32i16_2:
1078 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1079 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
1080 ; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
1081 ; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
1082 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
1083 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1084 ; AVX2-NEXT: vzeroupper
1087 ; AVX512F-LABEL: avg_v32i16_2:
1089 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
1090 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
1091 ; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
1092 ; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
1093 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
1094 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
1095 ; AVX512F-NEXT: vzeroupper
1096 ; AVX512F-NEXT: retq
1098 ; AVX512BW-LABEL: avg_v32i16_2:
1099 ; AVX512BW: # %bb.0:
1100 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1101 ; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
1102 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
1103 ; AVX512BW-NEXT: vzeroupper
1104 ; AVX512BW-NEXT: retq
1105 %1 = load <32 x i16>, <32 x i16>* %a
1106 %2 = load <32 x i16>, <32 x i16>* %b
1107 %3 = zext <32 x i16> %1 to <32 x i32>
1108 %4 = zext <32 x i16> %2 to <32 x i32>
1109 %5 = add nuw nsw <32 x i32> %3, %4
1110 %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1111 %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1112 %8 = trunc <32 x i32> %7 to <32 x i16>
1113 store <32 x i16> %8, <32 x i16>* undef, align 4
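;
; The *_const variants average a loaded vector against a small <1,2,3,...>
; constant, so one PAVG operand comes from the constant pool (possibly via a
; broadcast).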
1117 define void @avg_v4i8_const(<4 x i8>* %a) nounwind {
1118 ; SSE2-LABEL: avg_v4i8_const:
1120 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1121 ; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
1122 ; SSE2-NEXT: movd %xmm0, (%rax)
1125 ; AVX-LABEL: avg_v4i8_const:
1127 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1128 ; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
1129 ; AVX-NEXT: vmovd %xmm0, (%rax)
1131 %1 = load <4 x i8>, <4 x i8>* %a
1132 %2 = zext <4 x i8> %1 to <4 x i32>
1133 %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
1134 %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
1135 %5 = trunc <4 x i32> %4 to <4 x i8>
1136 store <4 x i8> %5, <4 x i8>* undef, align 4
1140 define void @avg_v8i8_const(<8 x i8>* %a) nounwind {
1141 ; SSE2-LABEL: avg_v8i8_const:
1143 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1144 ; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
1145 ; SSE2-NEXT: movq %xmm0, (%rax)
1148 ; AVX-LABEL: avg_v8i8_const:
1150 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1151 ; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
1152 ; AVX-NEXT: vmovq %xmm0, (%rax)
1154 %1 = load <8 x i8>, <8 x i8>* %a
1155 %2 = zext <8 x i8> %1 to <8 x i32>
1156 %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1157 %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1158 %5 = trunc <8 x i32> %4 to <8 x i8>
1159 store <8 x i8> %5, <8 x i8>* undef, align 4
1163 define void @avg_v16i8_const(<16 x i8>* %a) nounwind {
1164 ; SSE2-LABEL: avg_v16i8_const:
1166 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1167 ; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
1168 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1171 ; AVX-LABEL: avg_v16i8_const:
1173 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1174 ; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
1175 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
1177 %1 = load <16 x i8>, <16 x i8>* %a
1178 %2 = zext <16 x i8> %1 to <16 x i32>
1179 %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1180 %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1181 %5 = trunc <16 x i32> %4 to <16 x i8>
1182 store <16 x i8> %5, <16 x i8>* undef, align 4
1186 define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
1187 ; SSE2-LABEL: avg_v32i8_const:
1189 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1190 ; SSE2-NEXT: movdqa (%rdi), %xmm1
1191 ; SSE2-NEXT: pavgb %xmm0, %xmm1
1192 ; SSE2-NEXT: pavgb 16(%rdi), %xmm0
1193 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1194 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1197 ; AVX1-LABEL: avg_v32i8_const:
1199 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [7.9499288951273625E-275,7.9499288951273625E-275]
1200 ; AVX1-NEXT: # xmm0 = mem[0,0]
1201 ; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1
1202 ; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm0
1203 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1204 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
1207 ; AVX2-LABEL: avg_v32i8_const:
1209 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1210 ; AVX2-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
1211 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1212 ; AVX2-NEXT: vzeroupper
1215 ; AVX512-LABEL: avg_v32i8_const:
1217 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
1218 ; AVX512-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
1219 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
1220 ; AVX512-NEXT: vzeroupper
1222 %1 = load <32 x i8>, <32 x i8>* %a
1223 %2 = zext <32 x i8> %1 to <32 x i32>
1224 %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1225 %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1226 %5 = trunc <32 x i32> %4 to <32 x i8>
1227 store <32 x i8> %5, <32 x i8>* undef, align 4
1231 define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
1232 ; SSE2-LABEL: avg_v64i8_const:
1234 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1235 ; SSE2-NEXT: movdqa (%rdi), %xmm1
1236 ; SSE2-NEXT: pavgb %xmm0, %xmm1
1237 ; SSE2-NEXT: movdqa 16(%rdi), %xmm2
1238 ; SSE2-NEXT: pavgb %xmm0, %xmm2
1239 ; SSE2-NEXT: movdqa 32(%rdi), %xmm3
1240 ; SSE2-NEXT: pavgb %xmm0, %xmm3
1241 ; SSE2-NEXT: pavgb 48(%rdi), %xmm0
1242 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1243 ; SSE2-NEXT: movdqu %xmm3, (%rax)
1244 ; SSE2-NEXT: movdqu %xmm2, (%rax)
1245 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1248 ; AVX1-LABEL: avg_v64i8_const:
1250 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [7.9499288951273625E-275,7.9499288951273625E-275]
1251 ; AVX1-NEXT: # xmm0 = mem[0,0]
1252 ; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1
1253 ; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm2
1254 ; AVX1-NEXT: vpavgb 32(%rdi), %xmm0, %xmm3
1255 ; AVX1-NEXT: vpavgb 48(%rdi), %xmm0, %xmm0
1256 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1257 ; AVX1-NEXT: vmovdqu %xmm3, (%rax)
1258 ; AVX1-NEXT: vmovdqu %xmm2, (%rax)
1259 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
1262 ; AVX2-LABEL: avg_v64i8_const:
1264 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
1265 ; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm1
1266 ; AVX2-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
1267 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1268 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
1269 ; AVX2-NEXT: vzeroupper
1272 ; AVX512F-LABEL: avg_v64i8_const:
1274 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
1275 ; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm1
1276 ; AVX512F-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
1277 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
1278 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
1279 ; AVX512F-NEXT: vzeroupper
1280 ; AVX512F-NEXT: retq
1282 ; AVX512BW-LABEL: avg_v64i8_const:
1283 ; AVX512BW: # %bb.0:
1284 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1285 ; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0
1286 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
1287 ; AVX512BW-NEXT: vzeroupper
1288 ; AVX512BW-NEXT: retq
1289 %1 = load <64 x i8>, <64 x i8>* %a
1290 %2 = zext <64 x i8> %1 to <64 x i32>
1291 %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1292 %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1293 %5 = trunc <64 x i32> %4 to <64 x i8>
1294 store <64 x i8> %5, <64 x i8>* undef, align 4
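; The avg_v*i16_const tests repeat the rounding-average-plus-constant pattern
; on 16-bit elements, so the expected instruction is PAVGW instead of PAVGB;
; the constant handling is otherwise analogous to the byte cases above.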
1298 define void @avg_v4i16_const(<4 x i16>* %a) nounwind {
1299 ; SSE2-LABEL: avg_v4i16_const:
1301 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1302 ; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0
1303 ; SSE2-NEXT: movq %xmm0, (%rax)
1306 ; AVX-LABEL: avg_v4i16_const:
1308 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1309 ; AVX-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
1310 ; AVX-NEXT: vmovq %xmm0, (%rax)
1312 %1 = load <4 x i16>, <4 x i16>* %a
1313 %2 = zext <4 x i16> %1 to <4 x i32>
1314 %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
1315 %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
1316 %5 = trunc <4 x i32> %4 to <4 x i16>
1317 store <4 x i16> %5, <4 x i16>* undef, align 4
1321 define void @avg_v8i16_const(<8 x i16>* %a) nounwind {
1322 ; SSE2-LABEL: avg_v8i16_const:
1324 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1325 ; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0
1326 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1329 ; AVX-LABEL: avg_v8i16_const:
1331 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1332 ; AVX-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
1333 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
1335 %1 = load <8 x i16>, <8 x i16>* %a
1336 %2 = zext <8 x i16> %1 to <8 x i32>
1337 %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1338 %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1339 %5 = trunc <8 x i32> %4 to <8 x i16>
1340 store <8 x i16> %5, <8 x i16>* undef, align 4
1344 define void @avg_v16i16_const(<16 x i16>* %a) nounwind {
1345 ; SSE2-LABEL: avg_v16i16_const:
1347 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1348 ; SSE2-NEXT: movdqa (%rdi), %xmm1
1349 ; SSE2-NEXT: pavgw %xmm0, %xmm1
1350 ; SSE2-NEXT: pavgw 16(%rdi), %xmm0
1351 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1352 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1355 ; AVX1-LABEL: avg_v16i16_const:
1357 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1358 ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1
1359 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm0
1360 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1361 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
1364 ; AVX2-LABEL: avg_v16i16_const:
1366 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1367 ; AVX2-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
1368 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1369 ; AVX2-NEXT: vzeroupper
1372 ; AVX512-LABEL: avg_v16i16_const:
1374 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
1375 ; AVX512-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
1376 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
1377 ; AVX512-NEXT: vzeroupper
1379 %1 = load <16 x i16>, <16 x i16>* %a
1380 %2 = zext <16 x i16> %1 to <16 x i32>
1381 %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1382 %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1383 %5 = trunc <16 x i32> %4 to <16 x i16>
1384 store <16 x i16> %5, <16 x i16>* undef, align 4
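; avg_v32i16_const covers 512 bits of i16 data and shows how each subtarget
; legalizes it: four xmm PAVGW ops for SSE2/AVX1, two ymm ops for
; AVX2/AVX512F, and a single zmm vpavgw once AVX512BW is available.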
1388 define void @avg_v32i16_const(<32 x i16>* %a) nounwind {
1389 ; SSE2-LABEL: avg_v32i16_const:
1391 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1392 ; SSE2-NEXT: movdqa (%rdi), %xmm1
1393 ; SSE2-NEXT: pavgw %xmm0, %xmm1
1394 ; SSE2-NEXT: movdqa 16(%rdi), %xmm2
1395 ; SSE2-NEXT: pavgw %xmm0, %xmm2
1396 ; SSE2-NEXT: movdqa 32(%rdi), %xmm3
1397 ; SSE2-NEXT: pavgw %xmm0, %xmm3
1398 ; SSE2-NEXT: pavgw 48(%rdi), %xmm0
1399 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1400 ; SSE2-NEXT: movdqu %xmm3, (%rax)
1401 ; SSE2-NEXT: movdqu %xmm2, (%rax)
1402 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1405 ; AVX1-LABEL: avg_v32i16_const:
1407 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
1408 ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1
1409 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm2
1410 ; AVX1-NEXT: vpavgw 32(%rdi), %xmm0, %xmm3
1411 ; AVX1-NEXT: vpavgw 48(%rdi), %xmm0, %xmm0
1412 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
1413 ; AVX1-NEXT: vmovdqu %xmm3, (%rax)
1414 ; AVX1-NEXT: vmovdqu %xmm2, (%rax)
1415 ; AVX1-NEXT: vmovdqu %xmm1, (%rax)
1418 ; AVX2-LABEL: avg_v32i16_const:
1420 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1421 ; AVX2-NEXT: # ymm0 = mem[0,1,0,1]
1422 ; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm1
1423 ; AVX2-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
1424 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1425 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
1426 ; AVX2-NEXT: vzeroupper
1429 ; AVX512F-LABEL: avg_v32i16_const:
1431 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
1432 ; AVX512F-NEXT: # ymm0 = mem[0,1,0,1]
1433 ; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm1
1434 ; AVX512F-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
1435 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
1436 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
1437 ; AVX512F-NEXT: vzeroupper
1438 ; AVX512F-NEXT: retq
1440 ; AVX512BW-LABEL: avg_v32i16_const:
1441 ; AVX512BW: # %bb.0:
1442 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1443 ; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0
1444 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
1445 ; AVX512BW-NEXT: vzeroupper
1446 ; AVX512BW-NEXT: retq
1447 %1 = load <32 x i16>, <32 x i16>* %a
1448 %2 = zext <32 x i16> %1 to <32 x i32>
1449 %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
1450 %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1451 %5 = trunc <32 x i32> %4 to <32 x i16>
1452 store <32 x i16> %5, <32 x i16>* undef, align 4
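; The *_3 variants take and return vectors directly instead of going through
; loads and stores, and they zero extend only to i16 rather than i32; the
; averaging idiom should still be recognized and lowered to PAVGB on each
; legal-width chunk.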
1456 define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind {
1457 ; SSE2-LABEL: avg_v16i8_3:
1459 ; SSE2-NEXT: pavgb %xmm1, %xmm0
1462 ; AVX-LABEL: avg_v16i8_3:
1464 ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
1466 %za = zext <16 x i8> %a to <16 x i16>
1467 %zb = zext <16 x i8> %b to <16 x i16>
1468 %add = add nuw nsw <16 x i16> %za, %zb
1469 %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1470 %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1471 %res = trunc <16 x i16> %lshr to <16 x i8>
1475 define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {
1476 ; SSE2-LABEL: avg_v32i8_3:
1478 ; SSE2-NEXT: pavgb %xmm2, %xmm0
1479 ; SSE2-NEXT: pavgb %xmm3, %xmm1
1482 ; AVX1-LABEL: avg_v32i8_3:
1484 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1485 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1486 ; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
1487 ; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0
1488 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1491 ; AVX2-LABEL: avg_v32i8_3:
1493 ; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0
1496 ; AVX512-LABEL: avg_v32i8_3:
1498 ; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
1500 %za = zext <32 x i8> %a to <32 x i16>
1501 %zb = zext <32 x i8> %b to <32 x i16>
1502 %add = add nuw nsw <32 x i16> %za, %zb
1503 %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1504 %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1505 %res = trunc <32 x i16> %lshr to <32 x i8>
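; For 512 bits of i8 data the splitting mirrors the i16 case: SSE2 uses four
; xmm PAVGB ops, AVX1 extracts and reinserts 128-bit halves since it has no
; 256-bit integer PAVGB, AVX2 uses two ymm ops, AVX512F still splits into ymm
; halves, and only AVX512BW handles the whole vector with one zmm vpavgb.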
1509 define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind {
1510 ; SSE2-LABEL: avg_v64i8_3:
1512 ; SSE2-NEXT: pavgb %xmm4, %xmm0
1513 ; SSE2-NEXT: pavgb %xmm5, %xmm1
1514 ; SSE2-NEXT: pavgb %xmm6, %xmm2
1515 ; SSE2-NEXT: pavgb %xmm7, %xmm3
1518 ; AVX1-LABEL: avg_v64i8_3:
1520 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
1521 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
1522 ; AVX1-NEXT: vpavgb %xmm4, %xmm5, %xmm4
1523 ; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0
1524 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1525 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
1526 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1527 ; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2
1528 ; AVX1-NEXT: vpavgb %xmm3, %xmm1, %xmm1
1529 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1532 ; AVX2-LABEL: avg_v64i8_3:
1534 ; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0
1535 ; AVX2-NEXT: vpavgb %ymm3, %ymm1, %ymm1
1538 ; AVX512F-LABEL: avg_v64i8_3:
1540 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
1541 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1542 ; AVX512F-NEXT: vpavgb %ymm2, %ymm3, %ymm2
1543 ; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
1544 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1545 ; AVX512F-NEXT: retq
1547 ; AVX512BW-LABEL: avg_v64i8_3:
1548 ; AVX512BW: # %bb.0:
1549 ; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0
1550 ; AVX512BW-NEXT: retq
1551 %za = zext <64 x i8> %a to <64 x i16>
1552 %zb = zext <64 x i8> %b to <64 x i16>
1553 %add = add nuw nsw <64 x i16> %za, %zb
1554 %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1555 %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1556 %res = trunc <64 x i16> %lshr to <64 x i8>
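; avg_v512i8_3 is deliberately oversized: the <512 x i8> arguments no longer
; fit in vector registers, so the tail of the operands is passed on the stack
; and the result is returned indirectly through the pointer in %rdi. The
; checks mostly pin down that each chunk is still averaged with a single
; pavgb and stored to the matching offset of the return slot.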
1560 define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind {
1561 ; SSE2-LABEL: avg_v512i8_3:
1563 ; SSE2-NEXT: movq %rdi, %rax
1564 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1565 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1566 ; SSE2-NEXT: movdqa %xmm8, 496(%rdi)
1567 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1568 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1569 ; SSE2-NEXT: movdqa %xmm8, 480(%rdi)
1570 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1571 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1572 ; SSE2-NEXT: movdqa %xmm8, 464(%rdi)
1573 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1574 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1575 ; SSE2-NEXT: movdqa %xmm8, 448(%rdi)
1576 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1577 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1578 ; SSE2-NEXT: movdqa %xmm8, 432(%rdi)
1579 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1580 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1581 ; SSE2-NEXT: movdqa %xmm8, 416(%rdi)
1582 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1583 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1584 ; SSE2-NEXT: movdqa %xmm8, 400(%rdi)
1585 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1586 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1587 ; SSE2-NEXT: movdqa %xmm8, 384(%rdi)
1588 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1589 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1590 ; SSE2-NEXT: movdqa %xmm8, 368(%rdi)
1591 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1592 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1593 ; SSE2-NEXT: movdqa %xmm8, 352(%rdi)
1594 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1595 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1596 ; SSE2-NEXT: movdqa %xmm8, 336(%rdi)
1597 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1598 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1599 ; SSE2-NEXT: movdqa %xmm8, 320(%rdi)
1600 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1601 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1602 ; SSE2-NEXT: movdqa %xmm8, 304(%rdi)
1603 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1604 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1605 ; SSE2-NEXT: movdqa %xmm8, 288(%rdi)
1606 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1607 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1608 ; SSE2-NEXT: movdqa %xmm8, 272(%rdi)
1609 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1610 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1611 ; SSE2-NEXT: movdqa %xmm8, 256(%rdi)
1612 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1613 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1614 ; SSE2-NEXT: movdqa %xmm8, 240(%rdi)
1615 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1616 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1617 ; SSE2-NEXT: movdqa %xmm8, 224(%rdi)
1618 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1619 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1620 ; SSE2-NEXT: movdqa %xmm8, 208(%rdi)
1621 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1622 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1623 ; SSE2-NEXT: movdqa %xmm8, 192(%rdi)
1624 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1625 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1626 ; SSE2-NEXT: movdqa %xmm8, 176(%rdi)
1627 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1628 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1629 ; SSE2-NEXT: movdqa %xmm8, 160(%rdi)
1630 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1631 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1632 ; SSE2-NEXT: movdqa %xmm8, 144(%rdi)
1633 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
1634 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
1635 ; SSE2-NEXT: movdqa %xmm8, 128(%rdi)
1636 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm7
1637 ; SSE2-NEXT: movdqa %xmm7, 112(%rdi)
1638 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm6
1639 ; SSE2-NEXT: movdqa %xmm6, 96(%rdi)
1640 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm5
1641 ; SSE2-NEXT: movdqa %xmm5, 80(%rdi)
1642 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm4
1643 ; SSE2-NEXT: movdqa %xmm4, 64(%rdi)
1644 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm3
1645 ; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
1646 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm2
1647 ; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
1648 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm1
1649 ; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
1650 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm0
1651 ; SSE2-NEXT: movdqa %xmm0, (%rdi)
1654 ; AVX1-LABEL: avg_v512i8_3:
1656 ; AVX1-NEXT: pushq %rbp
1657 ; AVX1-NEXT: movq %rsp, %rbp
1658 ; AVX1-NEXT: andq $-32, %rsp
1659 ; AVX1-NEXT: subq $32, %rsp
1660 ; AVX1-NEXT: movq %rdi, %rax
1661 ; AVX1-NEXT: vmovdqa 256(%rbp), %xmm8
1662 ; AVX1-NEXT: vpavgb 768(%rbp), %xmm8, %xmm8
1663 ; AVX1-NEXT: vmovdqa %xmm8, 496(%rdi)
1664 ; AVX1-NEXT: vmovdqa 240(%rbp), %xmm8
1665 ; AVX1-NEXT: vpavgb 752(%rbp), %xmm8, %xmm8
1666 ; AVX1-NEXT: vmovdqa %xmm8, 480(%rdi)
1667 ; AVX1-NEXT: vmovdqa 224(%rbp), %xmm8
1668 ; AVX1-NEXT: vpavgb 736(%rbp), %xmm8, %xmm8
1669 ; AVX1-NEXT: vmovdqa %xmm8, 464(%rdi)
1670 ; AVX1-NEXT: vmovdqa 208(%rbp), %xmm8
1671 ; AVX1-NEXT: vpavgb 720(%rbp), %xmm8, %xmm8
1672 ; AVX1-NEXT: vmovdqa %xmm8, 448(%rdi)
1673 ; AVX1-NEXT: vmovdqa 192(%rbp), %xmm8
1674 ; AVX1-NEXT: vpavgb 704(%rbp), %xmm8, %xmm8
1675 ; AVX1-NEXT: vmovdqa %xmm8, 432(%rdi)
1676 ; AVX1-NEXT: vmovdqa 176(%rbp), %xmm8
1677 ; AVX1-NEXT: vpavgb 688(%rbp), %xmm8, %xmm8
1678 ; AVX1-NEXT: vmovdqa %xmm8, 416(%rdi)
1679 ; AVX1-NEXT: vmovdqa 160(%rbp), %xmm8
1680 ; AVX1-NEXT: vpavgb 672(%rbp), %xmm8, %xmm8
1681 ; AVX1-NEXT: vmovdqa %xmm8, 400(%rdi)
1682 ; AVX1-NEXT: vmovdqa 144(%rbp), %xmm8
1683 ; AVX1-NEXT: vpavgb 656(%rbp), %xmm8, %xmm8
1684 ; AVX1-NEXT: vmovdqa %xmm8, 384(%rdi)
1685 ; AVX1-NEXT: vmovdqa 128(%rbp), %xmm8
1686 ; AVX1-NEXT: vpavgb 640(%rbp), %xmm8, %xmm8
1687 ; AVX1-NEXT: vmovdqa %xmm8, 368(%rdi)
1688 ; AVX1-NEXT: vmovdqa 112(%rbp), %xmm8
1689 ; AVX1-NEXT: vpavgb 624(%rbp), %xmm8, %xmm8
1690 ; AVX1-NEXT: vmovdqa %xmm8, 352(%rdi)
1691 ; AVX1-NEXT: vmovdqa 96(%rbp), %xmm8
1692 ; AVX1-NEXT: vpavgb 608(%rbp), %xmm8, %xmm8
1693 ; AVX1-NEXT: vmovdqa %xmm8, 336(%rdi)
1694 ; AVX1-NEXT: vmovdqa 80(%rbp), %xmm8
1695 ; AVX1-NEXT: vpavgb 592(%rbp), %xmm8, %xmm8
1696 ; AVX1-NEXT: vmovdqa %xmm8, 320(%rdi)
1697 ; AVX1-NEXT: vmovdqa 64(%rbp), %xmm8
1698 ; AVX1-NEXT: vpavgb 576(%rbp), %xmm8, %xmm8
1699 ; AVX1-NEXT: vmovdqa %xmm8, 304(%rdi)
1700 ; AVX1-NEXT: vmovdqa 48(%rbp), %xmm8
1701 ; AVX1-NEXT: vpavgb 560(%rbp), %xmm8, %xmm8
1702 ; AVX1-NEXT: vmovdqa %xmm8, 288(%rdi)
1703 ; AVX1-NEXT: vmovdqa 32(%rbp), %xmm8
1704 ; AVX1-NEXT: vpavgb 544(%rbp), %xmm8, %xmm8
1705 ; AVX1-NEXT: vmovdqa %xmm8, 272(%rdi)
1706 ; AVX1-NEXT: vmovdqa 16(%rbp), %xmm8
1707 ; AVX1-NEXT: vpavgb 528(%rbp), %xmm8, %xmm8
1708 ; AVX1-NEXT: vmovdqa %xmm8, 256(%rdi)
1709 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
1710 ; AVX1-NEXT: vpavgb 512(%rbp), %xmm8, %xmm8
1711 ; AVX1-NEXT: vmovdqa %xmm8, 240(%rdi)
1712 ; AVX1-NEXT: vpavgb 496(%rbp), %xmm7, %xmm7
1713 ; AVX1-NEXT: vmovdqa %xmm7, 224(%rdi)
1714 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
1715 ; AVX1-NEXT: vpavgb 480(%rbp), %xmm7, %xmm7
1716 ; AVX1-NEXT: vmovdqa %xmm7, 208(%rdi)
1717 ; AVX1-NEXT: vpavgb 464(%rbp), %xmm6, %xmm6
1718 ; AVX1-NEXT: vmovdqa %xmm6, 192(%rdi)
1719 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
1720 ; AVX1-NEXT: vpavgb 448(%rbp), %xmm6, %xmm6
1721 ; AVX1-NEXT: vmovdqa %xmm6, 176(%rdi)
1722 ; AVX1-NEXT: vpavgb 432(%rbp), %xmm5, %xmm5
1723 ; AVX1-NEXT: vmovdqa %xmm5, 160(%rdi)
1724 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
1725 ; AVX1-NEXT: vpavgb 416(%rbp), %xmm5, %xmm5
1726 ; AVX1-NEXT: vmovdqa %xmm5, 144(%rdi)
1727 ; AVX1-NEXT: vpavgb 400(%rbp), %xmm4, %xmm4
1728 ; AVX1-NEXT: vmovdqa %xmm4, 128(%rdi)
1729 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
1730 ; AVX1-NEXT: vpavgb 384(%rbp), %xmm4, %xmm4
1731 ; AVX1-NEXT: vmovdqa %xmm4, 112(%rdi)
1732 ; AVX1-NEXT: vpavgb 368(%rbp), %xmm3, %xmm3
1733 ; AVX1-NEXT: vmovdqa %xmm3, 96(%rdi)
1734 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1735 ; AVX1-NEXT: vpavgb 352(%rbp), %xmm3, %xmm3
1736 ; AVX1-NEXT: vmovdqa %xmm3, 80(%rdi)
1737 ; AVX1-NEXT: vpavgb 336(%rbp), %xmm2, %xmm2
1738 ; AVX1-NEXT: vmovdqa %xmm2, 64(%rdi)
1739 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1740 ; AVX1-NEXT: vpavgb 320(%rbp), %xmm2, %xmm2
1741 ; AVX1-NEXT: vmovdqa %xmm2, 48(%rdi)
1742 ; AVX1-NEXT: vpavgb 304(%rbp), %xmm1, %xmm1
1743 ; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi)
1744 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1745 ; AVX1-NEXT: vpavgb 288(%rbp), %xmm1, %xmm1
1746 ; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi)
1747 ; AVX1-NEXT: vpavgb 272(%rbp), %xmm0, %xmm0
1748 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
1749 ; AVX1-NEXT: movq %rbp, %rsp
1750 ; AVX1-NEXT: popq %rbp
1751 ; AVX1-NEXT: vzeroupper
1754 ; AVX2-LABEL: avg_v512i8_3:
1756 ; AVX2-NEXT: pushq %rbp
1757 ; AVX2-NEXT: movq %rsp, %rbp
1758 ; AVX2-NEXT: andq $-32, %rsp
1759 ; AVX2-NEXT: subq $32, %rsp
1760 ; AVX2-NEXT: movq %rdi, %rax
1761 ; AVX2-NEXT: vmovdqa 240(%rbp), %ymm8
1762 ; AVX2-NEXT: vmovdqa 208(%rbp), %ymm9
1763 ; AVX2-NEXT: vmovdqa 176(%rbp), %ymm10
1764 ; AVX2-NEXT: vmovdqa 144(%rbp), %ymm11
1765 ; AVX2-NEXT: vmovdqa 112(%rbp), %ymm12
1766 ; AVX2-NEXT: vmovdqa 80(%rbp), %ymm13
1767 ; AVX2-NEXT: vmovdqa 48(%rbp), %ymm14
1768 ; AVX2-NEXT: vmovdqa 16(%rbp), %ymm15
1769 ; AVX2-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0
1770 ; AVX2-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1
1771 ; AVX2-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2
1772 ; AVX2-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3
1773 ; AVX2-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4
1774 ; AVX2-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5
1775 ; AVX2-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6
1776 ; AVX2-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7
1777 ; AVX2-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15
1778 ; AVX2-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14
1779 ; AVX2-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13
1780 ; AVX2-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12
1781 ; AVX2-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11
1782 ; AVX2-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10
1783 ; AVX2-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9
1784 ; AVX2-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8
1785 ; AVX2-NEXT: vmovdqa %ymm8, 480(%rdi)
1786 ; AVX2-NEXT: vmovdqa %ymm9, 448(%rdi)
1787 ; AVX2-NEXT: vmovdqa %ymm10, 416(%rdi)
1788 ; AVX2-NEXT: vmovdqa %ymm11, 384(%rdi)
1789 ; AVX2-NEXT: vmovdqa %ymm12, 352(%rdi)
1790 ; AVX2-NEXT: vmovdqa %ymm13, 320(%rdi)
1791 ; AVX2-NEXT: vmovdqa %ymm14, 288(%rdi)
1792 ; AVX2-NEXT: vmovdqa %ymm15, 256(%rdi)
1793 ; AVX2-NEXT: vmovdqa %ymm7, 224(%rdi)
1794 ; AVX2-NEXT: vmovdqa %ymm6, 192(%rdi)
1795 ; AVX2-NEXT: vmovdqa %ymm5, 160(%rdi)
1796 ; AVX2-NEXT: vmovdqa %ymm4, 128(%rdi)
1797 ; AVX2-NEXT: vmovdqa %ymm3, 96(%rdi)
1798 ; AVX2-NEXT: vmovdqa %ymm2, 64(%rdi)
1799 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdi)
1800 ; AVX2-NEXT: vmovdqa %ymm0, (%rdi)
1801 ; AVX2-NEXT: movq %rbp, %rsp
1802 ; AVX2-NEXT: popq %rbp
1803 ; AVX2-NEXT: vzeroupper
1806 ; AVX512F-LABEL: avg_v512i8_3:
1808 ; AVX512F-NEXT: pushq %rbp
1809 ; AVX512F-NEXT: movq %rsp, %rbp
1810 ; AVX512F-NEXT: andq $-32, %rsp
1811 ; AVX512F-NEXT: subq $32, %rsp
1812 ; AVX512F-NEXT: movq %rdi, %rax
1813 ; AVX512F-NEXT: vmovdqa 240(%rbp), %ymm8
1814 ; AVX512F-NEXT: vmovdqa 208(%rbp), %ymm9
1815 ; AVX512F-NEXT: vmovdqa 176(%rbp), %ymm10
1816 ; AVX512F-NEXT: vmovdqa 144(%rbp), %ymm11
1817 ; AVX512F-NEXT: vmovdqa 112(%rbp), %ymm12
1818 ; AVX512F-NEXT: vmovdqa 80(%rbp), %ymm13
1819 ; AVX512F-NEXT: vmovdqa 48(%rbp), %ymm14
1820 ; AVX512F-NEXT: vmovdqa 16(%rbp), %ymm15
1821 ; AVX512F-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0
1822 ; AVX512F-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1
1823 ; AVX512F-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2
1824 ; AVX512F-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3
1825 ; AVX512F-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4
1826 ; AVX512F-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5
1827 ; AVX512F-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6
1828 ; AVX512F-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7
1829 ; AVX512F-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15
1830 ; AVX512F-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14
1831 ; AVX512F-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13
1832 ; AVX512F-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12
1833 ; AVX512F-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11
1834 ; AVX512F-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10
1835 ; AVX512F-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9
1836 ; AVX512F-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8
1837 ; AVX512F-NEXT: vmovdqa %ymm8, 480(%rdi)
1838 ; AVX512F-NEXT: vmovdqa %ymm9, 448(%rdi)
1839 ; AVX512F-NEXT: vmovdqa %ymm10, 416(%rdi)
1840 ; AVX512F-NEXT: vmovdqa %ymm11, 384(%rdi)
1841 ; AVX512F-NEXT: vmovdqa %ymm12, 352(%rdi)
1842 ; AVX512F-NEXT: vmovdqa %ymm13, 320(%rdi)
1843 ; AVX512F-NEXT: vmovdqa %ymm14, 288(%rdi)
1844 ; AVX512F-NEXT: vmovdqa %ymm15, 256(%rdi)
1845 ; AVX512F-NEXT: vmovdqa %ymm7, 224(%rdi)
1846 ; AVX512F-NEXT: vmovdqa %ymm6, 192(%rdi)
1847 ; AVX512F-NEXT: vmovdqa %ymm5, 160(%rdi)
1848 ; AVX512F-NEXT: vmovdqa %ymm4, 128(%rdi)
1849 ; AVX512F-NEXT: vmovdqa %ymm3, 96(%rdi)
1850 ; AVX512F-NEXT: vmovdqa %ymm2, 64(%rdi)
1851 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdi)
1852 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdi)
1853 ; AVX512F-NEXT: movq %rbp, %rsp
1854 ; AVX512F-NEXT: popq %rbp
1855 ; AVX512F-NEXT: vzeroupper
1856 ; AVX512F-NEXT: retq
1858 ; AVX512BW-LABEL: avg_v512i8_3:
1859 ; AVX512BW: # %bb.0:
1860 ; AVX512BW-NEXT: pushq %rbp
1861 ; AVX512BW-NEXT: movq %rsp, %rbp
1862 ; AVX512BW-NEXT: andq $-64, %rsp
1863 ; AVX512BW-NEXT: subq $64, %rsp
1864 ; AVX512BW-NEXT: movq %rdi, %rax
1865 ; AVX512BW-NEXT: vpavgb 16(%rbp), %zmm0, %zmm0
1866 ; AVX512BW-NEXT: vpavgb 80(%rbp), %zmm1, %zmm1
1867 ; AVX512BW-NEXT: vpavgb 144(%rbp), %zmm2, %zmm2
1868 ; AVX512BW-NEXT: vpavgb 208(%rbp), %zmm3, %zmm3
1869 ; AVX512BW-NEXT: vpavgb 272(%rbp), %zmm4, %zmm4
1870 ; AVX512BW-NEXT: vpavgb 336(%rbp), %zmm5, %zmm5
1871 ; AVX512BW-NEXT: vpavgb 400(%rbp), %zmm6, %zmm6
1872 ; AVX512BW-NEXT: vpavgb 464(%rbp), %zmm7, %zmm7
1873 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdi)
1874 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdi)
1875 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdi)
1876 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdi)
1877 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdi)
1878 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdi)
1879 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdi)
1880 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdi)
1881 ; AVX512BW-NEXT: movq %rbp, %rsp
1882 ; AVX512BW-NEXT: popq %rbp
1883 ; AVX512BW-NEXT: vzeroupper
1884 ; AVX512BW-NEXT: retq
1885 %za = zext <512 x i8> %a to <512 x i16>
1886 %zb = zext <512 x i8> %b to <512 x i16>
1887 %add = add nuw nsw <512 x i16> %za, %zb
1888 %add1 = add nuw nsw <512 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 
1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1889 %lshr = lshr <512 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, 
i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1890 %res = trunc <512 x i16> %lshr to <512 x i8>
1894 ; This is not an avg, but it is structurally similar and previously caused a crash
1895 ; because the constants can't be read with APInt::getZExtValue.
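; Since no pavg can be formed, the lowering scalarizes: each element pair is
; widened and combined with 64-bit integer arithmetic (the addq/adcq/shldq
; sequences below) and then repacked into a vector. The checks mainly guard
; against a regression of that crash rather than a particular code sequence.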
1896 define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind {
1897 ; SSE2-LABEL: not_avg_v16i8_wide_constants:
1899 ; SSE2-NEXT: pushq %rbp
1900 ; SSE2-NEXT: pushq %r15
1901 ; SSE2-NEXT: pushq %r14
1902 ; SSE2-NEXT: pushq %r13
1903 ; SSE2-NEXT: pushq %r12
1904 ; SSE2-NEXT: pushq %rbx
1905 ; SSE2-NEXT: movaps (%rdi), %xmm1
1906 ; SSE2-NEXT: movaps (%rsi), %xmm0
1907 ; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
1908 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1909 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1910 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1911 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1912 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1913 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1914 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1915 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1916 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1917 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1918 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
1919 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
1920 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
1921 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
1922 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
1923 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
1924 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
1925 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
1926 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
1927 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
1928 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
1929 ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1930 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
1931 ; SSE2-NEXT: addq %r11, %rbp
1932 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
1933 ; SSE2-NEXT: addq %r10, %r14
1934 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
1935 ; SSE2-NEXT: addq %r9, %rbx
1936 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
1937 ; SSE2-NEXT: addq %r8, %r11
1938 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
1939 ; SSE2-NEXT: addq %rdx, %r10
1940 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
1941 ; SSE2-NEXT: addq %rcx, %r8
1942 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
1943 ; SSE2-NEXT: addq %rax, %rdi
1944 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
1945 ; SSE2-NEXT: addq %rsi, %rdx
1946 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
1947 ; SSE2-NEXT: leaq -1(%r15,%rsi), %rax
1948 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1949 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
1950 ; SSE2-NEXT: leaq -1(%r12,%rsi), %rax
1951 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1952 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
1953 ; SSE2-NEXT: leaq -1(%r13,%rsi), %rax
1954 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1955 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
1956 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
1957 ; SSE2-NEXT: leaq -1(%rax,%rsi), %rax
1958 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1959 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
1960 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
1961 ; SSE2-NEXT: leaq -1(%rax,%rsi), %rax
1962 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1963 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
1964 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
1965 ; SSE2-NEXT: leaq -1(%rax,%rsi), %rax
1966 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1967 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
1968 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
1969 ; SSE2-NEXT: leaq -1(%rax,%rsi), %rsi
1970 ; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1971 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
1972 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
1973 ; SSE2-NEXT: leaq -1(%rax,%rsi), %rsi
1974 ; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1975 ; SSE2-NEXT: addq $-1, %rbp
1976 ; SSE2-NEXT: movl $0, %r9d
1977 ; SSE2-NEXT: adcq $-1, %r9
1978 ; SSE2-NEXT: addq $-1, %r14
1979 ; SSE2-NEXT: movl $0, %esi
1980 ; SSE2-NEXT: adcq $-1, %rsi
1981 ; SSE2-NEXT: addq $-1, %rbx
1982 ; SSE2-NEXT: movl $0, %eax
1983 ; SSE2-NEXT: adcq $-1, %rax
1984 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1985 ; SSE2-NEXT: addq $-1, %r11
1986 ; SSE2-NEXT: movl $0, %r12d
1987 ; SSE2-NEXT: adcq $-1, %r12
1988 ; SSE2-NEXT: addq $-1, %r10
1989 ; SSE2-NEXT: movl $0, %r13d
1990 ; SSE2-NEXT: adcq $-1, %r13
1991 ; SSE2-NEXT: addq $-1, %r8
1992 ; SSE2-NEXT: movl $0, %r15d
1993 ; SSE2-NEXT: adcq $-1, %r15
1994 ; SSE2-NEXT: addq $-1, %rdi
1995 ; SSE2-NEXT: movl $0, %ecx
1996 ; SSE2-NEXT: adcq $-1, %rcx
1997 ; SSE2-NEXT: addq $-1, %rdx
1998 ; SSE2-NEXT: movl $0, %eax
1999 ; SSE2-NEXT: adcq $-1, %rax
2000 ; SSE2-NEXT: shldq $63, %rdx, %rax
2001 ; SSE2-NEXT: shldq $63, %rdi, %rcx
2002 ; SSE2-NEXT: movq %rcx, %rdx
2003 ; SSE2-NEXT: shldq $63, %r8, %r15
2004 ; SSE2-NEXT: shldq $63, %r10, %r13
2005 ; SSE2-NEXT: shldq $63, %r11, %r12
2006 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
2007 ; SSE2-NEXT: shldq $63, %rbx, %rdi
2008 ; SSE2-NEXT: shldq $63, %r14, %rsi
2009 ; SSE2-NEXT: shldq $63, %rbp, %r9
2010 ; SSE2-NEXT: movq %r9, %xmm8
2011 ; SSE2-NEXT: movq %rsi, %xmm15
2012 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
2013 ; SSE2-NEXT: shrq %rcx
2014 ; SSE2-NEXT: movq %rcx, %xmm9
2015 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
2016 ; SSE2-NEXT: shrq %rcx
2017 ; SSE2-NEXT: movq %rcx, %xmm2
2018 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
2019 ; SSE2-NEXT: shrq %rcx
2020 ; SSE2-NEXT: movq %rcx, %xmm10
2021 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
2022 ; SSE2-NEXT: shrq %rcx
2023 ; SSE2-NEXT: movq %rcx, %xmm4
2024 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
2025 ; SSE2-NEXT: shrq %rcx
2026 ; SSE2-NEXT: movq %rcx, %xmm11
2027 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
2028 ; SSE2-NEXT: shrq %rcx
2029 ; SSE2-NEXT: movq %rcx, %xmm7
2030 ; SSE2-NEXT: movq %rdi, %xmm12
2031 ; SSE2-NEXT: movq %r12, %xmm0
2032 ; SSE2-NEXT: movq %r13, %xmm13
2033 ; SSE2-NEXT: movq %r15, %xmm6
2034 ; SSE2-NEXT: movq %rdx, %xmm14
2035 ; SSE2-NEXT: movq %rax, %xmm5
2036 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2037 ; SSE2-NEXT: shrq %rax
2038 ; SSE2-NEXT: movq %rax, %xmm3
2039 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2040 ; SSE2-NEXT: shrq %rax
2041 ; SSE2-NEXT: movq %rax, %xmm1
2042 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
2043 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
2044 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm15[0,1,2,0]
2045 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm8
2046 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
2047 ; SSE2-NEXT: por %xmm8, %xmm2
2048 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
2049 ; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5]
2050 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
2051 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535]
2052 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1]
2053 ; SSE2-NEXT: pand %xmm8, %xmm7
2054 ; SSE2-NEXT: pandn %xmm4, %xmm8
2055 ; SSE2-NEXT: por %xmm7, %xmm8
2056 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,2]
2057 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
2058 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
2059 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
2060 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
2061 ; SSE2-NEXT: pand %xmm2, %xmm0
2062 ; SSE2-NEXT: pslld $16, %xmm6
2063 ; SSE2-NEXT: pandn %xmm6, %xmm2
2064 ; SSE2-NEXT: por %xmm0, %xmm2
2065 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
2066 ; SSE2-NEXT: psllq $48, %xmm5
2067 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535]
2068 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2069 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
2070 ; SSE2-NEXT: pand %xmm0, %xmm1
2071 ; SSE2-NEXT: pandn %xmm5, %xmm0
2072 ; SSE2-NEXT: por %xmm1, %xmm0
2073 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
2074 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2075 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
2076 ; SSE2-NEXT: movups %xmm2, (%rax)
2077 ; SSE2-NEXT: popq %rbx
2078 ; SSE2-NEXT: popq %r12
2079 ; SSE2-NEXT: popq %r13
2080 ; SSE2-NEXT: popq %r14
2081 ; SSE2-NEXT: popq %r15
2082 ; SSE2-NEXT: popq %rbp
2085 ; AVX1-LABEL: not_avg_v16i8_wide_constants:
2087 ; AVX1-NEXT: pushq %rbp
2088 ; AVX1-NEXT: pushq %r15
2089 ; AVX1-NEXT: pushq %r14
2090 ; AVX1-NEXT: pushq %r13
2091 ; AVX1-NEXT: pushq %r12
2092 ; AVX1-NEXT: pushq %rbx
2093 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2094 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2095 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2096 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
2097 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2098 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
2099 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
2100 ; AVX1-NEXT: vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2101 ; AVX1-NEXT: vpextrq $1, %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2102 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2103 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
2104 ; AVX1-NEXT: vmovq %xmm6, %r10
2105 ; AVX1-NEXT: vpextrq $1, %xmm6, %r9
2106 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
2107 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero
2108 ; AVX1-NEXT: vmovq %xmm7, %r8
2109 ; AVX1-NEXT: vpextrq $1, %xmm7, %rdi
2110 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3]
2111 ; AVX1-NEXT: vpextrq $1, %xmm6, %rcx
2112 ; AVX1-NEXT: vmovq %xmm6, %r14
2113 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
2114 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3]
2115 ; AVX1-NEXT: vpextrq $1, %xmm6, %rax
2116 ; AVX1-NEXT: vmovq %xmm6, %rbp
2117 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
2118 ; AVX1-NEXT: vpextrq $1, %xmm5, %r11
2119 ; AVX1-NEXT: vmovq %xmm5, %r15
2120 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2121 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
2122 ; AVX1-NEXT: vpextrq $1, %xmm4, %rbx
2123 ; AVX1-NEXT: vmovq %xmm4, %rdx
2124 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2125 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
2126 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2127 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm7[2],xmm3[2],xmm7[3],xmm3[3]
2128 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
2129 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero
2130 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
2131 ; AVX1-NEXT: vpextrq $1, %xmm0, %rsi
2132 ; AVX1-NEXT: addq %rcx, %rsi
2133 ; AVX1-NEXT: vmovq %xmm0, %r13
2134 ; AVX1-NEXT: addq %r14, %r13
2135 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2136 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
2137 ; AVX1-NEXT: vpextrq $1, %xmm0, %r12
2138 ; AVX1-NEXT: addq %rax, %r12
2139 ; AVX1-NEXT: vmovq %xmm0, %r14
2140 ; AVX1-NEXT: addq %rbp, %r14
2141 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero
2142 ; AVX1-NEXT: vpextrq $1, %xmm0, %rbp
2143 ; AVX1-NEXT: addq %r11, %rbp
2144 ; AVX1-NEXT: vmovq %xmm0, %r11
2145 ; AVX1-NEXT: addq %r15, %r11
2146 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
2147 ; AVX1-NEXT: vpextrq $1, %xmm0, %r15
2148 ; AVX1-NEXT: addq %rbx, %r15
2149 ; AVX1-NEXT: vmovq %xmm0, %rbx
2150 ; AVX1-NEXT: addq %rdx, %rbx
2151 ; AVX1-NEXT: vpextrq $1, %xmm6, %rax
2152 ; AVX1-NEXT: leaq -1(%rdi,%rax), %rax
2153 ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2154 ; AVX1-NEXT: vmovq %xmm6, %rax
2155 ; AVX1-NEXT: leaq -1(%r8,%rax), %rax
2156 ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2157 ; AVX1-NEXT: vpextrq $1, %xmm5, %rax
2158 ; AVX1-NEXT: leaq -1(%r9,%rax), %rax
2159 ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2160 ; AVX1-NEXT: vmovq %xmm5, %rax
2161 ; AVX1-NEXT: leaq -1(%r10,%rax), %rax
2162 ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2163 ; AVX1-NEXT: vpextrq $1, %xmm4, %rax
2164 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
2165 ; AVX1-NEXT: leaq -1(%rcx,%rax), %rax
2166 ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2167 ; AVX1-NEXT: vmovq %xmm4, %rax
2168 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
2169 ; AVX1-NEXT: leaq -1(%rcx,%rax), %rax
2170 ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2171 ; AVX1-NEXT: vpextrq $1, %xmm8, %rax
2172 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
2173 ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
2174 ; AVX1-NEXT: leaq -1(%rax,%rcx), %rax
2175 ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2176 ; AVX1-NEXT: vmovq %xmm8, %rax
2177 ; AVX1-NEXT: vmovq %xmm0, %rcx
2178 ; AVX1-NEXT: leaq -1(%rax,%rcx), %rax
2179 ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2180 ; AVX1-NEXT: xorl %r10d, %r10d
2181 ; AVX1-NEXT: addq $-1, %rsi
2182 ; AVX1-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2183 ; AVX1-NEXT: movl $0, %ecx
2184 ; AVX1-NEXT: adcq $-1, %rcx
2185 ; AVX1-NEXT: addq $-1, %r13
2186 ; AVX1-NEXT: movl $0, %eax
2187 ; AVX1-NEXT: adcq $-1, %rax
2188 ; AVX1-NEXT: addq $-1, %r12
2189 ; AVX1-NEXT: movl $0, %edi
2190 ; AVX1-NEXT: adcq $-1, %rdi
2191 ; AVX1-NEXT: addq $-1, %r14
2192 ; AVX1-NEXT: movl $0, %esi
2193 ; AVX1-NEXT: adcq $-1, %rsi
2194 ; AVX1-NEXT: addq $-1, %rbp
2195 ; AVX1-NEXT: movl $0, %r9d
2196 ; AVX1-NEXT: adcq $-1, %r9
2197 ; AVX1-NEXT: addq $-1, %r11
2198 ; AVX1-NEXT: movl $0, %r8d
2199 ; AVX1-NEXT: adcq $-1, %r8
2200 ; AVX1-NEXT: addq $-1, %r15
2201 ; AVX1-NEXT: movl $0, %edx
2202 ; AVX1-NEXT: adcq $-1, %rdx
2203 ; AVX1-NEXT: addq $-1, %rbx
2204 ; AVX1-NEXT: adcq $-1, %r10
2205 ; AVX1-NEXT: shldq $63, %r11, %r8
2206 ; AVX1-NEXT: shldq $63, %rbp, %r9
2207 ; AVX1-NEXT: shldq $63, %r14, %rsi
2208 ; AVX1-NEXT: shldq $63, %r12, %rdi
2209 ; AVX1-NEXT: shldq $63, %r13, %rax
2210 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
2211 ; AVX1-NEXT: shldq $63, %rbp, %rcx
2212 ; AVX1-NEXT: shldq $63, %rbx, %r10
2213 ; AVX1-NEXT: shldq $63, %r15, %rdx
2214 ; AVX1-NEXT: vmovq %rcx, %xmm8
2215 ; AVX1-NEXT: vmovq %rax, %xmm9
2216 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2217 ; AVX1-NEXT: shrq %rax
2218 ; AVX1-NEXT: vmovq %rax, %xmm0
2219 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2220 ; AVX1-NEXT: shrq %rax
2221 ; AVX1-NEXT: vmovq %rax, %xmm11
2222 ; AVX1-NEXT: vmovq %rdi, %xmm12
2223 ; AVX1-NEXT: vmovq %rsi, %xmm13
2224 ; AVX1-NEXT: vmovq %rdx, %xmm14
2225 ; AVX1-NEXT: vmovq %r10, %xmm15
2226 ; AVX1-NEXT: vmovq %r9, %xmm10
2227 ; AVX1-NEXT: vmovq %r8, %xmm1
2228 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2229 ; AVX1-NEXT: shrq %rax
2230 ; AVX1-NEXT: vmovq %rax, %xmm2
2231 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2232 ; AVX1-NEXT: shrq %rax
2233 ; AVX1-NEXT: vmovq %rax, %xmm3
2234 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2235 ; AVX1-NEXT: shrq %rax
2236 ; AVX1-NEXT: vmovq %rax, %xmm4
2237 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2238 ; AVX1-NEXT: shrq %rax
2239 ; AVX1-NEXT: vmovq %rax, %xmm5
2240 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2241 ; AVX1-NEXT: shrq %rax
2242 ; AVX1-NEXT: vmovq %rax, %xmm6
2243 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2244 ; AVX1-NEXT: shrq %rax
2245 ; AVX1-NEXT: vmovq %rax, %xmm7
2246 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
2247 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
2248 ; AVX1-NEXT: vpsllq $48, %xmm8, %xmm8
2249 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,1,1]
2250 ; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7]
2251 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
2252 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
2253 ; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
2254 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5,6,7]
2255 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7]
2256 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
2257 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2258 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2259 ; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
2260 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
2261 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
2262 ; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
2263 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
2264 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
2265 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6,7]
2266 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
2267 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2268 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
2269 ; AVX1-NEXT: popq %rbx
2270 ; AVX1-NEXT: popq %r12
2271 ; AVX1-NEXT: popq %r13
2272 ; AVX1-NEXT: popq %r14
2273 ; AVX1-NEXT: popq %r15
2274 ; AVX1-NEXT: popq %rbp
2277 ; AVX2-LABEL: not_avg_v16i8_wide_constants:
2279 ; AVX2-NEXT: pushq %rbp
2280 ; AVX2-NEXT: pushq %r15
2281 ; AVX2-NEXT: pushq %r14
2282 ; AVX2-NEXT: pushq %r13
2283 ; AVX2-NEXT: pushq %r12
2284 ; AVX2-NEXT: pushq %rbx
2285 ; AVX2-NEXT: subq $16, %rsp
2286 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
2287 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
2288 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2289 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2290 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2291 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
2292 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
2293 ; AVX2-NEXT: vpextrq $1, %xmm4, %rbx
2294 ; AVX2-NEXT: vmovq %xmm4, %rbp
2295 ; AVX2-NEXT: vpextrq $1, %xmm3, %rdi
2296 ; AVX2-NEXT: vmovq %xmm3, %rcx
2297 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2298 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2299 ; AVX2-NEXT: vpextrq $1, %xmm3, %rdx
2300 ; AVX2-NEXT: vmovq %xmm3, %r9
2301 ; AVX2-NEXT: vpextrq $1, %xmm2, %r13
2302 ; AVX2-NEXT: vmovq %xmm2, %r12
2303 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2304 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2305 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2306 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2307 ; AVX2-NEXT: vpextrq $1, %xmm3, %r14
2308 ; AVX2-NEXT: vmovq %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2309 ; AVX2-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2310 ; AVX2-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2311 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2312 ; AVX2-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2313 ; AVX2-NEXT: vmovq %xmm1, %r10
2314 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
2315 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
2316 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2317 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2318 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
2319 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
2320 ; AVX2-NEXT: vpextrq $1, %xmm4, %rax
2321 ; AVX2-NEXT: addq %rbx, %rax
2322 ; AVX2-NEXT: movq %rax, %rbx
2323 ; AVX2-NEXT: vmovq %xmm4, %rsi
2324 ; AVX2-NEXT: addq %rbp, %rsi
2325 ; AVX2-NEXT: vpextrq $1, %xmm3, %rax
2326 ; AVX2-NEXT: addq %rdi, %rax
2327 ; AVX2-NEXT: movq %rax, %rdi
2328 ; AVX2-NEXT: vmovq %xmm3, %r11
2329 ; AVX2-NEXT: addq %rcx, %r11
2330 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2331 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2332 ; AVX2-NEXT: vpextrq $1, %xmm3, %rcx
2333 ; AVX2-NEXT: addq %rdx, %rcx
2334 ; AVX2-NEXT: vmovq %xmm3, %r8
2335 ; AVX2-NEXT: addq %r9, %r8
2336 ; AVX2-NEXT: vpextrq $1, %xmm2, %r9
2337 ; AVX2-NEXT: addq %r13, %r9
2338 ; AVX2-NEXT: vmovq %xmm2, %r15
2339 ; AVX2-NEXT: addq %r12, %r15
2340 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2341 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
2342 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2343 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2344 ; AVX2-NEXT: vpextrq $1, %xmm3, %rax
2345 ; AVX2-NEXT: addq %r14, %rax
2346 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2347 ; AVX2-NEXT: vmovq %xmm3, %rax
2348 ; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
2349 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2350 ; AVX2-NEXT: vpextrq $1, %xmm2, %rax
2351 ; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
2352 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2353 ; AVX2-NEXT: vmovq %xmm2, %rax
2354 ; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
2355 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2356 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2357 ; AVX2-NEXT: vpextrq $1, %xmm0, %rbp
2358 ; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
2359 ; AVX2-NEXT: vmovq %xmm0, %r12
2360 ; AVX2-NEXT: addq %r10, %r12
2361 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
2362 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
2363 ; AVX2-NEXT: vpextrq $1, %xmm0, %r10
2364 ; AVX2-NEXT: addq %rax, %r10
2365 ; AVX2-NEXT: vmovq %xmm1, %rax
2366 ; AVX2-NEXT: vmovq %xmm0, %rdx
2367 ; AVX2-NEXT: addq %rax, %rdx
2368 ; AVX2-NEXT: addq $-1, %rbx
2369 ; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2370 ; AVX2-NEXT: movl $0, %eax
2371 ; AVX2-NEXT: adcq $-1, %rax
2372 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2373 ; AVX2-NEXT: addq $-1, %rsi
2374 ; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2375 ; AVX2-NEXT: movl $0, %eax
2376 ; AVX2-NEXT: adcq $-1, %rax
2377 ; AVX2-NEXT: movq %rax, (%rsp) # 8-byte Spill
2378 ; AVX2-NEXT: addq $-1, %rdi
2379 ; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2380 ; AVX2-NEXT: movl $0, %eax
2381 ; AVX2-NEXT: adcq $-1, %rax
2382 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2383 ; AVX2-NEXT: addq $-1, %r11
2384 ; AVX2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2385 ; AVX2-NEXT: movl $0, %eax
2386 ; AVX2-NEXT: adcq $-1, %rax
2387 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2388 ; AVX2-NEXT: addq $-1, %rcx
2389 ; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2390 ; AVX2-NEXT: movl $0, %eax
2391 ; AVX2-NEXT: adcq $-1, %rax
2392 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2393 ; AVX2-NEXT: addq $-1, %r8
2394 ; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2395 ; AVX2-NEXT: movl $0, %eax
2396 ; AVX2-NEXT: adcq $-1, %rax
2397 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2398 ; AVX2-NEXT: addq $-1, %r9
2399 ; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2400 ; AVX2-NEXT: movl $0, %eax
2401 ; AVX2-NEXT: adcq $-1, %rax
2402 ; AVX2-NEXT: movq %rax, %rsi
2403 ; AVX2-NEXT: addq $-1, %r15
2404 ; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2405 ; AVX2-NEXT: movl $0, %r15d
2406 ; AVX2-NEXT: adcq $-1, %r15
2407 ; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2408 ; AVX2-NEXT: movl $0, %r13d
2409 ; AVX2-NEXT: adcq $-1, %r13
2410 ; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2411 ; AVX2-NEXT: movl $0, %r14d
2412 ; AVX2-NEXT: adcq $-1, %r14
2413 ; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2414 ; AVX2-NEXT: movl $0, %ebx
2415 ; AVX2-NEXT: adcq $-1, %rbx
2416 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2417 ; AVX2-NEXT: addq $-1, %rax
2418 ; AVX2-NEXT: movl $0, %r11d
2419 ; AVX2-NEXT: adcq $-1, %r11
2420 ; AVX2-NEXT: addq $-1, %rbp
2421 ; AVX2-NEXT: movl $0, %r9d
2422 ; AVX2-NEXT: adcq $-1, %r9
2423 ; AVX2-NEXT: addq $-1, %r12
2424 ; AVX2-NEXT: movl $0, %r8d
2425 ; AVX2-NEXT: adcq $-1, %r8
2426 ; AVX2-NEXT: addq $-1, %r10
2427 ; AVX2-NEXT: movl $0, %edi
2428 ; AVX2-NEXT: adcq $-1, %rdi
2429 ; AVX2-NEXT: addq $-1, %rdx
2430 ; AVX2-NEXT: movl $0, %ecx
2431 ; AVX2-NEXT: adcq $-1, %rcx
2432 ; AVX2-NEXT: shldq $63, %rdx, %rcx
2433 ; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2434 ; AVX2-NEXT: shldq $63, %r10, %rdi
2435 ; AVX2-NEXT: shldq $63, %r12, %r8
2436 ; AVX2-NEXT: shldq $63, %rbp, %r9
2437 ; AVX2-NEXT: shldq $63, %rax, %r11
2438 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
2439 ; AVX2-NEXT: shldq $63, %rdx, %rbx
2440 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
2441 ; AVX2-NEXT: shldq $63, %rdx, %r14
2442 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
2443 ; AVX2-NEXT: shldq $63, %rdx, %r13
2444 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2445 ; AVX2-NEXT: shldq $63, %rax, %r15
2446 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2447 ; AVX2-NEXT: shldq $63, %rax, %rsi
2448 ; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2449 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
2450 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2451 ; AVX2-NEXT: shldq $63, %rax, %rsi
2452 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
2453 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2454 ; AVX2-NEXT: shldq $63, %rax, %r12
2455 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
2456 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2457 ; AVX2-NEXT: shldq $63, %rax, %rcx
2458 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
2459 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2460 ; AVX2-NEXT: shldq $63, %rax, %r10
2461 ; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload
2462 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
2463 ; AVX2-NEXT: shldq $63, %rdx, %rax
2464 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
2465 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
2466 ; AVX2-NEXT: shldq $63, %rdx, %rbp
2467 ; AVX2-NEXT: vmovq %rbp, %xmm8
2468 ; AVX2-NEXT: vmovq %rax, %xmm9
2469 ; AVX2-NEXT: vmovq %r10, %xmm0
2470 ; AVX2-NEXT: vmovq %rcx, %xmm1
2471 ; AVX2-NEXT: vmovq %r12, %xmm12
2472 ; AVX2-NEXT: vmovq %rsi, %xmm13
2473 ; AVX2-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 8-byte Folded Reload
2474 ; AVX2-NEXT: # xmm14 = mem[0],zero
2475 ; AVX2-NEXT: vmovq %r15, %xmm15
2476 ; AVX2-NEXT: vmovq %r13, %xmm10
2477 ; AVX2-NEXT: vmovq %r14, %xmm11
2478 ; AVX2-NEXT: vmovq %rbx, %xmm2
2479 ; AVX2-NEXT: vmovq %r11, %xmm3
2480 ; AVX2-NEXT: vmovq %r9, %xmm4
2481 ; AVX2-NEXT: vmovq %r8, %xmm5
2482 ; AVX2-NEXT: vmovq %rdi, %xmm6
2483 ; AVX2-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
2484 ; AVX2-NEXT: # xmm7 = mem[0],zero
2485 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
2486 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2487 ; AVX2-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1]
2488 ; AVX2-NEXT: vpbroadcastw %xmm9, %xmm0
2489 ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,6],xmm8[7]
2490 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
2491 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
2492 ; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
2493 ; AVX2-NEXT: vpbroadcastw %xmm9, %xmm1
2494 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
2495 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3]
2496 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
2497 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2498 ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
2499 ; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2
2500 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7]
2501 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
2502 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
2503 ; AVX2-NEXT: vpslld $16, %xmm3, %xmm3
2504 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
2505 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
2506 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2507 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
2508 ; AVX2-NEXT: addq $16, %rsp
2509 ; AVX2-NEXT: popq %rbx
2510 ; AVX2-NEXT: popq %r12
2511 ; AVX2-NEXT: popq %r13
2512 ; AVX2-NEXT: popq %r14
2513 ; AVX2-NEXT: popq %r15
2514 ; AVX2-NEXT: popq %rbp
2515 ; AVX2-NEXT: vzeroupper
2516 ; AVX2-NEXT: retq
2517 ;
2518 ; AVX512F-LABEL: not_avg_v16i8_wide_constants:
2519 ; AVX512F: # %bb.0:
2520 ; AVX512F-NEXT: pushq %rbp
2521 ; AVX512F-NEXT: pushq %r15
2522 ; AVX512F-NEXT: pushq %r14
2523 ; AVX512F-NEXT: pushq %r13
2524 ; AVX512F-NEXT: pushq %r12
2525 ; AVX512F-NEXT: pushq %rbx
2526 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
2527 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
2528 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2529 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2530 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
2531 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2532 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4
2533 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
2534 ; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5
2535 ; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx
2536 ; AVX512F-NEXT: vmovq %xmm5, %rcx
2537 ; AVX512F-NEXT: vpextrq $1, %xmm4, %rax
2538 ; AVX512F-NEXT: vmovq %xmm4, %rbx
2539 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2540 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4
2541 ; AVX512F-NEXT: vpextrq $1, %xmm4, %rdi
2542 ; AVX512F-NEXT: vmovq %xmm4, %rsi
2543 ; AVX512F-NEXT: vpextrq $1, %xmm1, %r13
2544 ; AVX512F-NEXT: vmovq %xmm1, %r15
2545 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
2546 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2547 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
2548 ; AVX512F-NEXT: vpextrq $1, %xmm2, %r12
2549 ; AVX512F-NEXT: vmovq %xmm2, %r14
2550 ; AVX512F-NEXT: vpextrq $1, %xmm1, %r11
2551 ; AVX512F-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2552 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
2553 ; AVX512F-NEXT: vpextrq $1, %xmm1, %r10
2554 ; AVX512F-NEXT: vmovq %xmm1, %r9
2555 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2556 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2557 ; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm3
2558 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2559 ; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
2560 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
2561 ; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5
2562 ; AVX512F-NEXT: vpextrq $1, %xmm5, %rbp
2563 ; AVX512F-NEXT: leal -1(%rdx,%rbp), %edx
2564 ; AVX512F-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2565 ; AVX512F-NEXT: vmovq %xmm5, %rbp
2566 ; AVX512F-NEXT: leal -1(%rcx,%rbp), %ecx
2567 ; AVX512F-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2568 ; AVX512F-NEXT: vpextrq $1, %xmm4, %rbp
2569 ; AVX512F-NEXT: leal -1(%rax,%rbp), %eax
2570 ; AVX512F-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2571 ; AVX512F-NEXT: vmovq %xmm4, %rbp
2572 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
2573 ; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
2574 ; AVX512F-NEXT: leal -1(%rbx,%rbp), %r8d
2575 ; AVX512F-NEXT: vpextrq $1, %xmm4, %rbp
2576 ; AVX512F-NEXT: leal -1(%rdi,%rbp), %edi
2577 ; AVX512F-NEXT: vmovq %xmm4, %rbp
2578 ; AVX512F-NEXT: leal -1(%rsi,%rbp), %esi
2579 ; AVX512F-NEXT: vpextrq $1, %xmm3, %rbp
2580 ; AVX512F-NEXT: leal -1(%r13,%rbp), %r13d
2581 ; AVX512F-NEXT: vmovq %xmm3, %rbp
2582 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
2583 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2584 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
2585 ; AVX512F-NEXT: leal -1(%r15,%rbp), %r15d
2586 ; AVX512F-NEXT: vpextrq $1, %xmm3, %rbp
2587 ; AVX512F-NEXT: leal -1(%r12,%rbp), %r12d
2588 ; AVX512F-NEXT: vmovq %xmm3, %rbp
2589 ; AVX512F-NEXT: leal -1(%r14,%rbp), %r14d
2590 ; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx
2591 ; AVX512F-NEXT: leal -1(%r11,%rdx), %r11d
2592 ; AVX512F-NEXT: vmovq %xmm2, %rbp
2593 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
2594 ; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2595 ; AVX512F-NEXT: leal -1(%rax,%rbp), %ebp
2596 ; AVX512F-NEXT: vpextrq $1, %xmm2, %rcx
2597 ; AVX512F-NEXT: leal -1(%r10,%rcx), %ecx
2598 ; AVX512F-NEXT: vmovq %xmm2, %rax
2599 ; AVX512F-NEXT: leal -1(%r9,%rax), %eax
2600 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
2601 ; AVX512F-NEXT: vpextrq $1, %xmm1, %r10
2602 ; AVX512F-NEXT: leal -1(%rdx,%r10), %edx
2603 ; AVX512F-NEXT: vmovq %xmm0, %r10
2604 ; AVX512F-NEXT: vmovq %xmm1, %r9
2605 ; AVX512F-NEXT: leaq -1(%r10,%r9), %rbx
2606 ; AVX512F-NEXT: shrq %rbx
2607 ; AVX512F-NEXT: vmovd %ebx, %xmm0
2608 ; AVX512F-NEXT: shrl %edx
2609 ; AVX512F-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
2610 ; AVX512F-NEXT: shrl %eax
2611 ; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
2612 ; AVX512F-NEXT: shrl %ecx
2613 ; AVX512F-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
2614 ; AVX512F-NEXT: shrl %ebp
2615 ; AVX512F-NEXT: vpinsrb $4, %ebp, %xmm0, %xmm0
2616 ; AVX512F-NEXT: shrl %r11d
2617 ; AVX512F-NEXT: vpinsrb $5, %r11d, %xmm0, %xmm0
2618 ; AVX512F-NEXT: shrl %r14d
2619 ; AVX512F-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
2620 ; AVX512F-NEXT: shrl %r12d
2621 ; AVX512F-NEXT: vpinsrb $7, %r12d, %xmm0, %xmm0
2622 ; AVX512F-NEXT: shrl %r15d
2623 ; AVX512F-NEXT: vpinsrb $8, %r15d, %xmm0, %xmm0
2624 ; AVX512F-NEXT: shrl %r13d
2625 ; AVX512F-NEXT: vpinsrb $9, %r13d, %xmm0, %xmm0
2626 ; AVX512F-NEXT: shrl %esi
2627 ; AVX512F-NEXT: vpinsrb $10, %esi, %xmm0, %xmm0
2628 ; AVX512F-NEXT: shrl %edi
2629 ; AVX512F-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0
2630 ; AVX512F-NEXT: shrl %r8d
2631 ; AVX512F-NEXT: vpinsrb $12, %r8d, %xmm0, %xmm0
2632 ; AVX512F-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2633 ; AVX512F-NEXT: shrl %eax
2634 ; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
2635 ; AVX512F-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2636 ; AVX512F-NEXT: shrl %eax
2637 ; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
2638 ; AVX512F-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2639 ; AVX512F-NEXT: shrl %eax
2640 ; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
2641 ; AVX512F-NEXT: vmovdqu %xmm0, (%rax)
2642 ; AVX512F-NEXT: popq %rbx
2643 ; AVX512F-NEXT: popq %r12
2644 ; AVX512F-NEXT: popq %r13
2645 ; AVX512F-NEXT: popq %r14
2646 ; AVX512F-NEXT: popq %r15
2647 ; AVX512F-NEXT: popq %rbp
2648 ; AVX512F-NEXT: vzeroupper
2649 ; AVX512F-NEXT: retq
2650 ;
2651 ; AVX512BW-LABEL: not_avg_v16i8_wide_constants:
2652 ; AVX512BW: # %bb.0:
2653 ; AVX512BW-NEXT: pushq %rbp
2654 ; AVX512BW-NEXT: pushq %r15
2655 ; AVX512BW-NEXT: pushq %r14
2656 ; AVX512BW-NEXT: pushq %r13
2657 ; AVX512BW-NEXT: pushq %r12
2658 ; AVX512BW-NEXT: pushq %rbx
2659 ; AVX512BW-NEXT: subq $24, %rsp
2660 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
2661 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
2662 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2663 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2664 ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4
2665 ; AVX512BW-NEXT: vmovq %xmm4, %rbx
2666 ; AVX512BW-NEXT: vpextrq $1, %xmm4, %rbp
2667 ; AVX512BW-NEXT: vmovq %xmm3, %rdi
2668 ; AVX512BW-NEXT: vpextrq $1, %xmm3, %rsi
2669 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
2670 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2671 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
2672 ; AVX512BW-NEXT: vmovq %xmm3, %rdx
2673 ; AVX512BW-NEXT: vpextrq $1, %xmm3, %r15
2674 ; AVX512BW-NEXT: vmovq %xmm2, %r8
2675 ; AVX512BW-NEXT: vpextrq $1, %xmm2, %r14
2676 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
2677 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2678 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2679 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
2680 ; AVX512BW-NEXT: vmovq %xmm3, %r9
2681 ; AVX512BW-NEXT: vpextrq $1, %xmm3, %r10
2682 ; AVX512BW-NEXT: vmovq %xmm2, %r11
2683 ; AVX512BW-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2684 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
2685 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2686 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
2687 ; AVX512BW-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2688 ; AVX512BW-NEXT: vpextrq $1, %xmm2, %r13
2689 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2690 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2691 ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4
2692 ; AVX512BW-NEXT: vmovq %xmm4, %rax
2693 ; AVX512BW-NEXT: addq %rbx, %rax
2694 ; AVX512BW-NEXT: movq %rax, %rbx
2695 ; AVX512BW-NEXT: vpextrq $1, %xmm4, %rax
2696 ; AVX512BW-NEXT: addq %rbp, %rax
2697 ; AVX512BW-NEXT: movq %rax, %rbp
2698 ; AVX512BW-NEXT: vmovq %xmm3, %rcx
2699 ; AVX512BW-NEXT: addq %rdi, %rcx
2700 ; AVX512BW-NEXT: vpextrq $1, %xmm3, %r12
2701 ; AVX512BW-NEXT: addq %rsi, %r12
2702 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
2703 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
2704 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
2705 ; AVX512BW-NEXT: vmovq %xmm3, %rax
2706 ; AVX512BW-NEXT: addq %rdx, %rax
2707 ; AVX512BW-NEXT: movq %rax, %rdx
2708 ; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax
2709 ; AVX512BW-NEXT: addq %r15, %rax
2710 ; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2711 ; AVX512BW-NEXT: vmovq %xmm2, %rax
2712 ; AVX512BW-NEXT: addq %r8, %rax
2713 ; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2714 ; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax
2715 ; AVX512BW-NEXT: addq %r14, %rax
2716 ; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2717 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2718 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2719 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2720 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
2721 ; AVX512BW-NEXT: vmovq %xmm3, %rax
2722 ; AVX512BW-NEXT: addq %r9, %rax
2723 ; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2724 ; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax
2725 ; AVX512BW-NEXT: addq %r10, %rax
2726 ; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2727 ; AVX512BW-NEXT: vmovq %xmm2, %rax
2728 ; AVX512BW-NEXT: addq %r11, %rax
2729 ; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2730 ; AVX512BW-NEXT: vpextrq $1, %xmm2, %r14
2731 ; AVX512BW-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
2732 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
2733 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2734 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
2735 ; AVX512BW-NEXT: vmovq %xmm2, %r10
2736 ; AVX512BW-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
2737 ; AVX512BW-NEXT: vpextrq $1, %xmm2, %r9
2738 ; AVX512BW-NEXT: addq %r13, %r9
2739 ; AVX512BW-NEXT: vmovq %xmm0, %rax
2740 ; AVX512BW-NEXT: vmovq %xmm1, %r8
2741 ; AVX512BW-NEXT: addq %rax, %r8
2742 ; AVX512BW-NEXT: vpextrq $1, %xmm0, %rdi
2743 ; AVX512BW-NEXT: vpextrq $1, %xmm1, %rsi
2744 ; AVX512BW-NEXT: addq %rdi, %rsi
2745 ; AVX512BW-NEXT: addq $-1, %rbx
2746 ; AVX512BW-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2747 ; AVX512BW-NEXT: movl $0, %r15d
2748 ; AVX512BW-NEXT: adcq $-1, %r15
2749 ; AVX512BW-NEXT: addq $-1, %rbp
2750 ; AVX512BW-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2751 ; AVX512BW-NEXT: movl $0, %ebx
2752 ; AVX512BW-NEXT: adcq $-1, %rbx
2753 ; AVX512BW-NEXT: addq $-1, %rcx
2754 ; AVX512BW-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2755 ; AVX512BW-NEXT: movl $0, %r11d
2756 ; AVX512BW-NEXT: adcq $-1, %r11
2757 ; AVX512BW-NEXT: addq $-1, %r12
2758 ; AVX512BW-NEXT: movq %r12, (%rsp) # 8-byte Spill
2759 ; AVX512BW-NEXT: movl $0, %edi
2760 ; AVX512BW-NEXT: adcq $-1, %rdi
2761 ; AVX512BW-NEXT: addq $-1, %rdx
2762 ; AVX512BW-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2763 ; AVX512BW-NEXT: movl $0, %eax
2764 ; AVX512BW-NEXT: adcq $-1, %rax
2765 ; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2766 ; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2767 ; AVX512BW-NEXT: movl $0, %eax
2768 ; AVX512BW-NEXT: adcq $-1, %rax
2769 ; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2770 ; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2771 ; AVX512BW-NEXT: movl $0, %r13d
2772 ; AVX512BW-NEXT: adcq $-1, %r13
2773 ; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2774 ; AVX512BW-NEXT: movl $0, %r12d
2775 ; AVX512BW-NEXT: adcq $-1, %r12
2776 ; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2777 ; AVX512BW-NEXT: movl $0, %eax
2778 ; AVX512BW-NEXT: adcq $-1, %rax
2779 ; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2780 ; AVX512BW-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2781 ; AVX512BW-NEXT: movl $0, %eax
2782 ; AVX512BW-NEXT: adcq $-1, %rax
2783 ; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2784 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
2785 ; AVX512BW-NEXT: addq $-1, %rcx
2786 ; AVX512BW-NEXT: movl $0, %eax
2787 ; AVX512BW-NEXT: adcq $-1, %rax
2788 ; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2789 ; AVX512BW-NEXT: addq $-1, %r14
2790 ; AVX512BW-NEXT: movl $0, %eax
2791 ; AVX512BW-NEXT: adcq $-1, %rax
2792 ; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2793 ; AVX512BW-NEXT: addq $-1, %r10
2794 ; AVX512BW-NEXT: movl $0, %eax
2795 ; AVX512BW-NEXT: adcq $-1, %rax
2796 ; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2797 ; AVX512BW-NEXT: addq $-1, %r9
2798 ; AVX512BW-NEXT: movl $0, %edx
2799 ; AVX512BW-NEXT: adcq $-1, %rdx
2800 ; AVX512BW-NEXT: addq $-1, %r8
2801 ; AVX512BW-NEXT: movl $0, %eax
2802 ; AVX512BW-NEXT: adcq $-1, %rax
2803 ; AVX512BW-NEXT: addq $-1, %rsi
2804 ; AVX512BW-NEXT: movl $0, %ebp
2805 ; AVX512BW-NEXT: adcq $-1, %rbp
2806 ; AVX512BW-NEXT: shldq $63, %rsi, %rbp
2807 ; AVX512BW-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2808 ; AVX512BW-NEXT: shldq $63, %r8, %rax
2809 ; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2810 ; AVX512BW-NEXT: shldq $63, %r9, %rdx
2811 ; AVX512BW-NEXT: movq %rdx, %rbp
2812 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
2813 ; AVX512BW-NEXT: shldq $63, %r10, %r8
2814 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
2815 ; AVX512BW-NEXT: shldq $63, %r14, %r10
2816 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
2817 ; AVX512BW-NEXT: shldq $63, %rcx, %r9
2818 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
2819 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2820 ; AVX512BW-NEXT: shldq $63, %rax, %r14
2821 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2822 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
2823 ; AVX512BW-NEXT: shldq $63, %rax, %rsi
2824 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2825 ; AVX512BW-NEXT: shldq $63, %rax, %r12
2826 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2827 ; AVX512BW-NEXT: shldq $63, %rax, %r13
2828 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2829 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
2830 ; AVX512BW-NEXT: shldq $63, %rax, %rdx
2831 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
2832 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2833 ; AVX512BW-NEXT: shldq $63, %rax, %rcx
2834 ; AVX512BW-NEXT: movq (%rsp), %rax # 8-byte Reload
2835 ; AVX512BW-NEXT: shldq $63, %rax, %rdi
2836 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2837 ; AVX512BW-NEXT: shldq $63, %rax, %r11
2838 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2839 ; AVX512BW-NEXT: shldq $63, %rax, %rbx
2840 ; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2841 ; AVX512BW-NEXT: shldq $63, %rax, %r15
2842 ; AVX512BW-NEXT: vmovq %r15, %xmm0
2843 ; AVX512BW-NEXT: vmovq %rbx, %xmm1
2844 ; AVX512BW-NEXT: vmovq %r11, %xmm2
2845 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2846 ; AVX512BW-NEXT: vmovq %rdi, %xmm1
2847 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
2848 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
2849 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
2850 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
2851 ; AVX512BW-NEXT: vmovd %eax, %xmm2
2852 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
2853 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm1
2854 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
2855 ; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
2856 ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
2857 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
2858 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
2859 ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0
2860 ; AVX512BW-NEXT: vmovq %rcx, %xmm1
2861 ; AVX512BW-NEXT: vmovq %rdx, %xmm2
2862 ; AVX512BW-NEXT: vmovq %r13, %xmm3
2863 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2864 ; AVX512BW-NEXT: vmovq %r12, %xmm2
2865 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
2866 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
2867 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
2868 ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
2869 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
2870 ; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
2871 ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
2872 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm2
2873 ; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
2874 ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
2875 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1
2876 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
2877 ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
2878 ; AVX512BW-NEXT: vmovq %rsi, %xmm1
2879 ; AVX512BW-NEXT: vmovq %r14, %xmm2
2880 ; AVX512BW-NEXT: vmovq %r9, %xmm3
2881 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2882 ; AVX512BW-NEXT: vmovq %r10, %xmm2
2883 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
2884 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
2885 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
2886 ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
2887 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
2888 ; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
2889 ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
2890 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm2
2891 ; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
2892 ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
2893 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1
2894 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
2895 ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
2896 ; AVX512BW-NEXT: vmovq %r8, %xmm1
2897 ; AVX512BW-NEXT: vmovq %rbp, %xmm2
2898 ; AVX512BW-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 8-byte Folded Reload
2899 ; AVX512BW-NEXT: # xmm3 = mem[0],zero
2900 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2901 ; AVX512BW-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 8-byte Folded Reload
2902 ; AVX512BW-NEXT: # xmm2 = mem[0],zero
2903 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
2904 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
2905 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
2906 ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
2907 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
2908 ; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
2909 ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
2910 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm2
2911 ; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
2912 ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
2913 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1
2914 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
2915 ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
2916 ; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
2917 ; AVX512BW-NEXT: addq $24, %rsp
2918 ; AVX512BW-NEXT: popq %rbx
2919 ; AVX512BW-NEXT: popq %r12
2920 ; AVX512BW-NEXT: popq %r13
2921 ; AVX512BW-NEXT: popq %r14
2922 ; AVX512BW-NEXT: popq %r15
2923 ; AVX512BW-NEXT: popq %rbp
2924 ; AVX512BW-NEXT: vzeroupper
2925 ; AVX512BW-NEXT: retq
2926 %1 = load <16 x i8>, <16 x i8>* %a
2927 %2 = load <16 x i8>, <16 x i8>* %b
2928 %3 = zext <16 x i8> %1 to <16 x i128>
2929 %4 = zext <16 x i8> %2 to <16 x i128>
2930 %5 = add nuw nsw <16 x i128> %3, <i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1>
2931 %6 = add nuw nsw <16 x i128> %5, %4
2932 %7 = lshr <16 x i128> %6, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
2933 %8 = trunc <16 x i128> %7 to <16 x i8>
2934 store <16 x i8> %8, <16 x i8>* undef, align 4
2935 ret void
2936 }
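; In @not_avg_v16i8_wide_constants above, the inputs are widened all the way to
; i128 and the constant added is -1 rather than 1, so this is not the
; (x + y + 1) >> 1 rounding-average idiom; the backend must not form pavgb here
; and instead emits the scalar arithmetic checked above (add/adc/shld on AVX2
; and AVX512BW, lea/shr on AVX512F).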
2938 ; Make sure we don't fail on single element vectors.
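; The <1 x i8> case is simply scalarized: both SSE2 and AVX compute
; (zext(x) + zext(y) + 1) >> 1 in a GPR with movzbl + lea + shr, with no vector
; pavgb involved.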
2939 define <1 x i8> @avg_v1i8(<1 x i8> %x, <1 x i8> %y) {
2940 ; SSE2-LABEL: avg_v1i8:
2941 ; SSE2: # %bb.0:
2942 ; SSE2-NEXT: movzbl %dil, %eax
2943 ; SSE2-NEXT: movzbl %sil, %ecx
2944 ; SSE2-NEXT: leal 1(%rax,%rcx), %eax
2945 ; SSE2-NEXT: shrl %eax
2946 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
2947 ; SSE2-NEXT: retq
2948 ;
2949 ; AVX-LABEL: avg_v1i8:
2950 ; AVX: # %bb.0:
2951 ; AVX-NEXT: movzbl %dil, %eax
2952 ; AVX-NEXT: movzbl %sil, %ecx
2953 ; AVX-NEXT: leal 1(%rax,%rcx), %eax
2954 ; AVX-NEXT: shrl %eax
2955 ; AVX-NEXT: # kill: def $al killed $al killed $eax
2956 ; AVX-NEXT: retq
2957 %a = zext <1 x i8> %x to <1 x i16>
2958 %b = zext <1 x i8> %y to <1 x i16>
2959 %c = add <1 x i16> %a, %b
2960 %d = add <1 x i16> %c, <i16 1>
2961 %e = lshr <1 x i16> %d, <i16 1>
2962 %f = trunc <1 x i16> %e to <1 x i8>
2963 ret <1 x i8> %f
2964 }
2966 ; _mm_avg_epu16( _mm_slli_epi16(a, 2), _mm_slli_epi16(b, 2))
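; A rough C-intrinsics equivalent of the IR below (an illustrative sketch only;
; the parameter names a and b are assumed):
;
;   __m128i PR41316(__m128i a, __m128i b) {
;     return _mm_avg_epu16(_mm_slli_epi16(a, 2), _mm_slli_epi16(b, 2));
;   }
;
; After the shifts the low bit of every 16-bit lane is known to be zero, so the
; "+ 1" of the rounding average appears below as 'or <8 x i16> ..., 1'; the
; backend must still recognize the combined pattern and select pavgw.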
2967 define <2 x i64> @PR41316(<2 x i64>, <2 x i64>) {
2968 ; SSE2-LABEL: PR41316:
2969 ; SSE2: # %bb.0:
2970 ; SSE2-NEXT: psllw $2, %xmm0
2971 ; SSE2-NEXT: psllw $2, %xmm1
2972 ; SSE2-NEXT: pavgw %xmm1, %xmm0
2973 ; SSE2-NEXT: retq
2974 ;
2975 ; AVX-LABEL: PR41316:
2976 ; AVX: # %bb.0:
2977 ; AVX-NEXT: vpsllw $2, %xmm0, %xmm0
2978 ; AVX-NEXT: vpsllw $2, %xmm1, %xmm1
2979 ; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0
2980 ; AVX-NEXT: retq
2981 %3 = bitcast <2 x i64> %0 to <8 x i16>
2982 %4 = shl <8 x i16> %3, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
2983 %5 = bitcast <2 x i64> %1 to <8 x i16>
2984 %6 = shl <8 x i16> %5, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
2985 %7 = zext <8 x i16> %6 to <8 x i32>
2986 %8 = or <8 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
2987 %9 = zext <8 x i16> %8 to <8 x i32>
2988 %10 = add nuw nsw <8 x i32> %9, %7
2989 %11 = lshr <8 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2990 %12 = trunc <8 x i32> %11 to <8 x i16>
2991 %13 = bitcast <8 x i16> %12 to <2 x i64>