/*
 * Copyright (c) 2008 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/**
 * SSSE3 optimized version of (put|avg)_h264_chroma_mc8.
 * H264_CHROMA_MC8_TMPL must be defined to the desired function name
 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
 * AVG_OP must be defined to empty for put and the identity for avg
 */
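
/*
 * Illustrative instantiation (added; not part of the original file).  The
 * function names and the included file name below are assumptions about how
 * an including translation unit might use this template:
 *
 *   #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
 *   #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
 *   #define H264_CHROMA_MC8_MV0  put_pixels8_mmx
 *   #define AVG_OP(X)               // empty: "put" just stores the result
 *   #include "dsputil_h264_template_ssse3.c"
 *
 *   #undef H264_CHROMA_MC8_TMPL
 *   #undef H264_CHROMA_MC4_TMPL
 *   #undef H264_CHROMA_MC8_MV0
 *   #undef AVG_OP
 *   #define AVG_OP(X) X             // identity: "avg" blends with dst via pavgb
 *   ...same pattern with avg_* names, then #include again...
 */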
static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
{
    if(y==0 && x==0) {
        /* no filter needed */
        H264_CHROMA_MC8_MV0(dst, src, stride, h);
        return;
    }

    assert(x<8 && y<8 && x>=0 && y>=0);

    if(y==0 || x==0) {
        /* 1 dimensional filter only */
        __asm__ volatile(
            "movd %0, %%xmm7 \n\t"
            "movq %1, %%xmm6 \n\t"
            "pshuflw $0, %%xmm7, %%xmm7 \n\t"
            "movlhps %%xmm6, %%xmm6 \n\t"
            "movlhps %%xmm7, %%xmm7 \n\t"
            :: "r"(255*(x+y)+8), "m"(*(rnd ? &ff_pw_4 : &ff_pw_3))
        );
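
        /*
         * (Added note) 255*(x+y)+8 == ((x+y)<<8) + (8-(x+y)) for x+y in
         * 1..7, so the doubleword broadcast into xmm7 packs the two chroma
         * filter taps 8-d and d (with d = x+y) into adjacent bytes.  After
         * the punpcklbw in the loops below, pmaddubsw computes
         * (8-d)*p0 + d*p1 for each pair of neighbouring samples in one
         * instruction.  xmm6 holds the rounding words added before
         * "psrlw $3"; rnd selects 4 (normal) or 3 (the no-rounding avg
         * variant).
         */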
52 "movq (%1), %%xmm0 \n\t"
53 "movq 1(%1), %%xmm1 \n\t"
54 "movq (%1,%3), %%xmm2 \n\t"
55 "movq 1(%1,%3), %%xmm3 \n\t"
56 "punpcklbw %%xmm1, %%xmm0 \n\t"
57 "punpcklbw %%xmm3, %%xmm2 \n\t"
58 "pmaddubsw %%xmm7, %%xmm0 \n\t"
59 "pmaddubsw %%xmm7, %%xmm2 \n\t"
60 AVG_OP("movq (%0), %%xmm4 \n\t")
61 AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
62 "paddw %%xmm6, %%xmm0 \n\t"
63 "paddw %%xmm6, %%xmm2 \n\t"
64 "psrlw $3, %%xmm0 \n\t"
65 "psrlw $3, %%xmm2 \n\t"
66 "packuswb %%xmm2, %%xmm0 \n\t"
67 AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
68 "movq %%xmm0, (%0) \n\t"
69 "movhps %%xmm0, (%0,%3) \n\t"
71 "lea (%1,%3,2), %1 \n\t"
72 "lea (%0,%3,2), %0 \n\t"
74 :"+r"(dst
), "+r"(src
), "+r"(h
)
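            /*
             * (Added note) x == 0: the same taps apply vertically, so rows
             * n and n+1 are interleaved with punpcklbw instead of columns
             * i and i+1; everything else matches the horizontal loop above.
             */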
80 "movq (%1), %%xmm0 \n\t"
81 "movq (%1,%3), %%xmm1 \n\t"
82 "movdqa %%xmm1, %%xmm2 \n\t"
83 "movq (%1,%3,2), %%xmm3 \n\t"
84 "punpcklbw %%xmm1, %%xmm0 \n\t"
85 "punpcklbw %%xmm3, %%xmm2 \n\t"
86 "pmaddubsw %%xmm7, %%xmm0 \n\t"
87 "pmaddubsw %%xmm7, %%xmm2 \n\t"
88 AVG_OP("movq (%0), %%xmm4 \n\t")
89 AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
90 "paddw %%xmm6, %%xmm0 \n\t"
91 "paddw %%xmm6, %%xmm2 \n\t"
92 "psrlw $3, %%xmm0 \n\t"
93 "psrlw $3, %%xmm2 \n\t"
94 "packuswb %%xmm2, %%xmm0 \n\t"
95 AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
96 "movq %%xmm0, (%0) \n\t"
97 "movhps %%xmm0, (%0,%3) \n\t"
99 "lea (%1,%3,2), %1 \n\t"
100 "lea (%0,%3,2), %0 \n\t"
102 :"+r"(dst
), "+r"(src
), "+r"(h
)
103 :"r"((x86_reg
)stride
)

    /* general case, bilinear */
    __asm__ volatile(
        "movd %0, %%xmm7 \n\t"
        "movd %1, %%xmm6 \n\t"
        "movdqa %2, %%xmm5 \n\t"
        "pshuflw $0, %%xmm7, %%xmm7 \n\t"
        "pshuflw $0, %%xmm6, %%xmm6 \n\t"
        "movlhps %%xmm7, %%xmm7 \n\t"
        "movlhps %%xmm6, %%xmm6 \n\t"
        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd ? &ff_pw_32 : &ff_pw_28))
    );
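
    /*
     * (Added note) (x*255+8)*(8-y) == (x*(8-y) << 8) + (8-x)*(8-y), so xmm7
     * packs the byte pair ((8-x)*(8-y), x*(8-y)) and xmm6 the pair
     * ((8-x)*y, x*y): the four taps of the H.264 bilinear chroma filter
     *   ((8-x)(8-y)*A + x(8-y)*B + (8-x)y*C + xy*D + 32) >> 6.
     * xmm5 holds the rounding words; ff_pw_28 replaces 32 when rnd is 0
     * (the no-rounding avg variant).
     */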
122 "movq (%1), %%xmm0 \n\t"
123 "movq 1(%1), %%xmm1 \n\t"
124 "punpcklbw %%xmm1, %%xmm0 \n\t"
127 "movq (%1), %%xmm1 \n\t"
128 "movq 1(%1), %%xmm2 \n\t"
129 "movq (%1,%3), %%xmm3 \n\t"
130 "movq 1(%1,%3), %%xmm4 \n\t"
131 "lea (%1,%3,2), %1 \n\t"
132 "punpcklbw %%xmm2, %%xmm1 \n\t"
133 "punpcklbw %%xmm4, %%xmm3 \n\t"
134 "movdqa %%xmm1, %%xmm2 \n\t"
135 "movdqa %%xmm3, %%xmm4 \n\t"
136 "pmaddubsw %%xmm7, %%xmm0 \n\t"
137 "pmaddubsw %%xmm6, %%xmm1 \n\t"
138 "pmaddubsw %%xmm7, %%xmm2 \n\t"
139 "pmaddubsw %%xmm6, %%xmm3 \n\t"
140 "paddw %%xmm5, %%xmm0 \n\t"
141 "paddw %%xmm5, %%xmm2 \n\t"
142 "paddw %%xmm0, %%xmm1 \n\t"
143 "paddw %%xmm2, %%xmm3 \n\t"
144 "movdqa %%xmm4, %%xmm0 \n\t"
145 "psrlw $6, %%xmm1 \n\t"
146 "psrlw $6, %%xmm3 \n\t"
147 AVG_OP("movq (%0), %%xmm2 \n\t")
148 AVG_OP("movhps (%0,%3), %%xmm2 \n\t")
149 "packuswb %%xmm3, %%xmm1 \n\t"
150 AVG_OP("pavgb %%xmm2, %%xmm1 \n\t")
151 "movq %%xmm1, (%0)\n\t"
152 "movhps %%xmm1, (%0,%3)\n\t"
154 "lea (%0,%3,2), %0 \n\t"
156 :"+r"(dst
), "+r"(src
), "+r"(h
)
157 :"r"((x86_reg
)stride
)
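
/*
 * (Added for illustration, not part of the original file.)  Scalar
 * reference for what the bilinear loop above computes, per the H.264
 * chroma interpolation formula.  The function name is hypothetical and
 * the block is compiled out.
 */
#if 0
static void h264_chroma_mc8_c_ref(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
    const int A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y;
    int i, j;
    for (j = 0; j < h; j++) {
        for (i = 0; i < 8; i++)
            dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}
#endif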

static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
164 "movd %0, %%mm7 \n\t"
165 "movd %1, %%mm6 \n\t"
166 "movq %2, %%mm5 \n\t"
167 "pshufw $0, %%mm7, %%mm7 \n\t"
168 "pshufw $0, %%mm6, %%mm6 \n\t"
169 :: "r"((x
*255+8)*(8-y
)), "r"((x
*255+8)*y
), "m"(ff_pw_32
)
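
    /*
     * (Added note) The 4-pixel-wide variant uses MMX registers and has no
     * rnd parameter: it always rounds with ff_pw_32 before "psrlw $6".
     */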
173 "movd (%1), %%mm0 \n\t"
174 "punpcklbw 1(%1), %%mm0 \n\t"
177 "movd (%1), %%mm1 \n\t"
178 "movd (%1,%3), %%mm3 \n\t"
179 "punpcklbw 1(%1), %%mm1 \n\t"
180 "punpcklbw 1(%1,%3), %%mm3 \n\t"
181 "lea (%1,%3,2), %1 \n\t"
182 "movq %%mm1, %%mm2 \n\t"
183 "movq %%mm3, %%mm4 \n\t"
184 "pmaddubsw %%mm7, %%mm0 \n\t"
185 "pmaddubsw %%mm6, %%mm1 \n\t"
186 "pmaddubsw %%mm7, %%mm2 \n\t"
187 "pmaddubsw %%mm6, %%mm3 \n\t"
188 "paddw %%mm5, %%mm0 \n\t"
189 "paddw %%mm5, %%mm2 \n\t"
190 "paddw %%mm0, %%mm1 \n\t"
191 "paddw %%mm2, %%mm3 \n\t"
192 "movq %%mm4, %%mm0 \n\t"
193 "psrlw $6, %%mm1 \n\t"
194 "psrlw $6, %%mm3 \n\t"
195 "packuswb %%mm1, %%mm1 \n\t"
196 "packuswb %%mm3, %%mm3 \n\t"
197 AVG_OP("pavgb (%0), %%mm1 \n\t")
198 AVG_OP("pavgb (%0,%3), %%mm3 \n\t")
199 "movd %%mm1, (%0)\n\t"
200 "movd %%mm3, (%0,%3)\n\t"
202 "lea (%0,%3,2), %0 \n\t"
204 :"+r"(dst
), "+r"(src
), "+r"(h
)
205 :"r"((x86_reg
)stride
)