Rename var: val -> energy
[FFMpeg-mirror/DVCPRO-HD.git] / libavcodec / i386 / dsputil_h264_template_ssse3.c
blob5345ccc1d82deb92da50c9040e5acef1ce4a59e2
1 /*
2 * Copyright (c) 2008 Loren Merritt
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 /**
22 * SSSE3 optimized version of (put|avg)_h264_chroma_mc8.
23 * H264_CHROMA_MC8_TMPL must be defined to the desired function name
24 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
25 * AVG_OP(x) must be defined to expand to nothing for put and to x (the identity) for avg
/*
 * Chroma motion compensation for a block 8 pixels wide, producing two output
 * rows per loop iteration.
 *
 * dst    - destination rows (annotated as 8-byte aligned); read as well as
 *          written when AVG_OP keeps its argument
 * src    - source rows (annotated as unaligned)
 * stride - offset added to src/dst to step from one row to the next
 * h      - row count; it is decremented by 2 per iteration and the loop runs
 *          while the result is > 0
 *          NOTE(review): presumably callers only pass even h - confirm
 * x, y   - fractional offsets, asserted to lie in 0..7
 * rnd    - nonzero selects ff_pw_4 / ff_pw_32 as the rounding term, zero
 *          selects ff_pw_3 / ff_pw_28
 *
 * Three paths: x==0 && y==0 delegates to H264_CHROMA_MC8_MV0; exactly one of
 * x, y being zero uses a 2-tap filter with a shift of 3; otherwise a 4-tap
 * bilinear filter with a shift of 6 is used.
 *
 * NOTE(review): the weight/rounding registers (xmm5-xmm7) are loaded in one
 * asm statement and consumed in a later one, and no clobber list is visible on
 * any of the statements even though they write xmm registers and store through
 * dst. This relies on the compiler leaving those registers alone in between.
 */
27 static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
29 if(y==0 && x==0) {
30 /* no filter needed */
31 H264_CHROMA_MC8_MV0(dst, src, stride, h);
32 return;
35 assert(x<8 && y<8 && x>=0 && y>=0);
37 if(y==0 || x==0)
39 /* 1 dimensional filter only */
/*
 * Exactly one of x, y is nonzero here; call it k = x+y (1..7).
 * 255*k+8 == (k<<8) + (8-k), so after pshuflw/movlhps broadcast the low word
 * every 16-bit lane of xmm7 holds the byte pair (8-k, k): the two tap weights.
 * xmm6 receives 8 bytes of the rounding constant, duplicated into the upper
 * half by movlhps (presumably packed 16-bit words of 4 or 3; the constants
 * are defined outside this view).
 */
40 asm volatile(
41 "movd %0, %%xmm7 \n\t"
42 "movq %1, %%xmm6 \n\t"
43 "pshuflw $0, %%xmm7, %%xmm7 \n\t"
44 "movlhps %%xmm6, %%xmm6 \n\t"
45 "movlhps %%xmm7, %%xmm7 \n\t"
46 :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3))
49 if(x) {
/*
 * Horizontal 2-tap filter. Per iteration: load 8 bytes at src and src+1 for
 * two consecutive rows, interleave them so each 16-bit lane holds the byte
 * pair (p[i], p[i+1]), and let pmaddubsw form p[i]*(8-x) + p[i+1]*x.
 * Then add the rounding term, shift right by 3, pack both rows into one
 * register (low half = first row, high half = second row) and store.
 * The AVG_OP lines load the current dst rows into xmm4 and pavgb them with
 * the result; they vanish when AVG_OP expands to nothing.
 */
50 asm volatile(
51 "1: \n\t"
52 "movq (%1), %%xmm0 \n\t"
53 "movq 1(%1), %%xmm1 \n\t"
54 "movq (%1,%3), %%xmm2 \n\t"
55 "movq 1(%1,%3), %%xmm3 \n\t"
56 "punpcklbw %%xmm1, %%xmm0 \n\t"
57 "punpcklbw %%xmm3, %%xmm2 \n\t"
58 "pmaddubsw %%xmm7, %%xmm0 \n\t"
59 "pmaddubsw %%xmm7, %%xmm2 \n\t"
60 AVG_OP("movq (%0), %%xmm4 \n\t")
61 AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
62 "paddw %%xmm6, %%xmm0 \n\t"
63 "paddw %%xmm6, %%xmm2 \n\t"
64 "psrlw $3, %%xmm0 \n\t"
65 "psrlw $3, %%xmm2 \n\t"
66 "packuswb %%xmm2, %%xmm0 \n\t"
67 AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
68 "movq %%xmm0, (%0) \n\t"
69 "movhps %%xmm0, (%0,%3) \n\t"
/* sub sets the flags tested by jg below; the two lea instructions in between do not modify flags */
70 "sub $2, %2 \n\t"
71 "lea (%1,%3,2), %1 \n\t"
72 "lea (%0,%3,2), %0 \n\t"
73 "jg 1b \n\t"
74 :"+r"(dst), "+r"(src), "+r"(h)
75 :"r"((x86_reg)stride)
77 } else {
/*
 * Vertical 2-tap filter (x == 0). Per iteration three rows are read: src,
 * src+stride and src+2*stride. Row 0 is interleaved with row 1, and a copy of
 * row 1 with row 2, so pmaddubsw forms upper*(8-y) + lower*y for two output
 * rows. Rounding, shift, pack, optional averaging and stores are the same as
 * in the horizontal loop. The third row is loaded again as the first row of
 * the next iteration.
 */
78 asm volatile(
79 "1: \n\t"
80 "movq (%1), %%xmm0 \n\t"
81 "movq (%1,%3), %%xmm1 \n\t"
82 "movdqa %%xmm1, %%xmm2 \n\t"
83 "movq (%1,%3,2), %%xmm3 \n\t"
84 "punpcklbw %%xmm1, %%xmm0 \n\t"
85 "punpcklbw %%xmm3, %%xmm2 \n\t"
86 "pmaddubsw %%xmm7, %%xmm0 \n\t"
87 "pmaddubsw %%xmm7, %%xmm2 \n\t"
88 AVG_OP("movq (%0), %%xmm4 \n\t")
89 AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
90 "paddw %%xmm6, %%xmm0 \n\t"
91 "paddw %%xmm6, %%xmm2 \n\t"
92 "psrlw $3, %%xmm0 \n\t"
93 "psrlw $3, %%xmm2 \n\t"
94 "packuswb %%xmm2, %%xmm0 \n\t"
95 AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
96 "movq %%xmm0, (%0) \n\t"
97 "movhps %%xmm0, (%0,%3) \n\t"
98 "sub $2, %2 \n\t"
99 "lea (%1,%3,2), %1 \n\t"
100 "lea (%0,%3,2), %0 \n\t"
101 "jg 1b \n\t"
102 :"+r"(dst), "+r"(src), "+r"(h)
103 :"r"((x86_reg)stride)
106 return;
109 /* general case, bilinear */
/*
 * Both x and y are in 1..7 here. x*255+8 == (x<<8) + (8-x); multiplying by
 * (8-y) or by y keeps each byte product at 49 or less, so nothing carries
 * between bytes. After the broadcast every 16-bit lane holds:
 *   xmm7: byte pair ((8-x)*(8-y), x*(8-y))  - weights for the upper row
 *   xmm6: byte pair ((8-x)*y,     x*y)      - weights for the lower row
 * xmm5 is the rounding constant, loaded with movdqa (an aligned 16-byte load).
 */
110 asm volatile(
111 "movd %0, %%xmm7 \n\t"
112 "movd %1, %%xmm6 \n\t"
113 "movdqa %2, %%xmm5 \n\t"
114 "pshuflw $0, %%xmm7, %%xmm7 \n\t"
115 "pshuflw $0, %%xmm6, %%xmm6 \n\t"
116 "movlhps %%xmm7, %%xmm7 \n\t"
117 "movlhps %%xmm6, %%xmm6 \n\t"
118 :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28))
/*
 * Before the loop, the first source row is loaded and interleaved with its
 * one-byte-shifted copy into xmm0, and src is advanced by one row.
 * Each iteration then loads the next two rows (into xmm1 and xmm3, with
 * unweighted copies in xmm2 and xmm4) and computes
 *   out0 = (xmm0*w_upper + rnd + xmm1*w_lower) >> 6
 *   out1 = (xmm2*w_upper + rnd + xmm3*w_lower) >> 6
 * The unweighted copy of the last row (xmm4) is moved into xmm0 so it serves
 * as the upper row of the next iteration.
 */
121 asm volatile(
122 "movq (%1), %%xmm0 \n\t"
123 "movq 1(%1), %%xmm1 \n\t"
124 "punpcklbw %%xmm1, %%xmm0 \n\t"
125 "add %3, %1 \n\t"
126 "1: \n\t"
127 "movq (%1), %%xmm1 \n\t"
128 "movq 1(%1), %%xmm2 \n\t"
129 "movq (%1,%3), %%xmm3 \n\t"
130 "movq 1(%1,%3), %%xmm4 \n\t"
131 "lea (%1,%3,2), %1 \n\t"
132 "punpcklbw %%xmm2, %%xmm1 \n\t"
133 "punpcklbw %%xmm4, %%xmm3 \n\t"
134 "movdqa %%xmm1, %%xmm2 \n\t"
135 "movdqa %%xmm3, %%xmm4 \n\t"
136 "pmaddubsw %%xmm7, %%xmm0 \n\t"
137 "pmaddubsw %%xmm6, %%xmm1 \n\t"
138 "pmaddubsw %%xmm7, %%xmm2 \n\t"
139 "pmaddubsw %%xmm6, %%xmm3 \n\t"
140 "paddw %%xmm5, %%xmm0 \n\t"
141 "paddw %%xmm5, %%xmm2 \n\t"
142 "paddw %%xmm0, %%xmm1 \n\t"
143 "paddw %%xmm2, %%xmm3 \n\t"
/* keep the unweighted last row for the next iteration */
144 "movdqa %%xmm4, %%xmm0 \n\t"
145 "psrlw $6, %%xmm1 \n\t"
146 "psrlw $6, %%xmm3 \n\t"
147 AVG_OP("movq (%0), %%xmm2 \n\t")
148 AVG_OP("movhps (%0,%3), %%xmm2 \n\t")
149 "packuswb %%xmm3, %%xmm1 \n\t"
150 AVG_OP("pavgb %%xmm2, %%xmm1 \n\t")
151 "movq %%xmm1, (%0)\n\t"
152 "movhps %%xmm1, (%0,%3)\n\t"
153 "sub $2, %2 \n\t"
154 "lea (%0,%3,2), %0 \n\t"
155 "jg 1b \n\t"
156 :"+r"(dst), "+r"(src), "+r"(h)
157 :"r"((x86_reg)stride)
/*
 * Chroma motion compensation for a block 4 pixels wide, using the MMX
 * registers mm0-mm7 and producing two output rows per loop iteration.
 *
 * dst    - destination rows (annotated as 4-byte aligned)
 * src    - source rows (annotated as unaligned)
 * stride - offset added to src/dst to step from one row to the next
 * h      - row count; decremented by 2 per iteration, loop runs while > 0
 * x, y   - fractional offsets used to build the four bilinear weights
 *
 * There is no rnd parameter: the rounding term is always ff_pw_32. The
 * visible lines contain no assertion on x/y and no special case for
 * x == 0 or y == 0; the 4-tap filter is always used.
 *
 * NOTE(review): no clobber list and no emms are visible in this view;
 * presumably MMX state cleanup is left to the caller - confirm.
 */
161 static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
/*
 * x*255+8 == (x<<8) + (8-x). After multiplying by (8-y) or y and broadcasting
 * the low word with pshufw $0, every 16-bit lane holds:
 *   mm7: byte pair ((8-x)*(8-y), x*(8-y))  - weights for the upper row
 *   mm6: byte pair ((8-x)*y,     x*y)      - weights for the lower row
 * mm5 holds the rounding constant ff_pw_32.
 * NOTE(review): the byte packing assumes 0 <= x, y <= 8 so that no product
 * exceeds 64; nothing in this function checks that.
 */
163 asm volatile(
164 "movd %0, %%mm7 \n\t"
165 "movd %1, %%mm6 \n\t"
166 "movq %2, %%mm5 \n\t"
167 "pshufw $0, %%mm7, %%mm7 \n\t"
168 "pshufw $0, %%mm6, %%mm6 \n\t"
169 :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32)
/*
 * The first source row is loaded and interleaved with the bytes at src+1 into
 * mm0, then src is advanced by one row. Each iteration loads the next two
 * rows (mm1, mm3, with unweighted copies in mm2, mm4) and computes
 *   out0 = (mm0*w_upper + rnd + mm1*w_lower) >> 6
 *   out1 = (mm2*w_upper + rnd + mm3*w_lower) >> 6
 * mm4 is carried over into mm0 as the upper row of the next iteration.
 * Each result is packed to bytes separately and its low 4 bytes are stored
 * with movd.
 */
172 asm volatile(
173 "movd (%1), %%mm0 \n\t"
174 "punpcklbw 1(%1), %%mm0 \n\t"
175 "add %3, %1 \n\t"
176 "1: \n\t"
177 "movd (%1), %%mm1 \n\t"
178 "movd (%1,%3), %%mm3 \n\t"
179 "punpcklbw 1(%1), %%mm1 \n\t"
180 "punpcklbw 1(%1,%3), %%mm3 \n\t"
181 "lea (%1,%3,2), %1 \n\t"
182 "movq %%mm1, %%mm2 \n\t"
183 "movq %%mm3, %%mm4 \n\t"
184 "pmaddubsw %%mm7, %%mm0 \n\t"
185 "pmaddubsw %%mm6, %%mm1 \n\t"
186 "pmaddubsw %%mm7, %%mm2 \n\t"
187 "pmaddubsw %%mm6, %%mm3 \n\t"
188 "paddw %%mm5, %%mm0 \n\t"
189 "paddw %%mm5, %%mm2 \n\t"
190 "paddw %%mm0, %%mm1 \n\t"
191 "paddw %%mm2, %%mm3 \n\t"
/* keep the unweighted last row for the next iteration */
192 "movq %%mm4, %%mm0 \n\t"
193 "psrlw $6, %%mm1 \n\t"
194 "psrlw $6, %%mm3 \n\t"
195 "packuswb %%mm1, %%mm1 \n\t"
196 "packuswb %%mm3, %%mm3 \n\t"
/* NOTE(review): pavgb with a memory operand reads 8 bytes at dst although only 4 are stored below */
197 AVG_OP("pavgb (%0), %%mm1 \n\t")
198 AVG_OP("pavgb (%0,%3), %%mm3 \n\t")
199 "movd %%mm1, (%0)\n\t"
200 "movd %%mm3, (%0,%3)\n\t"
/* sub sets the flags tested by jg; the lea in between does not modify flags */
201 "sub $2, %2 \n\t"
202 "lea (%0,%3,2), %0 \n\t"
203 "jg 1b \n\t"
204 :"+r"(dst), "+r"(src), "+r"(h)
205 :"r"((x86_reg)stride)