applied AkhIL's cinelerra-cv surround patch
[cinelerra_cv/ct.git] / mpeg2enc / bdist1_mmx.s
blob34602405562c1d0558d846b3c0b7aa7e83bbe9a0
2 ; bdist1_mmx.s: mmX optimized bidirectional absolute distance sum
4 ; Original believed to be Copyright (C) 2000 Brent Byeler
6 ; This program is free software; you can redistribute it and/or
7 ; modify it under the terms of the GNU General Public License
8 ; as published by the Free Software Foundation; either version 2
9 ; of the License, or (at your option) any later version.
11 ; This program is distributed in the hope that it will be useful,
12 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ; GNU General Public License for more details.
16 ; You should have received a copy of the GNU General Public License
17 ; along with this program; if not, write to the Free Software
18 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ;/*
22 ; * absolute difference error between a (16*h) block and a bidirectional
23 ; * prediction
24 ; *
25 ; * p2: address of top left pel of block
26 ; * pf,hxf,hyf: address and half pel flags of forward ref. block
27 ; * pb,hxb,hyb: address and half pel flags of backward ref. block
28 ; * h: height of block
29 ; * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
30 ; * mmX version
31 ; */
33 ;int bdist1_mmx(
34 ;unsigned char *pf, unsigned char *pb, unsigned char *p2,
35 ;int lx, int hxf, int hyf, int hxb, int hyb, int h)
37 ; unsigned char *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
39 ; Handy macros for readability
; Incoming arguments, addressed through the frame pointer that bdist1_mmx
; sets up with "push ebp / mov ebp, esp".  With that prologue [ebp+0] holds
; the saved ebp and [ebp+4] the return address, so the first stack argument
; is at [ebp+8] and each following 32-bit argument is 4 bytes higher, in the
; order of the C prototype quoted above.
; Note: pf, pb and p2 are also *written* by the routine (advanced by lx after
; every row), i.e. the argument slots are used as working variables.
41 %define pf [ebp+8]
42 %define pb [ebp+12]
43 %define p2 [ebp+16]
44 %define lx [ebp+20]
45 %define hxf [ebp+24]
46 %define hyf [ebp+28]
47 %define hxb [ebp+32]
48 %define hyb [ebp+36]
49 %define h [ebp+40]
; Local pointer variables.  They live in the 32-byte area bdist1_mmx reserves
; with "sub esp, 32" and are addressed relative to esp, so these names are
; only valid while esp stays at its post-allocation value (the routine does
; no push/pop between allocating and releasing the area).
; Offsets 0 and 28 of the area are not used by any of these names.
52 %define pfa [esp+4]
53 %define pfb [esp+8]
54 %define pfc [esp+12]
55 %define pba [esp+16]
56 %define pbb [esp+20]
57 %define pbc [esp+24]
;-----------------------------------------------------------------------
; bdist1_mmx
;
; Sum of absolute differences between the 16-pel-wide, h-row block at p2
; and a bidirectional prediction.  For every pel the prediction is
;
;   ( ((pf + pfa + pfb + pfc + 2) >> 2)
;   + ((pb + pba + pbb + pbc + 2) >> 2) + 1 ) >> 1
;
; where pfa = pf + hxf, pfb = pf + lx*hyf, pfc = pfb + hxf and likewise
; pba/pbb/pbc for the backward reference.
;
; In:    stack arguments, read via ebp (pf, pb, p2, lx, hxf, hyf, hxb, hyb, h)
; Out:   eax = sum of absolute differences (0 when h <= 0)
; Saved: ebp, ebx, ecx, edx, esi, edi (pushed on entry, popped on exit)
; Clobb: eax, flags, mm0-mm7 (emms is executed before returning)
;        The argument slots pf, pb and p2 are overwritten (advanced per row).
;
; Register roles inside the row loop:
;   esi = running sum "s"        edi = rows still to process
;   mm7 = 0 (zero extension)     mm6 = 2 in every word (rounding for >> 2)
;   eax/ebx/ecx/edx = scratch pointers and scratch for the horizontal sum
;-----------------------------------------------------------------------
59 SECTION .text
60 global bdist1_mmx
62 align 32
63 bdist1_mmx:
64 push ebp ; save frame pointer
65 mov ebp, esp ; link
66 push ebx
67 push ecx
68 push edx
69 push esi
70 push edi
73 ;; Make space for local variables on stack
74 sub esp, 32
; ---- derive the three displaced pointers for each reference block ----
77 mov edx, hxb ; edx = hxb
78 mov eax, hxf ; eax = hxf
79 mov esi, lx ; esi = lx (row stride) for the multiplications below
81 mov ecx, pf
82 add ecx, eax
83 mov pfa, ecx ; pfa = pf + hxf
84 mov ecx, esi
85 imul ecx, hyf
86 mov ebx, pf
87 add ecx, ebx
88 mov pfb, ecx ; pfb = pf + lx*hyf
89 add eax, ecx
90 mov pfc, eax ; pfc = pfb + hxf
91 mov eax, pb
92 add eax, edx
93 mov pba, eax ; pba = pb + hxb
94 mov eax, esi
95 imul eax, hyb
96 mov ecx, pb
97 add eax, ecx
98 mov pbb, eax ; pbb = pb + lx*hyb
99 add edx, eax
100 mov pbc, edx ; pbc = pbb + hxb
101 xor esi, esi ; esi is "s" the accumulator
102 mov eax, esi ; eax = 0: the return value if the early exit below is taken
104 mov edi, h ; edi = number of rows left
105 test edi, edi ; h <= 0?  then there is nothing to sum
106 jle near bdist1exit
; ---- loop-invariant MMX constants ----
108 pxor mm7, mm7 ; mm7 = 0, used to zero-extend bytes to words
109 pxor mm6, mm6
110 pcmpeqw mm5, mm5 ; mm5 = -1 in every word
111 psubw mm6, mm5 ; mm6 = 0 - (-1) = 1 in every word
112 psllw mm6, 1 ; mm6 = 2 in every word: rounding term for the >> 2 averages
; ---- one iteration per row; each row is handled as two 8-pel halves ----
114 bdist1top:
; == pels 0..7: forward prediction, mm0 = low 4 pels, mm1 = high 4 pels (words) ==
115 mov eax, pf
116 mov ebx, pfa
117 mov ecx, pfb
118 mov edx, pfc
119 movq mm0, [eax] ; 8 pels from pf
120 movq mm1, mm0
121 punpcklbw mm0, mm7
122 punpckhbw mm1, mm7
123 movq mm2, [ebx] ; + 8 pels from pfa
124 movq mm3, mm2
125 punpcklbw mm2, mm7
126 punpckhbw mm3, mm7
127 paddw mm0, mm2
128 paddw mm1, mm3
129 movq mm2, [ecx] ; + 8 pels from pfb
130 movq mm3, mm2
131 punpcklbw mm2, mm7
132 punpckhbw mm3, mm7
133 paddw mm0, mm2
134 paddw mm1, mm3
135 movq mm2, [edx] ; + 8 pels from pfc
136 movq mm3, mm2
137 punpcklbw mm2, mm7
138 punpckhbw mm3, mm7
139 paddw mm0, mm2
140 paddw mm1, mm3
141 paddw mm0, mm6 ; + 2 (rounding)
142 paddw mm1, mm6
143 psrlw mm0, 2 ; forward = (sum of 4 + 2) >> 2
144 psrlw mm1, 2
; == pels 0..7: backward prediction, mm2/mm3 accumulate, mm4/mm5 are temporaries ==
145 mov eax, pb
146 mov ebx, pba
147 mov ecx, pbb
148 mov edx, pbc
149 movq mm2, [eax] ; 8 pels from pb
150 movq mm3, mm2
151 punpcklbw mm2, mm7
152 punpckhbw mm3, mm7
153 movq mm4, [ebx] ; + 8 pels from pba
154 movq mm5, mm4
155 punpcklbw mm4, mm7
156 punpckhbw mm5, mm7
157 paddw mm2, mm4
158 paddw mm3, mm5
159 movq mm4, [ecx] ; + 8 pels from pbb
160 movq mm5, mm4
161 punpcklbw mm4, mm7
162 punpckhbw mm5, mm7
163 paddw mm2, mm4
164 paddw mm3, mm5
165 movq mm4, [edx] ; + 8 pels from pbc
166 movq mm5, mm4
167 punpcklbw mm4, mm7
168 punpckhbw mm5, mm7
169 paddw mm2, mm4
170 paddw mm3, mm5
171 paddw mm2, mm6 ; + 2 (rounding)
172 paddw mm3, mm6
173 psrlw mm2, 2 ; backward = (sum of 4 + 2) >> 2
174 psrlw mm3, 2
; == pels 0..7: combine both directions: (forward + backward + 1) >> 1 ==
175 paddw mm0, mm2
176 paddw mm1, mm3
177 psrlw mm6, 1 ; temporarily turn the rounding constant 2 into 1
178 paddw mm0, mm6
179 paddw mm1, mm6
180 psllw mm6, 1 ; restore mm6 = 2 for the next >> 2 average
181 psrlw mm0, 1
182 psrlw mm1, 1
183 packuswb mm0, mm1 ; back to 8 unsigned bytes: the prediction for pels 0..7
; == pels 0..7: absolute difference against p2 and accumulation ==
185 mov eax, p2
186 movq mm1, [eax]
187 movq mm2, mm0
188 psubusb mm0, mm1 ; max(pred - p2, 0) per byte (unsigned saturation)
189 psubusb mm1, mm2 ; max(p2 - pred, 0) per byte
190 por mm0, mm1 ; one of the two is zero, so the OR is |pred - p2|
191 movq mm1, mm0
192 punpcklbw mm0, mm7
193 punpckhbw mm1, mm7
194 paddw mm0, mm1 ; 8 byte differences -> 4 word partial sums
195 movq mm1, mm0
196 punpcklwd mm0, mm7
197 punpckhwd mm1, mm7
199 paddd mm0, mm1 ; 4 words -> 2 dword partial sums
200 movd eax, mm0 ; low dword
201 psrlq mm0, 32
202 movd ebx, mm0 ; high dword
203 add esi, eax
204 add esi, ebx ; s += both partial sums
; == pels 8..15: same sequence as above with every load displaced by 8 bytes ==
205 mov eax, pf
206 mov ebx, pfa
207 mov ecx, pfb
208 mov edx, pfc
209 movq mm0, [eax+8] ; 8 pels from pf+8
210 movq mm1, mm0
211 punpcklbw mm0, mm7
212 punpckhbw mm1, mm7
213 movq mm2, [ebx+8] ; + 8 pels from pfa+8
214 movq mm3, mm2
215 punpcklbw mm2, mm7
216 punpckhbw mm3, mm7
217 paddw mm0, mm2
218 paddw mm1, mm3
219 movq mm2, [ecx+8] ; + 8 pels from pfb+8
220 movq mm3, mm2
221 punpcklbw mm2, mm7
222 punpckhbw mm3, mm7
223 paddw mm0, mm2
224 paddw mm1, mm3
225 movq mm2, [edx+8] ; + 8 pels from pfc+8
226 movq mm3, mm2
227 punpcklbw mm2, mm7
228 punpckhbw mm3, mm7
229 paddw mm0, mm2
230 paddw mm1, mm3
231 paddw mm0, mm6 ; + 2 (rounding)
232 paddw mm1, mm6
233 psrlw mm0, 2 ; forward = (sum of 4 + 2) >> 2
234 psrlw mm1, 2
235 mov eax, pb
236 mov ebx, pba
237 mov ecx, pbb
238 mov edx, pbc
239 movq mm2, [eax+8] ; 8 pels from pb+8
240 movq mm3, mm2
241 punpcklbw mm2, mm7
242 punpckhbw mm3, mm7
243 movq mm4, [ebx+8] ; + 8 pels from pba+8
244 movq mm5, mm4
245 punpcklbw mm4, mm7
246 punpckhbw mm5, mm7
247 paddw mm2, mm4
248 paddw mm3, mm5
249 movq mm4, [ecx+8] ; + 8 pels from pbb+8
250 movq mm5, mm4
251 punpcklbw mm4, mm7
252 punpckhbw mm5, mm7
253 paddw mm2, mm4
254 paddw mm3, mm5
255 movq mm4, [edx+8] ; + 8 pels from pbc+8
256 movq mm5, mm4
257 punpcklbw mm4, mm7
258 punpckhbw mm5, mm7
259 paddw mm2, mm4
260 paddw mm3, mm5
261 paddw mm2, mm6 ; + 2 (rounding)
262 paddw mm3, mm6
263 psrlw mm2, 2 ; backward = (sum of 4 + 2) >> 2
264 psrlw mm3, 2
265 paddw mm0, mm2
266 paddw mm1, mm3
267 psrlw mm6, 1 ; rounding constant 2 -> 1
268 paddW mm0, mm6
269 paddw mm1, mm6
270 psllw mm6, 1 ; rounding constant back to 2
271 psrlw mm0, 1 ; (forward + backward + 1) >> 1
272 psrlw mm1, 1
273 packuswb mm0, mm1 ; prediction for pels 8..15 as unsigned bytes
274 mov eax, p2
275 movq mm1, [eax+8]
276 movq mm2, mm0
277 psubusb mm0, mm1
278 psubusb mm1, mm2
279 por mm0, mm1 ; |pred - p2| per byte
280 movq mm1, mm0
281 punpcklbw mm0, mm7
282 punpckhbw mm1, mm7
283 paddw mm0, mm1 ; 8 byte differences -> 4 word partial sums
284 movq mm1, mm0
285 punpcklwd mm0, mm7
286 punpckhwd mm1, mm7
287 paddd mm0, mm1 ; 4 words -> 2 dword partial sums
288 movd eax, mm0
289 psrlq mm0, 32
290 movd ebx, mm0
291 add esi, eax
292 add esi, ebx ; s += both partial sums
; ---- step every source pointer down one row (they are kept in memory) ----
294 mov eax, lx
295 add p2, eax
296 add pf, eax
297 add pfa, eax
298 add pfb, eax
299 add pfc, eax
300 add pb, eax
301 add pba, eax
302 add pbb, eax
303 add pbc, eax
305 dec edi ; one row done
306 jg near bdist1top ; loop while rows remain
307 mov eax, esi ; return value = accumulated sum
; Reached from the loop above and directly from the h <= 0 test (eax = 0 then).
309 bdist1exit:
313 ;; Get rid of local variables
314 add esp, 32
316 ;; Restore (callee saves convention...)
318 pop edi
319 pop esi
320 pop edx
321 pop ecx
322 pop ebx
324 pop ebp ; restore caller's frame pointer
326 emms ; leave MMX state so that x87 floating-point code can run afterwards
327 ret