r125: This commit was manufactured by cvs2svn to create tag 'r1_1_7-last'.
[cinelerra_cv/mob.git] / hvirtual / mpeg2enc / bdist2_mmx.s
blobbe1c352f4db6ae7c4fa9e665da9dc98bd55ecf04
2 ; bdist2_mmx.s: MMX optimized bidirectional squared distance sum
4 ; Original believed to be Copyright (C) 2000 Brent Byeler
6 ; This program is free software; you can redistribute it and/or
7 ; modify it under the terms of the GNU General Public License
8 ; as published by the Free Software Foundation; either version 2
9 ; of the License, or (at your option) any later version.
11 ; This program is distributed in the hope that it will be useful,
12 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ; GNU General Public License for more details.
16 ; You should have received a copy of the GNU General Public License
17 ; along with this program; if not, write to the Free Software
18 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ;/*
22 ; * squared error between a (16*h) block and a bidirectional
23 ; * prediction
24 ; *
25 ; * p2: address of top left pel of block
26 ; * pf,hxf,hyf: address and half pel flags of forward ref. block
27 ; * pb,hxb,hyb: address and half pel flags of backward ref. block
28 ; * h: height of block
29 ; * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
30 ; * mmX version
31 ; */
33 ;int bdist2_mmx(
34 ;unsigned char *pf, unsigned char *pb, unsigned char *p2,
35 ;int lx, int hxf, int hyf, int hxb, int hyb, int h)
37 ; unsigned char *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
38 ; int s;
40 ; Handy macros for readability
;-----------------------------------------------------------------------
; int bdist2_mmx(unsigned char *pf, unsigned char *pb, unsigned char *p2,
;                int lx, int hxf, int hyf, int hxb, int hyb, int h)
;
; Syntax : NASM (Intel operand order), 32-bit x86 with MMX.
; In     : all nine arguments on the stack, addressed through ebp.
; Out    : eax = sum over h rows of 16 pels of (prediction - p2)^2,
;          or 0 when h <= 0.
; Saved  : ebx, ecx, edx, esi, edi, ebp are pushed and restored.
; Clobb  : eax, flags, mm0-mm7 (emms is executed before returning).
; Memory : every row reads 16 bytes (two 8-byte loads) through each of
;          the nine source pointers.  The pointer arguments in the
;          caller-pushed argument area are advanced in place by lx
;          after every row.
;
; Per pel, for each reference r in {f, b}:
;     avg_r = (r[0] + r[hx] + r[lx*hy] + r[lx*hy + hx] + 2) >> 2
;     pred  = (avg_f + avg_b + 1) >> 1
;     s    += (pred - p2) * (pred - p2)
;-----------------------------------------------------------------------

; Incoming arguments (frame-pointer relative).
%define pf      [ebp+8]
%define pb      [ebp+12]
%define p2      [ebp+16]
%define lx      [ebp+20]
%define hxf     [ebp+24]
%define hyf     [ebp+28]
%define hxb     [ebp+32]
%define hyb     [ebp+36]
%define h       [ebp+40]

; Local pointer slots.  These are esp-relative, so esp must not move
; between the "sub esp, 32" in the prologue and the matching "add".
%define pfa     [esp+4]
%define pfb     [esp+8]
%define pfc     [esp+12]
%define pba     [esp+16]
%define pbb     [esp+20]
%define pbc     [esp+24]

; LOAD8_WIDEN lo, hi, mem
;   Load 8 bytes from mem and zero-extend them to words:
;   lo = bytes 0-3, hi = bytes 4-7.  Requires mm7 == 0.
%macro LOAD8_WIDEN 3
        movq        %1, %3
        movq        %2, %1
        punpcklbw   %1, mm7
        punpckhbw   %2, mm7
%endmacro

; ADD8_WIDEN acc_lo, acc_hi, tmp_lo, tmp_hi, mem
;   Widen 8 bytes from mem into tmp_lo/tmp_hi and add them word-wise
;   into acc_lo/acc_hi.
%macro ADD8_WIDEN 5
        LOAD8_WIDEN %3, %4, %5
        paddw       %1, %3
        paddw       %2, %4
%endmacro

; AVG4_ROUND acc_lo, acc_hi, tmp_lo, tmp_hi, offset
;   acc = ([eax+off] + [ebx+off] + [ecx+off] + [edx+off] + 2) >> 2
;   for 8 pels.  Requires mm6 == 2 in every word and mm7 == 0.
%macro AVG4_ROUND 5
        LOAD8_WIDEN %1, %2, [eax+%5]
        ADD8_WIDEN  %1, %2, %3, %4, [ebx+%5]
        ADD8_WIDEN  %1, %2, %3, %4, [ecx+%5]
        ADD8_WIDEN  %1, %2, %3, %4, [edx+%5]
        paddw       %1, mm6
        paddw       %2, mm6
        psrlw       %1, 2
        psrlw       %2, 2
%endmacro

; SQERR8 offset
;   Add the squared prediction error of the 8 pels at byte offset
;   `offset` of the current row to esi.
;   Uses eax, ebx, ecx, edx and mm0-mm5 as scratch.  mm6 is shifted
;   down to 1 for the final rounding and shifted back to 2 before the
;   macro ends.
%macro SQERR8 1
        ; forward prediction -> mm0 (pels 0-3), mm1 (pels 4-7)
        mov         eax, pf
        mov         ebx, pfa
        mov         ecx, pfb
        mov         edx, pfc
        AVG4_ROUND  mm0, mm1, mm2, mm3, %1

        ; backward prediction -> mm2, mm3
        mov         eax, pb
        mov         ebx, pba
        mov         ecx, pbb
        mov         edx, pbc
        AVG4_ROUND  mm2, mm3, mm4, mm5, %1

        ; pred = (forward + backward + 1) >> 1
        paddw       mm0, mm2
        paddw       mm1, mm3
        psrlw       mm6, 1              ; rounding constant 2 -> 1
        paddw       mm0, mm6
        paddw       mm1, mm6
        psllw       mm6, 1              ; and back to 2
        psrlw       mm0, 1
        psrlw       mm1, 1

        ; diff = pred - p2, as signed words
        mov         eax, p2
        LOAD8_WIDEN mm2, mm3, [eax+%1]
        psubw       mm0, mm2
        psubw       mm1, mm3

        ; square and pair-wise add: two dword partial sums in mm0
        pmaddwd     mm0, mm0
        pmaddwd     mm1, mm1
        paddd       mm0, mm1

        ; fold both dwords into the running total
        movd        eax, mm0
        psrlq       mm0, 32
        movd        ebx, mm0
        add         esi, eax
        add         esi, ebx
%endmacro

        section .text
        global  bdist2_mmx

        align   32
bdist2_mmx:
        push        ebp                 ; save caller's frame pointer
        mov         ebp, esp            ; arguments now live at [ebp+8...]
        push        ebx
        push        ecx
        push        edx
        push        esi
        push        edi

        sub         esp, 32             ; room for the six local pointers

        mov         edx, hxb
        mov         eax, hxf
        mov         esi, lx

        ; pfa = pf + hxf
        mov         ecx, pf
        add         ecx, eax
        mov         pfa, ecx
        ; pfb = pf + lx * hyf
        mov         ecx, esi
        imul        ecx, hyf
        mov         ebx, pf
        add         ecx, ebx
        mov         pfb, ecx
        ; pfc = pfb + hxf
        add         eax, ecx
        mov         pfc, eax

        ; pba = pb + hxb
        mov         eax, pb
        add         eax, edx
        mov         pba, eax
        ; pbb = pb + lx * hyb
        mov         eax, esi
        imul        eax, hyb
        mov         ecx, pb
        add         eax, ecx
        mov         pbb, eax
        ; pbc = pbb + hxb
        add         edx, eax
        mov         pbc, edx

        xor         esi, esi            ; esi = s (accumulated sum)
        mov         eax, esi            ; result is 0 if we leave early

        mov         edi, h              ; edi = rows remaining
        test        edi, edi
        jle near    .finish             ; h <= 0: nothing to do

        pxor        mm7, mm7            ; mm7 = 0, used for byte->word widening
        pxor        mm6, mm6
        pcmpeqw     mm5, mm5            ; mm5 = -1 in every word
        psubw       mm6, mm5            ; mm6 =  1 in every word
        psllw       mm6, 1              ; mm6 =  2 in every word (rounding)

.next_row:
        SQERR8      0                   ; pels 0-7 of this row
        SQERR8      8                   ; pels 8-15 of this row

        ; step all nine pointers down one row
        mov         eax, lx
        add         p2,  eax
        add         pf,  eax
        add         pfa, eax
        add         pfb, eax
        add         pfc, eax
        add         pb,  eax
        add         pba, eax
        add         pbb, eax
        add         pbc, eax

        dec         edi
        jg near     .next_row
        mov         eax, esi            ; return value = accumulated sum

.finish:
        add         esp, 32             ; release the local pointer slots

        ; Restore saved registers in reverse push order
        pop         edi
        pop         esi
        pop         edx
        pop         ecx
        pop         ebx

        pop         ebp                 ; restore caller's frame pointer

        emms                            ; leave MMX state so the FPU is usable again
        ret