my first commit, i only added the file TEST to see how it works
[cinelerra_cv/mob.git] / mpeg2enc / predcomp_mmx.s
blob82fd01f7f1cfbd14804d79969eda795abe909e0e
1 ;;;
2 ;;; predcomp_00_mmx.s:
3 ;;;
4 ;;; Extended MMX prediction composition
5 ;;; routines handling the four different interpolation cases...
6 ;;;
7 ;;; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
9 ;;;
10 ;;; This program is free software; you can redistribute it and/or
11 ;;; modify it under the terms of the GNU General Public License
12 ;;; as published by the Free Software Foundation; either version 2
13 ;;; of the License, or (at your option) any later version.
14 ;;;
15 ;;; This program is distributed in the hope that it will be useful,
16 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;;; GNU General Public License for more details.
19 ;;;
20 ;;; You should have received a copy of the GNU General Public License
21 ;;; along with this program; if not, write to the Free Software
22 ;;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 ;;; 02111-1307, USA.
24 ;;;
25 ;;;
26 ;;;
28 ;;; The no interpolation case...
30 global predcomp_00_mmx
32 ;;; void predcomp_<ix><iy>_mmx(char *src,char *dst,int lx, int w, int h, int addflag);
34 ;;; ix - Interpolation in x iy - Interpolation in y
36 ;;; eax = pdst
37 ;;; ebx = psrc
38 ;;; ecx = h left
39 ;;; edx = lx;
40 ;;; edi = w (8 or 16)
43 ;;; mm1 = [1,1,1,1]W rounding constant added before each halving shift
44 ;;; mm0 = zero, used to unpack bytes into words
;;; ------------------------------------------------------------------
;;; void predcomp_00_mmx(char *src, char *dst, int lx, int w, int h,
;;;                      int addflag);
;;;
;;; No-interpolation case.  For each of h rows:
;;;   addflag == 0 :  dst[i] = src[i]
;;;   addflag != 0 :  dst[i] = (src[i] + dst[i] + 1) >> 1
;;; w == 8 handles 8 bytes per row; any other value handles 16.
;;; src and dst both advance by lx bytes after every row.
;;; Returns without touching dst when h <= 0.
;;;
;;; Arguments are read from the stack relative to ebp (32-bit code).
;;;
;;; Register roles inside the loop:
;;;   eax = pdst      ebx = psrc      ecx = rows left
;;;   edx = lx        edi = w         esi = addflag
;;;   mm0 = 0            (high bytes when unpacking bytes to words)
;;;   mm1 = [1,1,1,1]W   (rounding constant)
;;;   mm2..mm5           scratch
;;; Every integer register used is pushed on entry and popped on exit;
;;; emms is issued before returning.
;;; ------------------------------------------------------------------
        align   32
predcomp_00_mmx:
        push    ebp                     ; frame: arguments start at ebp+8
        mov     ebp, esp

        push    eax
        push    ebx
        push    ecx
        push    edx
        push    edi
        push    esi

        mov     eax, 0x00010001
        movd    mm1, eax
        punpckldq mm1, mm1              ; mm1 = [1,1,1,1]W
        pxor    mm0, mm0                ; mm0 = 0

        mov     ebx, [ebp+8]            ; psrc
        mov     eax, [ebp+12]           ; pdst
        mov     edx, [ebp+16]           ; lx
        mov     edi, [ebp+20]           ; w
        mov     ecx, [ebp+24]           ; h
        mov     esi, [ebp+28]           ; addflag

        test    ecx, ecx
        jle     near .done              ; h <= 0: the counted loop below would wrap
        jmp     .row                    ; hop over the alignment padding
        align   32
.row:
        ;; ---- bytes 0..7 of the row ----
        movq    mm4, [ebx]
        test    esi, esi
        jz      .store_lo               ; addflag clear: store src unchanged

        movq    mm5, mm4
        punpcklbw mm4, mm0              ; mm4 = src bytes 0..3 as words
        punpckhbw mm5, mm0              ; mm5 = src bytes 4..7 as words
        movq    mm2, [eax]
        movq    mm3, mm2
        punpcklbw mm2, mm0              ; mm2 = dst bytes 0..3 as words
        punpckhbw mm3, mm0              ; mm3 = dst bytes 4..7 as words
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1                ; +1 so the shift rounds upward
        paddw   mm5, mm1
        psrlw   mm4, 1
        psrlw   mm5, 1
        packuswb mm4, mm5               ; back to 8 bytes
.store_lo:
        movq    [eax], mm4

        cmp     edi, 8
        je      .next_row               ; 8-wide block: row finished

        ;; ---- bytes 8..15 of the row ----
        movq    mm4, [ebx+8]
        test    esi, esi
        jz      .store_hi

        movq    mm5, mm4
        punpcklbw mm4, mm0
        punpckhbw mm5, mm0
        movq    mm2, [eax+8]
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1
        paddw   mm5, mm1
        psrlw   mm4, 1
        psrlw   mm5, 1
        packuswb mm4, mm5
.store_hi:
        movq    [eax+8], mm4

.next_row:
        add     eax, edx                ; pdst += lx
        add     ebx, edx                ; psrc += lx
        sub     ecx, 1
        jnz     near .row

.done:
        pop     esi
        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        emms
        ret
140 ;;; The x-axis interpolation case...
142 global predcomp_10_mmx
;;; ------------------------------------------------------------------
;;; void predcomp_10_mmx(char *src, char *dst, int lx, int w, int h,
;;;                      int addflag);
;;;
;;; x-axis interpolation case.  For each of h rows:
;;;   p      = (src[i] + src[i+1] + 1) >> 1
;;;   dst[i] = p                          when addflag == 0
;;;   dst[i] = (p + dst[i] + 1) >> 1      when addflag != 0
;;; w == 8 handles 8 bytes per row; any other value handles 16.
;;; One byte beyond the last column of each source row is read.
;;; src and dst both advance by lx bytes after every row.
;;; Returns without touching dst when h <= 0.
;;;
;;; Arguments are read from the stack relative to ebp (32-bit code).
;;;
;;; Register roles inside the loop:
;;;   eax = pdst      ebx = psrc      ecx = rows left
;;;   edx = lx        edi = w         esi = addflag
;;;   mm0 = 0            (high bytes when unpacking bytes to words)
;;;   mm1 = [1,1,1,1]W   (rounding constant)
;;;   mm4/mm5            running result, low / high four pixels
;;;   mm2/mm3            second operand, low / high four pixels
;;; Every integer register used is pushed on entry and popped on exit;
;;; emms is issued before returning.
;;; ------------------------------------------------------------------
        align   32
predcomp_10_mmx:
        push    ebp                     ; frame: arguments start at ebp+8
        mov     ebp, esp

        push    eax
        push    ebx
        push    ecx
        push    edx
        push    edi
        push    esi

        mov     eax, 0x00010001
        movd    mm1, eax
        punpckldq mm1, mm1              ; mm1 = [1,1,1,1]W
        pxor    mm0, mm0                ; mm0 = 0

        mov     ebx, [ebp+8]            ; psrc
        mov     eax, [ebp+12]           ; pdst
        mov     edx, [ebp+16]           ; lx
        mov     edi, [ebp+20]           ; w
        mov     ecx, [ebp+24]           ; h
        mov     esi, [ebp+28]           ; addflag

        test    ecx, ecx
        jle     near .done              ; h <= 0: the counted loop below would wrap
        jmp     .row                    ; hop over the alignment padding
        align   32
.row:
        ;; ---- bytes 0..7: average each byte with its right neighbour ----
        movq    mm4, [ebx]
        movq    mm5, mm4
        punpcklbw mm4, mm0
        punpckhbw mm5, mm0
        movq    mm2, [ebx+1]            ; same row, shifted one byte right
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0

        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1                ; +1 so the shift rounds upward
        paddw   mm5, mm1
        psrlw   mm4, 1
        psrlw   mm5, 1

        test    esi, esi
        jz      .store_lo               ; addflag clear: store the prediction

        movq    mm2, [eax]              ; average the prediction with dst
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1
        paddw   mm5, mm1
        psrlw   mm4, 1
        psrlw   mm5, 1
.store_lo:
        packuswb mm4, mm5               ; back to 8 bytes
        movq    [eax], mm4

        cmp     edi, 8
        je      .next_row               ; 8-wide block: row finished

        ;; ---- bytes 8..15: same computation on the second half ----
        movq    mm4, [ebx+8]
        movq    mm5, mm4
        punpcklbw mm4, mm0
        punpckhbw mm5, mm0
        movq    mm2, [ebx+9]
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0

        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1
        paddw   mm5, mm1
        psrlw   mm4, 1
        psrlw   mm5, 1

        test    esi, esi
        jz      .store_hi

        movq    mm2, [eax+8]
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1
        paddw   mm5, mm1
        psrlw   mm4, 1
        psrlw   mm5, 1
.store_hi:
        packuswb mm4, mm5
        movq    [eax+8], mm4

.next_row:
        add     eax, edx                ; pdst += lx
        add     ebx, edx                ; psrc += lx
        sub     ecx, 1
        jnz     near .row

.done:
        pop     esi
        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        emms
        ret
259 ;;; The y-axis interpolation case...
261 global predcomp_01_mmx
;;; ------------------------------------------------------------------
;;; void predcomp_01_mmx(char *src, char *dst, int lx, int w, int h,
;;;                      int addflag);
;;;
;;; y-axis interpolation case.  For each of h rows:
;;;   p      = (src[i] + src[i+lx] + 1) >> 1
;;;   dst[i] = p                          when addflag == 0
;;;   dst[i] = (p + dst[i] + 1) >> 1      when addflag != 0
;;; w == 8 handles 8 bytes per row; any other value handles 16.
;;; The source row following the current one is read on every
;;; iteration, so h+1 source rows are accessed in total.
;;; src and dst both advance by lx bytes after every row.
;;; Returns without touching dst when h <= 0.
;;;
;;; Arguments are read from the stack relative to ebp (32-bit code).
;;;
;;; Register roles inside the loop:
;;;   eax = pdst      ebx = psrc (current row)   ecx = rows left
;;;   edx = lx        edi = w                    esi = addflag
;;;   mm0 = 0            (high bytes when unpacking bytes to words)
;;;   mm1 = [1,1,1,1]W   (rounding constant)
;;;   mm4/mm5            running result, low / high four pixels
;;;   mm2/mm3            second operand, low / high four pixels
;;; Every integer register used is pushed on entry and popped on exit;
;;; emms is issued before returning.
;;; ------------------------------------------------------------------
        align   32
predcomp_01_mmx:
        push    ebp                     ; frame: arguments start at ebp+8
        mov     ebp, esp

        push    eax
        push    ebx
        push    ecx
        push    edx
        push    edi
        push    esi

        mov     eax, 0x00010001
        movd    mm1, eax
        punpckldq mm1, mm1              ; mm1 = [1,1,1,1]W
        pxor    mm0, mm0                ; mm0 = 0

        mov     ebx, [ebp+8]            ; psrc
        mov     eax, [ebp+12]           ; pdst
        mov     edx, [ebp+16]           ; lx
        mov     edi, [ebp+20]           ; w
        mov     ecx, [ebp+24]           ; h
        mov     esi, [ebp+28]           ; addflag

        test    ecx, ecx
        jle     near .done              ; h <= 0: the counted loop below would wrap
        jmp     .row                    ; hop over the alignment padding
        align   32
.row:
        ;; ---- bytes 0..7: average this row with the row below ----
        movq    mm4, [ebx]
        movq    mm5, mm4
        punpcklbw mm4, mm0
        punpckhbw mm5, mm0
        movq    mm2, [ebx+edx]          ; same columns, next source row
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0

        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1                ; +1 so the shift rounds upward
        paddw   mm5, mm1
        psrlw   mm4, 1
        psrlw   mm5, 1

        test    esi, esi
        jz      .store_lo               ; addflag clear: store the prediction

        movq    mm2, [eax]              ; average the prediction with dst
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1
        paddw   mm5, mm1
        psrlw   mm4, 1
        psrlw   mm5, 1
.store_lo:
        packuswb mm4, mm5               ; back to 8 bytes
        movq    [eax], mm4

        cmp     edi, 8
        je      .next_row               ; 8-wide block: row finished

        ;; ---- bytes 8..15: same computation on the second half ----
        movq    mm4, [ebx+8]
        movq    mm5, mm4
        punpcklbw mm4, mm0
        punpckhbw mm5, mm0
        movq    mm2, [ebx+edx+8]
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0

        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1
        paddw   mm5, mm1
        psrlw   mm4, 1
        psrlw   mm5, 1

        test    esi, esi
        jz      .store_hi

        movq    mm2, [eax+8]
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1
        paddw   mm5, mm1
        psrlw   mm4, 1
        psrlw   mm5, 1
.store_hi:
        packuswb mm4, mm5
        movq    [eax+8], mm4

.next_row:
        add     ebx, edx                ; psrc += lx (single advance per row)
        add     eax, edx                ; pdst += lx
        sub     ecx, 1
        jnz     near .row

.done:
        pop     esi
        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        emms
        ret
382 ;;; The x-axis and y-axis interpolation case...
384 global predcomp_11_mmx
386 ;;; mm0 = [0,0,0,0]W
387 ;;; mm1 = [1,1,1,1]W
388 ;;; mm2 = [2,2,2,2]W
389 align 32
390 predcomp_11_mmx:
391 push ebp ; save frame pointer
392 mov ebp, esp ; link
394 push eax
395 push ebx
396 push ecx
397 push edx
398 push edi
399 push esi
401 mov eax, 0x00020002
402 movd mm2, eax
403 punpckldq mm2,mm2
404 mov eax, 0x00010001
405 movd mm1, eax
406 punpckldq mm1,mm1
407 pxor mm0, mm0
409 mov ebx, [ebp+8] ; get psrc
410 mov eax, [ebp+12] ; get pdst
411 mov edx, [ebp+16] ; get lx
412 mov edi, [ebp+20] ; get w
413 mov ecx, [ebp+24] ; get h
414 mov esi, [ebp+28] ; Addflags
415 ;; Extend addflag into bit-mask
418 jmp predrow11 ; align for speed
419 align 32
420 predrow11:
421 movq mm4, [ebx] ; mm4 and mm6 accumulate partial sums for interp.
422 movq mm6, mm4
423 punpcklbw mm4, mm0
424 punpckhbw mm6, mm0
426 movq mm5, [ebx+1]
427 movq mm7, mm5
428 punpcklbw mm5, mm0
429 paddw mm4, mm5
430 punpckhbw mm7, mm0
431 paddw mm6, mm7
433 add ebx, edx ; update pointer to next row
435 movq mm5, [ebx] ; first 8 bytes 1st row: avg src in x
436 movq mm7, mm5
437 punpcklbw mm5, mm0 ; Accumulate partial interpolation
438 paddw mm4, mm5
439 punpckhbw mm7, mm0
440 paddw mm6, mm7
442 movq mm5, [ebx+1]
443 movq mm7, mm5
444 punpcklbw mm5, mm0
445 paddw mm4, mm5
446 punpckhbw mm7, mm0
447 paddw mm6, mm7
449 ;; Now round
450 paddw mm4, mm2
451 paddw mm6, mm2
452 psrlw mm4, 2
453 psrlw mm6, 2
455 cmp esi, 0
456 jz noadd11
458 movq mm5, [eax] ; Add
459 movq mm7, mm5
460 punpcklbw mm5, mm0
461 punpckhbw mm7, mm0
462 paddw mm4, mm5 ; Average mm4/mm6 and mm5/mm7
463 paddw mm6, mm7
464 paddw mm4, mm1
465 paddw mm6, mm1
466 psrlw mm4, 1
467 psrlw mm6, 1
469 noadd11:
470 packuswb mm4, mm6
471 movq [eax], mm4
473 cmp edi, 8
474 jz near eightwide11
476 sub ebx, edx ; Back to first row...
478 movq mm4, [ebx+8] ; mm4 and mm6 accumulate partial sums for interp.
479 movq mm6, mm4
480 punpcklbw mm4, mm0
481 punpckhbw mm6, mm0
483 movq mm5, [ebx+9]
484 movq mm7, mm5
485 punpcklbw mm5, mm0
486 paddw mm4, mm5
487 punpckhbw mm7, mm0
488 paddw mm6, mm7
490 add ebx, edx ; update pointer to next row
492 movq mm5, [ebx+8] ; first 8 bytes 1st row: avg src in x
493 movq mm7, mm5
494 punpcklbw mm5, mm0 ; Accumulate partial interpolation
495 paddw mm4, mm5
496 punpckhbw mm7, mm0
497 paddw mm6, mm7
499 movq mm5, [ebx+9]
500 movq mm7, mm5
501 punpcklbw mm5, mm0
502 paddw mm4, mm5
503 punpckhbw mm7, mm0
504 paddw mm6, mm7
506 ;; Now round
507 paddw mm4, mm2
508 paddw mm6, mm2
509 psrlw mm4, 2
510 psrlw mm6, 2
512 cmp esi, 0
513 jz noadd11w
515 movq mm5, [eax+8] ; Add and average
516 movq mm7, mm5
517 punpcklbw mm5, mm0
518 punpckhbw mm7, mm0
519 paddw mm4, mm5 ; Average mm4/mm6 and mm5/mm7
520 paddw mm6, mm7
521 paddw mm4, mm1
522 paddw mm6, mm1
523 psrlw mm4, 1
524 psrlw mm6, 1
525 noadd11w:
526 packuswb mm4, mm6
527 movq [eax+8], mm4
529 eightwide11:
530 add eax, edx ; update pointer to next row
533 sub ecx, 1 ; check h left
534 jnz near predrow11
536 pop esi
537 pop edi
538 pop edx
539 pop ecx
540 pop ebx
541 pop eax
542 pop ebp
543 emms