4 ;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblocks
5 ;;; (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel)
7 ; dist1_* Original Copyright (C) 2000 Chris Atenasio <chris@crud.net>
8 ; Enhancements and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
11 ; This program is free software; you can redistribute it and/or
12 ; modify it under the terms of the GNU General Public License
13 ; as published by the Free Software Foundation; either version 2
14 ; of the License, or (at your option) any later version.
16 ; This program is distributed in the hope that it will be useful,
17 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ; GNU General Public License for more details.
21 ; You should have received a copy of the GNU General Public License
22 ; along with this program; if not, write to the Free Software
23 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
;-----------------------------------------------------------------------
; dist1_mmx -- accumulates in mm0 the absolute differences between the
; 16 bytes of each p1 row and the matching p2 row; the total is returned
; in eax. Arguments are read from the stack via ebp.
; NOTE(review): the leading numbers are listing line numbers and they
; skip (52 -> 55, 74 -> 79, 101 -> 113, ...), so the routine's label,
; `mov ebp, esp`, the loop label/branch, the absolute-difference
; arithmetic and the rest of the epilogue are presumably on lines that
; this listing omits -- confirm against the complete source.
;-----------------------------------------------------------------------
31 ; int dist1_mmx(unsigned char *blk1,unsigned char *blk2,int lx,int h, int distlim);
32 ; N.b. distlim is *ignored* as testing for it is more expensive than the
33 ; occasional saving by aborting the computation early...
34 ; esi = p1 (init: blk1)
35 ; edi = p2 (init: blk2)
37 ; ecx = rowsleft (init: h)
40 ; mm0 = distance accumulators (4 words)
52 push ebp ; save caller's frame pointer
55 push ebx ; save registers (callee-saves convention in
56 push ecx ; x86 GCC, it seems)
61 pxor mm0, mm0 ; zero accumulators
63 mov esi, [ebp+8] ; esi = blk1 (p1)
64 mov edi, [ebp+12] ; edi = blk2 (p2)
65 mov edx, [ebp+16] ; edx = lx (added to esi once per row below)
66 mov ecx, [ebp+20] ; ecx = h (rowsleft)
67 ;mov ebx, [ebp+24] ; distlim (left commented out: distlim is ignored, see N.b. above)
71 movq mm4, [esi] ; load first 8 bytes of p1 row
72 movq mm5, [edi] ; load first 8 bytes of p2 row
74 movq mm7, mm4 ; copy the p1 bytes; first step of mm5 = abs(mm4-mm5)
79 ;; Add the abs(mm4-mm5) bytes to the accumulators
80 movq mm2, [esi+8] ; load second 8 bytes of p1 row (interleaved)
81 movq mm7,mm5 ; copy mm5; original note: mm7 := [i : B0..3, mm1]W
83 movq mm3, [edi+8] ; load second 8 bytes of p2 row (interleaved)
88 ;; This is logically where the mm2, mm3 loads would go...
90 movq mm7, mm2 ; copy the p1 bytes; first step of mm3 = abs(mm2-mm3)
95 ;; Add the abs(mm2-mm3) bytes to the accumulators
96 movq mm7,mm3 ; copy mm3; original note: mm7 := [i : B0..3, mm1]W
101 add esi, edx ; p1 += lx: advance to the next row
113 ;; Sum the Accumulators
114 movq mm5, mm0 ; copy mm0; first step of mm5 := [W0+W2,W1+W3, mm0]
119 movq mm7, mm4 ; copy mm4; original note: mm6 := [W0+W2+W1+W3, mm0] (names mm6, writes mm7 -- NOTE(review): confirm)
122 movd eax, mm4 ; eax := low 32 bits of mm4 (return value)
133 emms ; clear mmx registers
136 ;;; dist1_01_mmx.s: mmx1 optimised 7bit*8 word absolute difference sum
137 ;;; We're reduced to seven bits as otherwise we also have to mess
138 ;;; horribly with carries and signed only comparisons make the code
139 ;;; simply enormous (and probably barely faster than a simple loop).
140 ;;; Since signals with a bona-fide 8bit res will be rare we simply
141 ;;; take the precision hit...
142 ;;; Actually we don't worry about carries from the low-order bits
143 ;;; either so 1/4 of the time we'll be 1 too low...
145 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
148 ; This program is free software; you can redistribute it and/or
149 ; modify it under the terms of the GNU General Public License
150 ; as published by the Free Software Foundation; either version 2
151 ; of the License, or (at your option) any later version.
153 ; This program is distributed in the hope that it will be useful,
154 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
155 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
156 ; GNU General Public License for more details.
158 ; You should have received a copy of the GNU General Public License
159 ; along with this program; if not, write to the Free Software
160 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
; dist1_01_mmx: accumulates, in the four word lanes of mm0, the absolute
; differences between p2 and p1 combined with its right-hand neighbour
; (p1+1), 16 bytes per row; the result is returned in eax.
; NOTE(review): the leading listing numbers skip, so the labels (including
; nextrowmm01), the loop branch and several arithmetic steps are not visible
; here. Comments saying "presumably" describe what those omitted lines
; appear to do -- confirm against the complete source.
168 ; int dist1_01_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
170 ; esi = p1 (init: p1 argument)
171 ; edi = p2 (init: p2 argument)
172 ; ecx = rowsleft (init: h)
175 ; mm0 = distance accumulators (4 words)
179 ; mm4 = temp 4 bytes in words interpolating p1, p1+1
180 ; mm5 = temp 4 bytes in words from p2
181 ; mm6 = temp comparison bit mask p1,p2
182 ; mm7 = temp comparison bit mask p2,p1
187 push ebp ; save caller's frame pointer
188 mov ebp, esp ; frame pointer, so the arguments can be read at [ebp+8]..[ebp+20]
190 push ebx ; save registers (callee-saves convention in
191 push ecx ; x86 GCC, it seems)
196 pxor mm0, mm0 ; zero accumulators
198 mov esi, [ebp+8] ; esi = p1
199 mov edi, [ebp+12] ; edi = p2
200 mov edx, [ebp+16] ; edx = lx (added to esi once per row)
201 mov ecx, [ebp+20] ; rowsleft := h
202 jmp nextrowmm01 ; jump to nextrowmm01 (label not shown in this listing)
207 ;; First 8 bytes of row
210 ;; First 4 bytes of 8
212 movq mm4, [esi] ; load 8 bytes of p1; the low 4 are widened below
214 movq mm2, mm4 ; mm2 keeps all 8 bytes for the second half
215 punpcklbw mm4, mm7 ; low 4 bytes of p1 -> words (mm7 presumably zero here)
217 movq mm6, [esi+1] ; load 8 bytes of p1+1
218 movq mm3, mm6 ; mm3 keeps all 8 bytes for the second half
220 paddw mm4, mm6 ; mm4 += mm6 per word: p1 + (p1+1) for the first 4 bytes (mm6 presumably widened on an omitted line)
223 movq mm5, [edi] ; load 8 bytes of p2
228 pcmpgtw mm7,mm5 ; mm7 := per-word mask [W0..3] of mm7 > mm5 (mm7 presumably a copy of mm4)
230 movq mm6,mm4 ; copy mm4; first step of mm6 := (mm4-mm5) where mm4 > mm5
234 paddw mm0, mm6 ; add mm6 to the accumulator words
236 movq mm6,mm5 ; copy mm5; first step of mask mm6 := (mm5 > mm4)
238 psubw mm5,mm4 ; mm5 := mm5-mm4 per word [W0..3]; kept where mm5 > mm4 (masking presumably on an omitted line)
241 paddw mm0, mm5 ; add mm5 to the accumulator words
243 ;; Second 4 bytes of 8
245 movq mm4, mm2 ; reload the saved 8 bytes of p1 (second 4 presumably widened on an omitted line)
248 movq mm6, mm3 ; reload the saved 8 bytes of p1+1 (second 4 presumably widened on an omitted line)
251 paddw mm4, mm6 ; mm4 += mm6 per word: p1 + (p1+1) for the second 4 bytes
254 movq mm5, mm1 ; reload p2 bytes from mm1 (mm1 presumably saved on an omitted line)
258 pcmpgtw mm7,mm5 ; mm7 := per-word mask [W0..3] of mm7 > mm5 (mm7 presumably a copy of mm4)
260 movq mm6,mm4 ; copy mm4; first step of mm6 := (mm4-mm5) where mm4 > mm5
264 paddw mm0, mm6 ; add mm6 to the accumulator words
266 movq mm6,mm5 ; copy mm5; first step of mask mm6 := (mm5 > mm4)
268 psubw mm5,mm4 ; mm5 := mm5-mm4 per word [W0..3]; kept where mm5 > mm4 (masking presumably on an omitted line)
271 paddw mm0, mm5 ; add mm5 to the accumulator words
275 ;; Second 8 bytes of row
277 ;; First 4 bytes of 8
279 movq mm4, [esi+8] ; load 8 bytes of p1+8; the low 4 are widened below
281 movq mm2, mm4 ; mm2 keeps all 8 bytes for the second half
282 punpcklbw mm4, mm7 ; low 4 bytes of p1+8 -> words (mm7 presumably zero here)
284 movq mm6, [esi+9] ; load 8 bytes of p1+9
285 movq mm3, mm6 ; mm3 keeps all 8 bytes for the second half
287 paddw mm4, mm6 ; mm4 += mm6 per word: (p1+8) + (p1+9) for the first 4 bytes (mm6 presumably widened on an omitted line)
290 movq mm5, [edi+8] ; load 8 bytes of p2+8
295 pcmpgtw mm7,mm5 ; mm7 := per-word mask [W0..3] of mm7 > mm5 (mm7 presumably a copy of mm4)
297 movq mm6,mm4 ; copy mm4; first step of mm6 := (mm4-mm5) where mm4 > mm5
301 paddw mm0, mm6 ; add mm6 to the accumulator words
303 movq mm6,mm5 ; copy mm5; first step of mask mm6 := (mm5 > mm4)
305 psubw mm5,mm4 ; mm5 := mm5-mm4 per word [W0..3]; kept where mm5 > mm4 (masking presumably on an omitted line)
308 paddw mm0, mm5 ; add mm5 to the accumulator words
310 ;; Second 4 bytes of 8
312 movq mm4, mm2 ; reload the saved 8 bytes of p1+8 (second 4 presumably widened on an omitted line)
315 movq mm6, mm3 ; reload the saved 8 bytes of p1+9 (second 4 presumably widened on an omitted line)
318 paddw mm4, mm6 ; mm4 += mm6 per word: (p1+8) + (p1+9) for the second 4 bytes
321 movq mm5, mm1 ; reload p2 bytes from mm1 (mm1 presumably saved on an omitted line)
325 pcmpgtw mm7,mm5 ; mm7 := per-word mask [W0..3] of mm7 > mm5 (mm7 presumably a copy of mm4)
327 movq mm6,mm4 ; copy mm4; first step of mm6 := (mm4-mm5) where mm4 > mm5
331 paddw mm0, mm6 ; add mm6 to the accumulator words
333 movq mm6,mm5 ; copy mm5; first step of mask mm6 := (mm5 > mm4)
335 psubw mm5,mm4 ; mm5 := mm5-mm4 per word [W0..3]; kept where mm5 > mm4 (masking presumably on an omitted line)
338 paddw mm0, mm5 ; add mm5 to the accumulator words
342 ;; Loop termination condition... and stepping
345 add esi, edx ; p1 += lx: advance to the next row
349 test ecx, ecx ; set flags from rowsleft (the branch itself is not shown in this listing)
353 ;; Sum the Accumulators
360 movd eax, mm0 ; eax := low 32 bits of mm0 (return value)
369 pop ebp ; restore caller's frame pointer
371 emms ; clear mmx registers
372 ret ; we now return you to your regular programming
374 ;;; dist1_01_mmx.s: mmx1 optimised 7bit*8 word absolute difference sum
375 ;;; We're reduced to seven bits as otherwise we also have to mess
376 ;;; horribly with carries and signed only comparisons make the code
377 ;;; simply enormous (and probably barely faster than a simple loop).
378 ;;; Since signals with a bona-fide 8bit res will be rare we simply
379 ;;; take the precision hit...
380 ;;; Actually we don't worry about carries from the low-order bits
381 ;;; either so 1/4 of the time we'll be 1 too low...
383 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
386 ; This program is free software; you can redistribute it and/or
387 ; modify it under the terms of the GNU General Public License
388 ; as published by the Free Software Foundation; either version 2
389 ; of the License, or (at your option) any later version.
391 ; This program is distributed in the hope that it will be useful,
392 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
393 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
394 ; GNU General Public License for more details.
396 ; You should have received a copy of the GNU General Public License
397 ; along with this program; if not, write to the Free Software
398 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
;-----------------------------------------------------------------------
; dist1_10_mmx -- accumulates, in the four word lanes of mm0, the
; absolute differences between p2 and p1 combined with the bytes at
; p1+lx (read through ebx), 16 bytes per row; the result is returned
; in eax.
; NOTE(review): the leading numbers are listing line numbers and they
; skip, so the labels (including nextrowmm10), the setup of ebx, the
; loop branch and several arithmetic steps are presumably on lines this
; listing omits. Comments saying "presumably" describe those omitted
; lines and should be confirmed against the complete source.
;-----------------------------------------------------------------------
406 ; int dist1_10_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
408 ; esi = p1 (init: p1 argument)
409 ; edi = p2 (init: p2 argument)
411 ; ecx = rowsleft (init: h)
414 ; mm0 = distance accumulators (4 words)
418 ; mm4 = temp 4 bytes in words interpolating p1, p1+lx
419 ; mm5 = temp 4 bytes in words from p2
420 ; mm6 = temp comparison bit mask p1,p2
421 ; mm7 = temp comparison bit mask p2,p1
426 push ebp ; save caller's frame pointer
427 mov ebp, esp ; frame pointer, so the arguments can be read at [ebp+8]..[ebp+20]
429 push ebx ; save registers (callee-saves convention in
430 push ecx ; x86 GCC, it seems)
435 pxor mm0, mm0 ; zero accumulators
437 mov esi, [ebp+8] ; esi = p1
438 mov edi, [ebp+12] ; edi = p2
439 mov edx, [ebp+16] ; edx = lx (added to esi once per row)
440 mov ecx, [ebp+20] ; rowsleft := h
443 jmp nextrowmm10 ; jump to nextrowmm10 (label not shown in this listing)
448 ;; First 8 bytes of row
451 ;; First 4 bytes of 8
453 movq mm4, [esi] ; load 8 bytes of p1; the low 4 are widened below
455 movq mm2, mm4 ; mm2 keeps all 8 bytes for the second half
456 punpcklbw mm4, mm7 ; low 4 bytes of p1 -> words (mm7 presumably zero here)
458 movq mm6, [ebx] ; load 8 bytes of p1+lx (ebx presumably set to p1+lx on an omitted line)
459 movq mm3, mm6 ; mm3 keeps all 8 bytes for the second half
461 paddw mm4, mm6 ; mm4 += mm6 per word: p1 + (p1+lx) for the first 4 bytes (mm6 presumably widened on an omitted line)
464 movq mm5, [edi] ; load 8 bytes of p2
469 pcmpgtw mm7,mm5 ; mm7 := per-word mask [W0..3] of mm7 > mm5 (mm7 presumably a copy of mm4)
471 movq mm6,mm4 ; copy mm4; first step of mm6 := (mm4-mm5) where mm4 > mm5
475 paddw mm0, mm6 ; add mm6 to the accumulator words
477 movq mm6,mm5 ; copy mm5; first step of mask mm6 := (mm5 > mm4)
479 psubw mm5,mm4 ; mm5 := mm5-mm4 per word [W0..3]; kept where mm5 > mm4 (masking presumably on an omitted line)
482 paddw mm0, mm5 ; add mm5 to the accumulator words
484 ;; Second 4 bytes of 8
486 movq mm4, mm2 ; reload the saved 8 bytes of p1 (second 4 presumably widened on an omitted line)
489 movq mm6, mm3 ; reload the saved 8 bytes of p1+lx (second 4 presumably widened on an omitted line)
492 paddw mm4, mm6 ; mm4 += mm6 per word: p1 + (p1+lx) for the second 4 bytes
495 movq mm5, mm1 ; reload p2 bytes from mm1 (mm1 presumably saved on an omitted line)
499 pcmpgtw mm7,mm5 ; mm7 := per-word mask [W0..3] of mm7 > mm5 (mm7 presumably a copy of mm4)
501 movq mm6,mm4 ; copy mm4; first step of mm6 := (mm4-mm5) where mm4 > mm5
505 paddw mm0, mm6 ; add mm6 to the accumulator words
507 movq mm6,mm5 ; copy mm5; first step of mask mm6 := (mm5 > mm4)
509 psubw mm5,mm4 ; mm5 := mm5-mm4 per word [W0..3]; kept where mm5 > mm4 (masking presumably on an omitted line)
512 paddw mm0, mm5 ; add mm5 to the accumulator words
516 ;; Second 8 bytes of row
518 ;; First 4 bytes of 8
520 movq mm4, [esi+8] ; load 8 bytes of p1+8; the low 4 are widened below
522 movq mm2, mm4 ; mm2 keeps all 8 bytes for the second half
523 punpcklbw mm4, mm7 ; low 4 bytes of p1+8 -> words (mm7 presumably zero here)
525 movq mm6, [ebx+8] ; load 8 bytes of p1+lx+8
526 movq mm3, mm6 ; mm3 keeps all 8 bytes for the second half
528 paddw mm4, mm6 ; mm4 += mm6 per word: (p1+8) + (p1+lx+8) for the first 4 bytes (mm6 presumably widened on an omitted line)
531 movq mm5, [edi+8] ; load 8 bytes of p2+8
536 pcmpgtw mm7,mm5 ; mm7 := per-word mask [W0..3] of mm7 > mm5 (mm7 presumably a copy of mm4)
538 movq mm6,mm4 ; copy mm4; first step of mm6 := (mm4-mm5) where mm4 > mm5
542 paddw mm0, mm6 ; add mm6 to the accumulator words
544 movq mm6,mm5 ; copy mm5; first step of mask mm6 := (mm5 > mm4)
546 psubw mm5,mm4 ; mm5 := mm5-mm4 per word [W0..3]; kept where mm5 > mm4 (masking presumably on an omitted line)
549 paddw mm0, mm5 ; add mm5 to the accumulator words
551 ;; Second 4 bytes of 8
553 movq mm4, mm2 ; reload the saved 8 bytes of p1+8 (second 4 presumably widened on an omitted line)
556 movq mm6, mm3 ; reload the saved 8 bytes of p1+lx+8 (second 4 presumably widened on an omitted line)
559 paddw mm4, mm6 ; mm4 += mm6 per word: (p1+8) + (p1+lx+8) for the second 4 bytes
562 movq mm5, mm1 ; reload p2 bytes from mm1 (mm1 presumably saved on an omitted line)
566 pcmpgtw mm7,mm5 ; mm7 := per-word mask [W0..3] of mm7 > mm5 (mm7 presumably a copy of mm4)
568 movq mm6,mm4 ; copy mm4; first step of mm6 := (mm4-mm5) where mm4 > mm5
572 paddw mm0, mm6 ; add mm6 to the accumulator words
574 movq mm6,mm5 ; copy mm5; first step of mask mm6 := (mm5 > mm4)
576 psubw mm5,mm4 ; mm5 := mm5-mm4 per word [W0..3]; kept where mm5 > mm4 (masking presumably on an omitted line)
579 paddw mm0, mm5 ; add mm5 to the accumulator words
583 ;; Loop termination condition... and stepping
586 add esi, edx ; p1 += lx: advance to the next row
591 test ecx, ecx ; set flags from rowsleft (the branch itself is not shown in this listing)
594 ;; Sum the Accumulators
601 movd eax, mm0 ; eax := low 32 bits of mm0 (return value)
611 pop ebp ; restore caller's frame pointer
613 emms ; clear mmx registers
614 ret ; we now return you to your regular programming
618 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
621 ; This program is free software; you can redistribute it and/or
622 ; modify it under the terms of the GNU General Public License
623 ; as published by the Free Software Foundation; either version 2
624 ; of the License, or (at your option) any later version.
626 ; This program is distributed in the hope that it will be useful,
627 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
628 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
629 ; GNU General Public License for more details.
631 ; You should have received a copy of the GNU General Public License
632 ; along with this program; if not, write to the Free Software
633 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
641 ; int dist1_11_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
643 ; esi = p1 (init: blk1)
644 ; edi = p2 (init: blk2)
646 ; ecx = rowsleft (init: h)
649 ; mm0 = distance accumulators (4 words)
653 ; I'd love to find someplace to stash p1+1 and p1+lx+1's bytes
654 ; but I don't think that's going to happen in iA32-land...
655 ; mm4 = temp 4 bytes in words interpolating p1, p1+1, p1+lx, p1+lx+1
656 ; mm5 = temp 4 bytes in words from p2
657 ; mm6 = temp comparison bit mask p1,p2
658 ; mm7 = temp comparison bit mask p2,p1
; dist1_11_mmx body: accumulates, in the four word lanes of mm0, the absolute
; differences between p2 and a value built from the bytes at p1, p1+1, p1+lx
; and p1+lx+1 (word sum shifted right by 2), 16 bytes per row; the result is
; returned in eax.
; NOTE(review): the leading listing numbers skip, so the labels (including
; nextrowmm11), the setup of ebx, the loop branch and several arithmetic steps
; are not visible here. Comments saying "presumably" describe what those
; omitted lines appear to do -- confirm against the complete source.
663 push ebp ; save caller's frame pointer
664 mov ebp, esp ; frame pointer, so the arguments can be read at [ebp+8]..[ebp+20]
666 push ebx ; save registers (callee-saves convention in
667 push ecx ; x86 GCC, it seems)
672 pxor mm0, mm0 ; zero accumulators
674 mov esi, [ebp+8] ; esi = p1
675 mov edi, [ebp+12] ; edi = p2
676 mov edx, [ebp+16] ; edx = lx (added to esi once per row)
677 mov ecx, [ebp+20] ; rowsleft := h
680 jmp nextrowmm11 ; jump to nextrowmm11 (label not shown in this listing)
685 ;; First 8 bytes of row
688 ;; First 4 bytes of 8
690 movq mm4, [esi] ; load 8 bytes of p1; the low 4 are widened below
692 movq mm2, mm4 ; mm2 keeps all 8 bytes for the second half
693 punpcklbw mm4, mm7 ; low 4 bytes of p1 -> words (mm7 presumably zero here)
695 movq mm6, [ebx] ; load 8 bytes of p1+lx (ebx presumably set to p1+lx on an omitted line)
696 movq mm3, mm6 ; mm3 keeps all 8 bytes for the second half
701 movq mm5, [esi+1] ; load 8 bytes of p1+1
702 punpcklbw mm5, mm7 ; low 4 bytes of p1+1 -> words
704 movq mm6, [ebx+1] ; load 8 bytes of p1+lx+1
708 psrlw mm4, 2 ; mm4 >>= 2 per word: divide the sum by 4 (the additions are presumably on omitted lines)
710 movq mm5, [edi] ; load 8 bytes of p2
715 pcmpgtw mm7,mm5 ; mm7 := per-word mask [W0..3] of mm7 > mm5 (mm7 presumably a copy of mm4)
717 movq mm6,mm4 ; copy mm4; first step of mm6 := (mm4-mm5) where mm4 > mm5
721 paddw mm0, mm6 ; add mm6 to the accumulator words
723 movq mm6,mm5 ; copy mm5; first step of mask mm6 := (mm5 > mm4)
725 psubw mm5,mm4 ; mm5 := mm5-mm4 per word [W0..3]; kept where mm5 > mm4 (masking presumably on an omitted line)
728 paddw mm0, mm5 ; add mm5 to the accumulator words
730 ;; Second 4 bytes of 8
732 movq mm4, mm2 ; reload the saved 8 bytes of p1 (second 4 presumably widened on an omitted line)
735 movq mm6, mm3 ; reload the saved 8 bytes of p1+lx (second 4 presumably widened on an omitted line)
739 movq mm5, [esi+1] ; load 8 bytes of p1+1 again (not stashed in a register, see note above)
740 punpckhbw mm5, mm7 ; high 4 bytes of p1+1 -> words
742 movq mm6, [ebx+1] ; load 8 bytes of p1+lx+1 again
746 psrlw mm4, 2 ; mm4 >>= 2 per word: divide the sum by 4 (the additions are presumably on omitted lines)
748 movq mm5, mm1 ; reload p2 bytes from mm1 (mm1 presumably saved on an omitted line)
752 pcmpgtw mm7,mm5 ; mm7 := per-word mask [W0..3] of mm7 > mm5 (mm7 presumably a copy of mm4)
754 movq mm6,mm4 ; copy mm4; first step of mm6 := (mm4-mm5) where mm4 > mm5
758 paddw mm0, mm6 ; add mm6 to the accumulator words
760 movq mm6,mm5 ; copy mm5; first step of mask mm6 := (mm5 > mm4)
762 psubw mm5,mm4 ; mm5 := mm5-mm4 per word [W0..3]; kept where mm5 > mm4 (masking presumably on an omitted line)
765 paddw mm0, mm5 ; add mm5 to the accumulator words
769 ;; Second 8 bytes of row
771 ;; First 4 bytes of 8
773 movq mm4, [esi+8] ; load 8 bytes of p1+8; the low 4 are widened below
775 movq mm2, mm4 ; mm2 keeps all 8 bytes for the second half
776 punpcklbw mm4, mm7 ; low 4 bytes of p1+8 -> words (mm7 presumably zero here)
778 movq mm6, [ebx+8] ; load 8 bytes of p1+lx+8
779 movq mm3, mm6 ; mm3 keeps all 8 bytes for the second half
784 movq mm5, [esi+9] ; load 8 bytes of p1+9
785 punpcklbw mm5, mm7 ; low 4 bytes of p1+9 -> words
787 movq mm6, [ebx+9] ; load 8 bytes of p1+lx+9
791 psrlw mm4, 2 ; mm4 >>= 2 per word: divide the sum by 4 (the additions are presumably on omitted lines)
793 movq mm5, [edi+8] ; load 8 bytes of p2+8
798 pcmpgtw mm7,mm5 ; mm7 := per-word mask [W0..3] of mm7 > mm5 (mm7 presumably a copy of mm4)
800 movq mm6,mm4 ; copy mm4; first step of mm6 := (mm4-mm5) where mm4 > mm5
804 paddw mm0, mm6 ; add mm6 to the accumulator words
806 movq mm6,mm5 ; copy mm5; first step of mask mm6 := (mm5 > mm4)
808 psubw mm5,mm4 ; mm5 := mm5-mm4 per word [W0..3]; kept where mm5 > mm4 (masking presumably on an omitted line)
811 paddw mm0, mm5 ; add mm5 to the accumulator words
813 ;; Second 4 bytes of 8
815 movq mm4, mm2 ; reload the saved 8 bytes of p1+8 (second 4 presumably widened on an omitted line)
818 movq mm6, mm3 ; reload the saved 8 bytes of p1+lx+8 (second 4 presumably widened on an omitted line)
822 movq mm5, [esi+9] ; load 8 bytes of p1+9 again
823 punpckhbw mm5, mm7 ; high 4 bytes of p1+9 -> words
825 movq mm6, [ebx+9] ; load 8 bytes of p1+lx+9 again
829 psrlw mm4, 2 ; mm4 >>= 2 per word: divide the sum by 4 (the additions are presumably on omitted lines)
831 movq mm5, mm1 ; reload p2 bytes from mm1 (mm1 presumably saved on an omitted line)
835 pcmpgtw mm7,mm5 ; mm7 := per-word mask [W0..3] of mm7 > mm5 (mm7 presumably a copy of mm4)
837 movq mm6,mm4 ; copy mm4; first step of mm6 := (mm4-mm5) where mm4 > mm5
841 paddw mm0, mm6 ; add mm6 to the accumulator words
843 movq mm6,mm5 ; copy mm5; first step of mask mm6 := (mm5 > mm4)
845 psubw mm5,mm4 ; mm5 := mm5-mm4 per word [W0..3]; kept where mm5 > mm4 (masking presumably on an omitted line)
848 paddw mm0, mm5 ; add mm5 to the accumulator words
852 ;; Loop termination condition... and stepping
855 add esi, edx ; p1 += lx: advance to the next row
860 test ecx, ecx ; set flags from rowsleft (the branch itself is not shown in this listing)
863 ;; Sum the Accumulators
870 movd eax, mm0 ; eax := low 32 bits of mm0 (return value)
879 pop ebp ; restore caller's frame pointer
881 emms ; clear mmx registers
882 ret ; we now return you to your regular programming
; dist22_mmx: accumulates in mm0 the absolute differences between 8-byte rows
; of blk1 and blk2; the visible sequence handles two consecutive rows, stepping
; both pointers by lx after each. The result is returned in eax.
; NOTE(review): the leading listing numbers skip, so the labels (including
; nextrow), the absolute-difference arithmetic, the loop branch and part of
; the epilogue are not visible here -- confirm against the complete source.
887 ; int dist22_mmx(unsigned char *blk1,unsigned char *blk2,int lx,int h);
889 ; eax = p1 (init: blk1)
890 ; ebx = p2 (init: blk2)
891 ; ecx = rowsleft (init: h)
894 ; mm0 = distance accumulators (4 words)
906 push ebp ; save caller's frame pointer
907 mov ebp, esp ; frame pointer, so the arguments can be read at [ebp+8]..[ebp+20]
909 push ebx ; save registers (callee-saves convention in
910 push ecx ; x86 GCC, it seems)
913 pxor mm0, mm0 ; zero accumulators
915 mov eax, [ebp+8] ; eax = blk1 (p1)
916 mov ebx, [ebp+12] ; ebx = blk2 (p2)
917 mov edx, [ebp+16] ; edx = lx (added to both pointers per row)
919 mov ecx, [ebp+20] ; ecx = h (rowsleft)
921 jmp nextrow ; jump to nextrow (label not shown in this listing)
924 movq mm4, [eax] ; load 8 bytes of p1
925 movq mm5, [ebx] ; load 8 bytes of p2
927 movq mm7, mm4 ; copy the p1 bytes; first step of mm5 = abs(*p1-*p2)
930 add eax, edx ; p1 += lx: advance to the next row
933 ;; Add the mm5 bytes to the accumulators
938 add ebx, edx ; p2 += lx: advance to the next row
941 movq mm4, [eax] ; load 8 bytes of p1 (next row)
942 movq mm5, [ebx] ; load 8 bytes of p2 (next row)
944 movq mm7, mm4 ; copy the p1 bytes; first step of mm5 = abs(*p1-*p2)
947 add eax, edx ; p1 += lx: advance to the next row
950 ;; Add the mm5 bytes to the accumulators
953 add ebx, edx ; p2 += lx: advance to the next row
962 ;; Sum the Accumulators
973 movd eax, mm0 ; eax := low 32 bits of mm0 (return value)
978 pop ebx ; restore ebx (pushed in the prologue)
980 pop ebp ; restore caller's frame pointer
982 emms ; clear mmx registers
983 ret ; we now return you to your regular programming
; dist44_mmx: accumulates absolute differences between rows of blk1 and blk2
; in mm0, stepping both pointers by qlx per row; the result is returned in eax.
; NOTE(review): the leading listing numbers skip, so the labels (including
; nextrowqd), any register saves, the absolute-difference arithmetic and the
; loop branch are not visible here -- confirm against the complete source.
990 ; int dist44_mmx(unsigned char *blk1,unsigned char *blk2,int qlx,int qh);
998 ; mm0 = distance accumulator left block p1
999 ; mm1 = distance accumulator right block p1
1001 ; mm3 = right block of p1
1002 ; mm4 = left block of p1
1009 push ebp ; save caller's frame pointer
1010 mov ebp, esp ; frame pointer, so the arguments can be read at [ebp+8]..[ebp+20]
1017 pxor mm0, mm0 ; zero accumulator
1020 mov eax, [ebp+8] ; eax = blk1 (p1)
1021 mov ebx, [ebp+12] ; ebx = blk2 (p2)
1022 mov edx, [ebp+16] ; edx = qlx (added to both pointers per row)
1023 mov esi, [ebp+20] ; esi = qh (rowsleft)
1024 jmp nextrowqd ; jump to nextrowqd (label not shown in this listing)
1029 ;; Beware loop obfuscated by interleaving to try to
1030 ;; hide latencies...
1032 movq mm4, [eax] ; load 8 bytes from p1 (original note: mm4 = first 4 bytes of p1 in words)
1033 movq mm5, [ebx] ; load 8 bytes from p2 (original note: mm5 = 4 bytes of p2 in words)
1043 add eax, edx ; p1 += qlx: advance to the next row
1044 ; punpckhbw mm3, mm2 ; mm3 = 2nd 4 bytes of p1 in words
1047 paddw mm0, mm7 ; add absolute differences (in mm7, presumably computed on omitted lines) to left block accumulators
1053 add ebx, edx ; p2 += qlx: advance to the next row
1057 ; paddw mm1, mm7 ; Add absolute differences to right block accumulators
1063 ;; Sum the accumulators
1071 movd eax, mm0 ; eax := low 32 bits of mm0 (return value)
1090 pop ebp ; restore caller's frame pointer
1092 emms ; clear mmx registers
1093 ret ; we now return you to your regular programming