r1007: Make configure detect and work on amd64.
[cinelerra_cv/mob.git] / mpeg2enc / mblock_sad_mmx.s
bloba58c580a5665d47c4152e4f2fcbcf517e61ce85c
1 ;;;
2 ;;; mblock_sad_mmx.s:
3 ;;;
4 ;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblocks
5 ;;; (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel)
7 ; dist1_* Original Copyright (C) 2000 Chris Atenasio <chris@crud.net>
8 ; Enhancements and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
11 ; This program is free software; you can redistribute it and/or
12 ; modify it under the terms of the GNU General Public License
13 ; as published by the Free Software Foundation; either version 2
14 ; of the License, or (at your option) any later version.
16 ; This program is distributed in the hope that it will be useful,
17 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ; GNU General Public License for more details.
21 ; You should have received a copy of the GNU General Public License
22 ; along with this program; if not, write to the Free Software
23 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
29 global dist1_00_mmx
; -----------------------------------------------------------------------
; dist1_00_mmx: sum of absolute differences (SAD) between two blocks,
; 16 bytes wide and h rows high, without interpolation.
; Syntax: NASM, Intel operand order, 32-bit code.  Arguments are read
; from the stack at [ebp+8] .. [ebp+20].
; Returns: eax = SAD reduced modulo 2^16 (word accumulators wrap and the
;          result is masked with 0xffff).
; All general registers that are pushed below are restored on exit; the
; MMX state is cleared with emms.
; NOTE(review): rowsleft is decremented before it is tested, so h is
; presumably always >= 1 - confirm against the callers.
; -----------------------------------------------------------------------
31 ; int dist1_00_mmx(unsigned char *blk1,unsigned char *blk2,int lx,int h, int distlim);
32 ; N.b. distlim is *ignored* as testing for it is more expensive than the
33 ; occasional saving by aborting the computation early...
34 ; esi = p1 (init: blk1)
35 ; edi = p2 (init: blk2)
36 ; ebx = reserved for distlim (saved and restored, but never loaded)
37 ; ecx = rowsleft (init: h)
38 ; edx = lx;
40 ; mm0 = distance accumulators (4 words)
41 ; mm1 = unused
42 ; mm2 = second 8 bytes of the p1 row
43 ; mm3 = second 8 bytes of the p2 row, then their absolute differences
44 ; mm4 = first 8 bytes of the p1 row (reused for the final reduction)
45 ; mm5 = first 8 bytes of the p2 row, then their absolute differences
46 ; mm6 = 0
47 ; mm7 = temp
50 align 32
51 dist1_00_mmx:
52 push ebp ; save frame pointer
53 mov ebp, esp
55 push ebx ; Save the general registers listed above; they are
56 push ecx ; popped in reverse order before returning
57 push edx ;
58 push esi
59 push edi
61 pxor mm0, mm0 ; zero accumulators
62 pxor mm6, mm6 ; mm6 stays zero: used to widen bytes to words
63 mov esi, [ebp+8] ; get p1
64 mov edi, [ebp+12] ; get p2
65 mov edx, [ebp+16] ; get lx
66 mov ecx, [ebp+20] ; get rowsleft
67 ;mov ebx, [ebp+24] ; distlim
68 jmp nextrowmm00 ; skip the alignment padding in front of the loop
69 align 32
70 nextrowmm00:
71 movq mm4, [esi] ; load first 8 bytes of p1 row
72 movq mm5, [edi] ; load first 8 bytes of p2 row
; Unsigned saturating subtraction in both directions: for every byte one
; of the two results is zero, so adding them yields |p1 - p2|.
74 movq mm7, mm4 ; mm5 = abs(mm4-mm5)
75 psubusb mm7, mm5
76 psubusb mm5, mm4
77 paddb mm5, mm7
79 ;; Add the abs(mm4-mm5) bytes to the accumulators
80 movq mm2, [esi+8] ; load second 8 bytes of p1 row (interleaved)
81 movq mm7,mm5 ; mm7 := low 4 difference bytes widened to words
82 punpcklbw mm7,mm6
83 movq mm3, [edi+8] ; load second 8 bytes of p2 row (interleaved)
84 paddw mm0, mm7
85 punpckhbw mm5,mm6 ; mm5 := high 4 difference bytes widened to words
86 paddw mm0, mm5
88 ;; This is logically where the mm2, mm3 loads would go...
90 movq mm7, mm2 ; mm3 = abs(mm2-mm3)
91 psubusb mm7, mm3
92 psubusb mm3, mm2
93 paddb mm3, mm7
95 ;; Add the abs(mm2-mm3) bytes to the accumulators
96 movq mm7,mm3 ; mm7 := low 4 difference bytes widened to words
97 punpcklbw mm7,mm6
98 punpckhbw mm3,mm6 ; mm3 := high 4 difference bytes widened to words
99 paddw mm0, mm7
101 add esi, edx ; update pointer to next row
102 add edi, edx ; ditto
104 paddw mm0, mm3
108 sub ecx,1 ; rowsleft--; sets ZF when no rows remain
109 jnz near nextrowmm00
111 returnmm00:
113 ;; Sum the Accumulators
114 movq mm5, mm0 ; mm5 := mm0 shifted down so W2,W3 line up with W0,W1
115 psrlq mm5, 32
116 movq mm4, mm0
117 paddw mm4, mm5 ; low two words of mm4 := W0+W2, W1+W3
119 movq mm7, mm4 ; mm7 := mm4 shifted down by one word
120 psrlq mm7, 16
121 paddw mm4, mm7 ; low word of mm4 := W0+W2+W1+W3
122 movd eax, mm4 ; store return value
123 and eax, 0xffff ; only the low word holds the total
125 pop edi
126 pop esi
127 pop edx
128 pop ecx
129 pop ebx
131 pop ebp
133 emms ; clear mmx registers
134 ret
136 ;;; dist1_01_mmx.s: mmx1 optimised 7bit*8 word absolute difference sum
137 ;;; We're reduced to seven bits as otherwise we also have to mess
138 ;;; horribly with carries and signed only comparisons make the code
139 ;;; simply enormous (and probably barely faster than a simple loop).
140 ;;; Since signals with a bona-fide 8bit res will be rare we simply
141 ;;; take the precision hit...
142 ;;; Actually we don't worry about carries from the low-order bits
143 ;;; either so 1/4 of the time we'll be 1 too low...
144 ;;;
145 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
148 ; This program is free software; you can redistribute it and/or
149 ; modify it under the terms of the GNU General Public License
150 ; as published by the Free Software Foundation; either version 2
151 ; of the License, or (at your option) any later version.
153 ; This program is distributed in the hope that it will be useful,
154 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
155 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
156 ; GNU General Public License for more details.
158 ; You should have received a copy of the GNU General Public License
159 ; along with this program; if not, write to the Free Software
160 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
166 global dist1_01_mmx
; -----------------------------------------------------------------------
; dist1_01_mmx: sum of absolute differences between p2 and the horizontal
; half-pel interpolation of p1, for a block 16 bytes wide and h rows high.
; The prediction for each pixel is (p1[x] + p1[x+1]) >> 1, i.e. the
; average is truncated, not rounded.
; Syntax: NASM, Intel operand order, 32-bit code.  Arguments are read
; from the stack at [ebp+8] .. [ebp+20].
; Each row reads 17 bytes of p1 (offsets 0..16) and 16 bytes of p2.
; Returns: eax = SAD reduced modulo 2^16.
; All general registers pushed below are restored; MMX state is cleared
; with emms.
; NOTE(review): rowsleft is decremented before it is tested, so h is
; presumably always >= 1 - confirm against the callers.
; -----------------------------------------------------------------------
168 ; int dist1_01_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
170 ; esi = p1 (init: blk1)
171 ; edi = p2 (init: blk2)
172 ; ecx = rowsleft (init: h)
173 ; edx = lx;
175 ; mm0 = distance accumulators (4 words)
176 ; mm1 = bytes p2
177 ; mm2 = bytes p1
178 ; mm3 = bytes p1+1
179 ; mm4 = temp 4 bytes in words interpolating p1, p1+1
180 ; mm5 = temp 4 bytes in words from p2
181 ; mm6 = temp: word operand / comparison mask (p2 > prediction)
182 ; mm7 = zero for unpacking, then comparison mask (prediction > p2)
185 align 32
186 dist1_01_mmx:
187 push ebp ; save frame pointer
188 mov ebp, esp ; so the arguments can be addressed via ebp
190 push ebx ; Save the general registers; they are popped in
191 push ecx ; reverse order before returning
192 push edx ;
193 push esi
194 push edi
196 pxor mm0, mm0 ; zero accumulators
198 mov esi, [ebp+8] ; get p1
199 mov edi, [ebp+12] ; get p2
200 mov edx, [ebp+16] ; get lx
201 mov ecx, [ebp+20] ; rowsleft := h
202 jmp nextrowmm01 ; skip the alignment padding in front of the loop
203 align 32
204 nextrowmm01:
207 ;; First 8 bytes of row
210 ;; First 4 bytes of 8
212 movq mm4, [esi] ; mm4 := 8 bytes of p1
213 pxor mm7, mm7 ; mm7 := 0 for widening bytes to words
214 movq mm2, mm4 ; mm2 records all 8 bytes
215 punpcklbw mm4, mm7 ; First 4 bytes p1 in Words...
217 movq mm6, [esi+1] ; mm6 := 8 bytes of p1+1
218 movq mm3, mm6 ; mm3 records all 8 bytes
219 punpcklbw mm6, mm7
220 paddw mm4, mm6 ; mm4 := p1 + (p1+1), first 4 bytes, in words
221 psrlw mm4, 1 ; mm4 := truncated average (the prediction)
223 movq mm5, [edi] ; mm5 := 8 bytes of p2
224 movq mm1, mm5 ; mm1 records all 8 bytes
225 punpcklbw mm5, mm7 ; First 4 bytes p2 in words
; |mm4 - mm5| is built from two one-sided differences: each is masked so
; that only the lanes where it is positive reach the accumulator.
227 movq mm7,mm4
228 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
230 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
231 psubw mm6,mm5
232 pand mm6, mm7
234 paddw mm0, mm6 ; Add to accumulator
236 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
237 pcmpgtw mm6,mm4
238 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
239 pand mm5, mm6
241 paddw mm0, mm5 ; Add to accumulator
243 ;; Second 4 bytes of 8
245 movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words
246 pxor mm7, mm7 ; mm7 was used as a mask above: zero it again
247 punpckhbw mm4, mm7
248 movq mm6, mm3 ; mm6 := Second 4 bytes p1+1 in words
249 punpckhbw mm6, mm7
251 paddw mm4, mm6 ; mm4 := Second 4 interpolated bytes in words
252 psrlw mm4, 1
254 movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words
255 punpckhbw mm5, mm7
257 movq mm7,mm4
258 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
260 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
261 psubw mm6,mm5
262 pand mm6, mm7
264 paddw mm0, mm6 ; Add to accumulator
266 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
267 pcmpgtw mm6,mm4
268 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
269 pand mm5, mm6
271 paddw mm0, mm5 ; Add to accumulator
275 ;; Second 8 bytes of row
277 ;; First 4 bytes of 8
279 movq mm4, [esi+8] ; mm4 := 8 bytes of p1+8
280 pxor mm7, mm7
281 movq mm2, mm4 ; mm2 records all 8 bytes
282 punpcklbw mm4, mm7 ; First 4 bytes p1+8 in Words...
284 movq mm6, [esi+9] ; mm6 := 8 bytes of p1+9
285 movq mm3, mm6 ; mm3 records all 8 bytes
286 punpcklbw mm6, mm7
287 paddw mm4, mm6 ; mm4 := First 4 bytes interpolated in words
288 psrlw mm4, 1
290 movq mm5, [edi+8] ; mm5 := 8 bytes of p2+8
291 movq mm1, mm5 ; mm1 records all 8 bytes
292 punpcklbw mm5, mm7 ; First 4 bytes p2+8 in words
294 movq mm7,mm4
295 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
297 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
298 psubw mm6,mm5
299 pand mm6, mm7
301 paddw mm0, mm6 ; Add to accumulator
303 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
304 pcmpgtw mm6,mm4
305 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
306 pand mm5, mm6
308 paddw mm0, mm5 ; Add to accumulator
310 ;; Second 4 bytes of 8
312 movq mm4, mm2 ; mm4 := Second 4 bytes p1+8 in words
313 pxor mm7, mm7
314 punpckhbw mm4, mm7
315 movq mm6, mm3 ; mm6 := Second 4 bytes p1+9 in words
316 punpckhbw mm6, mm7
318 paddw mm4, mm6 ; mm4 := Second 4 interpolated bytes in words
319 psrlw mm4, 1
321 movq mm5, mm1 ; mm5:= second 4 bytes of p2+8 in words
322 punpckhbw mm5, mm7
324 movq mm7,mm4
325 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
327 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
328 psubw mm6,mm5
329 pand mm6, mm7
331 paddw mm0, mm6 ; Add to accumulator
333 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
334 pcmpgtw mm6,mm4
335 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
336 pand mm5, mm6
338 paddw mm0, mm5 ; Add to accumulator
342 ;; Loop termination condition... and stepping
345 add esi, edx ; update pointer to next row
346 add edi, edx ; ditto
348 sub ecx,1 ; rowsleft--; sub itself sets ZF when it reaches zero
350 jnz near nextrowmm01 ; loop while rows remain
353 ;; Sum the Accumulators
354 movq mm4, mm0
355 psrlq mm4, 32
356 paddw mm0, mm4 ; low two words := W0+W2, W1+W3
357 movq mm6, mm0
358 psrlq mm6, 16
359 paddw mm0, mm6 ; low word := W0+W1+W2+W3
360 movd eax, mm0 ; store return value
361 and eax, 0xffff ; only the low word holds the total
363 pop edi
364 pop esi
365 pop edx
366 pop ecx
367 pop ebx
369 pop ebp ; restore frame pointer
371 emms ; clear mmx registers
372 ret ; we now return you to your regular programming
374 ;;; dist1_10_mmx.s: mmx1 optimised 7bit*8 word absolute difference sum
375 ;;; We're reduced to seven bits as otherwise we also have to mess
376 ;;; horribly with carries and signed only comparisons make the code
377 ;;; simply enormous (and probably barely faster than a simple loop).
378 ;;; Since signals with a bona-fide 8bit res will be rare we simply
379 ;;; take the precision hit...
380 ;;; Actually we don't worry about carries from the low-order bits
381 ;;; either so 1/4 of the time we'll be 1 too low...
382 ;;;
383 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
386 ; This program is free software; you can redistribute it and/or
387 ; modify it under the terms of the GNU General Public License
388 ; as published by the Free Software Foundation; either version 2
389 ; of the License, or (at your option) any later version.
391 ; This program is distributed in the hope that it will be useful,
392 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
393 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
394 ; GNU General Public License for more details.
396 ; You should have received a copy of the GNU General Public License
397 ; along with this program; if not, write to the Free Software
398 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
404 global dist1_10_mmx
; -----------------------------------------------------------------------
; dist1_10_mmx: sum of absolute differences between p2 and the vertical
; half-pel interpolation of p1, for a block 16 bytes wide and h rows high.
; The prediction for each pixel is (p1[x] + p1[x+lx]) >> 1, i.e. the
; average is truncated, not rounded.
; Syntax: NASM, Intel operand order, 32-bit code.  Arguments are read
; from the stack at [ebp+8] .. [ebp+20].
; Rows 0..h of p1 are read (one more row than h) and rows 0..h-1 of p2.
; Returns: eax = SAD reduced modulo 2^16.
; All general registers pushed below are restored; MMX state is cleared
; with emms.
; NOTE(review): rowsleft is decremented before it is tested, so h is
; presumably always >= 1 - confirm against the callers.
; -----------------------------------------------------------------------
406 ; int dist1_10_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
408 ; esi = p1 (init: blk1)
409 ; edi = p2 (init: blk2)
410 ; ebx = p1+lx
411 ; ecx = rowsleft (init: h)
412 ; edx = lx;
414 ; mm0 = distance accumulators (4 words)
415 ; mm1 = bytes p2
416 ; mm2 = bytes p1
417 ; mm3 = bytes p1+lx
418 ; mm4 = temp 4 bytes in words interpolating p1, p1+lx
419 ; mm5 = temp 4 bytes in words from p2
420 ; mm6 = temp: word operand / comparison mask (p2 > prediction)
421 ; mm7 = zero for unpacking, then comparison mask (prediction > p2)
424 align 32
425 dist1_10_mmx:
426 push ebp ; save frame pointer
427 mov ebp, esp ; so the arguments can be addressed via ebp
429 push ebx ; Save the general registers; they are popped in
430 push ecx ; reverse order before returning
431 push edx ;
432 push esi
433 push edi
435 pxor mm0, mm0 ; zero accumulators
437 mov esi, [ebp+8] ; get p1
438 mov edi, [ebp+12] ; get p2
439 mov edx, [ebp+16] ; get lx
440 mov ecx, [ebp+20] ; rowsleft := h
441 mov ebx, esi
442 add ebx, edx ; ebx := p1 + lx (the row below p1)
443 jmp nextrowmm10 ; skip the alignment padding in front of the loop
444 align 32
445 nextrowmm10:
448 ;; First 8 bytes of row
451 ;; First 4 bytes of 8
453 movq mm4, [esi] ; mm4 := 8 bytes of p1
454 pxor mm7, mm7 ; mm7 := 0 for widening bytes to words
455 movq mm2, mm4 ; mm2 records all 8 bytes
456 punpcklbw mm4, mm7 ; First 4 bytes p1 in Words...
458 movq mm6, [ebx] ; mm6 := 8 bytes of p1+lx
459 movq mm3, mm6 ; mm3 records all 8 bytes
460 punpcklbw mm6, mm7
461 paddw mm4, mm6 ; mm4 := p1 + (p1+lx), first 4 bytes, in words
462 psrlw mm4, 1 ; mm4 := truncated average (the prediction)
464 movq mm5, [edi] ; mm5 := 8 bytes of p2
465 movq mm1, mm5 ; mm1 records all 8 bytes
466 punpcklbw mm5, mm7 ; First 4 bytes p2 in words
; |mm4 - mm5| is built from two one-sided differences: each is masked so
; that only the lanes where it is positive reach the accumulator.
468 movq mm7,mm4
469 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
471 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
472 psubw mm6,mm5
473 pand mm6, mm7
475 paddw mm0, mm6 ; Add to accumulator
477 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
478 pcmpgtw mm6,mm4
479 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
480 pand mm5, mm6
482 paddw mm0, mm5 ; Add to accumulator
484 ;; Second 4 bytes of 8
486 movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words
487 pxor mm7, mm7 ; mm7 was used as a mask above: zero it again
488 punpckhbw mm4, mm7
489 movq mm6, mm3 ; mm6 := Second 4 bytes p1+lx in words
490 punpckhbw mm6, mm7
492 paddw mm4, mm6 ; mm4 := Second 4 interpolated bytes in words
493 psrlw mm4, 1
495 movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words
496 punpckhbw mm5, mm7
498 movq mm7,mm4
499 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
501 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
502 psubw mm6,mm5
503 pand mm6, mm7
505 paddw mm0, mm6 ; Add to accumulator
507 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
508 pcmpgtw mm6,mm4
509 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
510 pand mm5, mm6
512 paddw mm0, mm5 ; Add to accumulator
516 ;; Second 8 bytes of row
518 ;; First 4 bytes of 8
520 movq mm4, [esi+8] ; mm4 := 8 bytes of p1+8
521 pxor mm7, mm7
522 movq mm2, mm4 ; mm2 records all 8 bytes
523 punpcklbw mm4, mm7 ; First 4 bytes p1+8 in Words...
525 movq mm6, [ebx+8] ; mm6 := 8 bytes of p1+lx+8
526 movq mm3, mm6 ; mm3 records all 8 bytes
527 punpcklbw mm6, mm7
528 paddw mm4, mm6 ; mm4 := First 4 bytes interpolated in words
529 psrlw mm4, 1
531 movq mm5, [edi+8] ; mm5 := 8 bytes of p2+8
532 movq mm1, mm5 ; mm1 records all 8 bytes
533 punpcklbw mm5, mm7 ; First 4 bytes p2+8 in words
535 movq mm7,mm4
536 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
538 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
539 psubw mm6,mm5
540 pand mm6, mm7
542 paddw mm0, mm6 ; Add to accumulator
544 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
545 pcmpgtw mm6,mm4
546 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
547 pand mm5, mm6
549 paddw mm0, mm5 ; Add to accumulator
551 ;; Second 4 bytes of 8
553 movq mm4, mm2 ; mm4 := Second 4 bytes p1+8 in words
554 pxor mm7, mm7
555 punpckhbw mm4, mm7
556 movq mm6, mm3 ; mm6 := Second 4 bytes p1+lx+8 in words
557 punpckhbw mm6, mm7
559 paddw mm4, mm6 ; mm4 := Second 4 interpolated bytes in words
560 psrlw mm4, 1
562 movq mm5, mm1 ; mm5:= second 4 bytes of p2+8 in words
563 punpckhbw mm5, mm7
565 movq mm7,mm4
566 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
568 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
569 psubw mm6,mm5
570 pand mm6, mm7
572 paddw mm0, mm6 ; Add to accumulator
574 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
575 pcmpgtw mm6,mm4
576 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
577 pand mm5, mm6
579 paddw mm0, mm5 ; Add to accumulator
583 ;; Loop termination condition... and stepping
586 add esi, edx ; update pointer to next row
587 add edi, edx ; ditto
588 add ebx, edx ; keep ebx one row below esi
590 sub ecx,1 ; rowsleft--; sub itself sets ZF when it reaches zero
592 jnz near nextrowmm10 ; loop while rows remain
594 ;; Sum the Accumulators
595 movq mm4, mm0
596 psrlq mm4, 32
597 paddw mm0, mm4 ; low two words := W0+W2, W1+W3
598 movq mm6, mm0
599 psrlq mm6, 16
600 paddw mm0, mm6 ; low word := W0+W1+W2+W3
601 movd eax, mm0 ; store return value
602 and eax, 0xffff ; only the low word holds the total
605 pop edi
606 pop esi
607 pop edx
608 pop ecx
609 pop ebx
611 pop ebp ; restore frame pointer
613 emms ; clear mmx registers
614 ret ; we now return you to your regular programming
616 ;;; dist1_11_mmx.s:
617 ;;;
618 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
621 ; This program is free software; you can redistribute it and/or
622 ; modify it under the terms of the GNU General Public License
623 ; as published by the Free Software Foundation; either version 2
624 ; of the License, or (at your option) any later version.
626 ; This program is distributed in the hope that it will be useful,
627 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
628 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
629 ; GNU General Public License for more details.
631 ; You should have received a copy of the GNU General Public License
632 ; along with this program; if not, write to the Free Software
633 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
639 global dist1_11_mmx
; -----------------------------------------------------------------------
; dist1_11_mmx: sum of absolute differences between p2 and the
; horizontal+vertical half-pel interpolation of p1, for a block 16 bytes
; wide and h rows high.  The prediction for each pixel is
; (p1[x] + p1[x+1] + p1[x+lx] + p1[x+lx+1]) >> 2 (truncated, not rounded).
; Syntax: NASM, Intel operand order, 32-bit code.  Arguments are read
; from the stack at [ebp+8] .. [ebp+20].
; Rows 0..h of p1 are read, 17 bytes each; rows 0..h-1 of p2, 16 bytes.
; Returns: eax = SAD reduced modulo 2^16.
; All general registers pushed below are restored; MMX state is cleared
; with emms.
; NOTE(review): rowsleft is decremented before it is tested, so h is
; presumably always >= 1 - confirm against the callers.
; -----------------------------------------------------------------------
641 ; int dist1_11_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
643 ; esi = p1 (init: blk1)
644 ; edi = p2 (init: blk2)
645 ; ebx = p1+lx
646 ; ecx = rowsleft (init: h)
647 ; edx = lx;
649 ; mm0 = distance accumulators (4 words)
650 ; mm1 = bytes p2
651 ; mm2 = bytes p1
652 ; mm3 = bytes p1+lx
653 ; I'd love to find someplace to stash p1+1 and p1+lx+1's bytes
654 ; but I don't think thats going to happen in iA32-land...
; (they are therefore loaded from memory twice, once per 4-byte half)
655 ; mm4 = temp 4 bytes in words interpolating p1, p1+1, p1+lx, p1+lx+1
656 ; mm5 = temp 4 bytes in words from p1+1, then from p2
657 ; mm6 = temp: word operand / comparison mask (p2 > prediction)
658 ; mm7 = zero for unpacking, then comparison mask (prediction > p2)
661 align 32
662 dist1_11_mmx:
663 push ebp ; save frame pointer
664 mov ebp, esp ; so the arguments can be addressed via ebp
666 push ebx ; Save the general registers; they are popped in
667 push ecx ; reverse order before returning
668 push edx ;
669 push esi
670 push edi
672 pxor mm0, mm0 ; zero accumulators
674 mov esi, [ebp+8] ; get p1
675 mov edi, [ebp+12] ; get p2
676 mov edx, [ebp+16] ; get lx
677 mov ecx, [ebp+20] ; rowsleft := h
678 mov ebx, esi
679 add ebx, edx ; ebx := p1 + lx (the row below p1)
680 jmp nextrowmm11 ; skip the alignment padding in front of the loop
681 align 32
682 nextrowmm11:
685 ;; First 8 bytes of row
688 ;; First 4 bytes of 8
690 movq mm4, [esi] ; mm4 := 8 bytes of p1
691 pxor mm7, mm7 ; mm7 := 0 for widening bytes to words
692 movq mm2, mm4 ; mm2 records all 8 bytes
693 punpcklbw mm4, mm7 ; First 4 bytes p1 in Words...
695 movq mm6, [ebx] ; mm6 := 8 bytes of p1+lx
696 movq mm3, mm6 ; mm3 records all 8 bytes
697 punpcklbw mm6, mm7
698 paddw mm4, mm6 ; mm4 := p1 + (p1+lx)
701 movq mm5, [esi+1] ; mm5 := 8 bytes of p1+1
702 punpcklbw mm5, mm7 ; First 4 bytes p1+1 in Words...
703 paddw mm4, mm5 ; mm4 += p1+1
704 movq mm6, [ebx+1] ; mm6 := 8 bytes of p1+lx+1
705 punpcklbw mm6, mm7
706 paddw mm4, mm6 ; mm4 += p1+lx+1
708 psrlw mm4, 2 ; mm4 := First 4 bytes interpolated in words
710 movq mm5, [edi] ; mm5 := 8 bytes of p2
711 movq mm1, mm5 ; mm1 records all 8 bytes
712 punpcklbw mm5, mm7 ; First 4 bytes p2 in words
; |mm4 - mm5| is built from two one-sided differences: each is masked so
; that only the lanes where it is positive reach the accumulator.
714 movq mm7,mm4
715 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
717 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
718 psubw mm6,mm5
719 pand mm6, mm7
721 paddw mm0, mm6 ; Add to accumulator
723 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
724 pcmpgtw mm6,mm4
725 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
726 pand mm5, mm6
728 paddw mm0, mm5 ; Add to accumulator
730 ;; Second 4 bytes of 8
732 movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words
733 pxor mm7, mm7 ; mm7 was used as a mask above: zero it again
734 punpckhbw mm4, mm7
735 movq mm6, mm3 ; mm6 := Second 4 bytes p1+lx in words
736 punpckhbw mm6, mm7
737 paddw mm4, mm6 ; mm4 := p1 + (p1+lx)
739 movq mm5, [esi+1] ; mm5 := 8 bytes of p1+1 (reloaded)
740 punpckhbw mm5, mm7 ; Second 4 bytes p1+1 in Words...
741 paddw mm4, mm5 ; mm4 += p1+1
742 movq mm6, [ebx+1] ; mm6 := 8 bytes of p1+lx+1 (reloaded)
743 punpckhbw mm6, mm7
744 paddw mm4, mm6 ; mm4 += p1+lx+1
746 psrlw mm4, 2 ; mm4 := Second 4 bytes interpolated in words
748 movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words
749 punpckhbw mm5, mm7
751 movq mm7,mm4
752 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
754 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
755 psubw mm6,mm5
756 pand mm6, mm7
758 paddw mm0, mm6 ; Add to accumulator
760 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
761 pcmpgtw mm6,mm4
762 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
763 pand mm5, mm6
765 paddw mm0, mm5 ; Add to accumulator
769 ;; Second 8 bytes of row
771 ;; First 4 bytes of 8
773 movq mm4, [esi+8] ; mm4 := 8 bytes of p1+8
774 pxor mm7, mm7
775 movq mm2, mm4 ; mm2 records all 8 bytes
776 punpcklbw mm4, mm7 ; First 4 bytes p1+8 in Words...
778 movq mm6, [ebx+8] ; mm6 := 8 bytes of p1+lx+8
779 movq mm3, mm6 ; mm3 records all 8 bytes
780 punpcklbw mm6, mm7
781 paddw mm4, mm6 ; mm4 := (p1+8) + (p1+lx+8)
784 movq mm5, [esi+9] ; mm5 := 8 bytes of p1+9
785 punpcklbw mm5, mm7 ; First 4 bytes p1+9 in Words...
786 paddw mm4, mm5 ; mm4 += p1+9
787 movq mm6, [ebx+9] ; mm6 := 8 bytes of p1+lx+9
788 punpcklbw mm6, mm7
789 paddw mm4, mm6 ; mm4 += p1+lx+9
791 psrlw mm4, 2 ; mm4 := First 4 bytes interpolated in words
793 movq mm5, [edi+8] ; mm5 := 8 bytes of p2+8
794 movq mm1, mm5 ; mm1 records all 8 bytes
795 punpcklbw mm5, mm7 ; First 4 bytes p2+8 in words
797 movq mm7,mm4
798 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
800 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
801 psubw mm6,mm5
802 pand mm6, mm7
804 paddw mm0, mm6 ; Add to accumulator
806 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
807 pcmpgtw mm6,mm4
808 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
809 pand mm5, mm6
811 paddw mm0, mm5 ; Add to accumulator
813 ;; Second 4 bytes of 8
815 movq mm4, mm2 ; mm4 := Second 4 bytes p1+8 in words
816 pxor mm7, mm7
817 punpckhbw mm4, mm7
818 movq mm6, mm3 ; mm6 := Second 4 bytes p1+lx+8 in words
819 punpckhbw mm6, mm7
820 paddw mm4, mm6 ; mm4 := (p1+8) + (p1+lx+8)
822 movq mm5, [esi+9] ; mm5 := 8 bytes of p1+9 (reloaded)
823 punpckhbw mm5, mm7 ; Second 4 bytes p1+9 in Words...
824 paddw mm4, mm5 ; mm4 += p1+9
825 movq mm6, [ebx+9] ; mm6 := 8 bytes of p1+lx+9 (reloaded)
826 punpckhbw mm6, mm7
827 paddw mm4, mm6 ; mm4 += p1+lx+9
829 psrlw mm4, 2 ; mm4 := Second 4 bytes interpolated in words
831 movq mm5, mm1 ; mm5:= second 4 bytes of p2+8 in words
832 punpckhbw mm5, mm7
834 movq mm7,mm4
835 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
837 movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
838 psubw mm6,mm5
839 pand mm6, mm7
841 paddw mm0, mm6 ; Add to accumulator
843 movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
844 pcmpgtw mm6,mm4
845 psubw mm5,mm4 ; mm5 := [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
846 pand mm5, mm6
848 paddw mm0, mm5 ; Add to accumulator
852 ;; Loop termination condition... and stepping
855 add esi, edx ; update pointer to next row
856 add edi, edx ; ditto
857 add ebx, edx ; keep ebx one row below esi
859 sub ecx,1 ; rowsleft--; sub itself sets ZF when it reaches zero
861 jnz near nextrowmm11 ; loop while rows remain
863 ;; Sum the Accumulators
864 movq mm4, mm0
865 psrlq mm4, 32
866 paddw mm0, mm4 ; low two words := W0+W2, W1+W3
867 movq mm6, mm0
868 psrlq mm6, 16
869 paddw mm0, mm6 ; low word := W0+W1+W2+W3
870 movd eax, mm0 ; store return value
871 and eax, 0xffff ; only the low word holds the total
873 pop edi
874 pop esi
875 pop edx
876 pop ecx
877 pop ebx
879 pop ebp ; restore frame pointer
881 emms ; clear mmx registers
882 ret ; we now return you to your regular programming
885 global dist22_mmx
; -----------------------------------------------------------------------
; dist22_mmx: sum of absolute differences between two blocks that are
; 8 bytes wide and h rows high.  Two rows are processed per loop
; iteration and rowsleft is decreased by 2 each time.
; Syntax: NASM, Intel operand order, 32-bit code.  Arguments are read
; from the stack at [ebp+8] .. [ebp+20].
; Returns: eax = SAD reduced modulo 2^16.
; ebx, ecx and edx are restored on exit; MMX state is cleared with emms.
; NOTE(review): the loop only ends when rowsleft hits exactly zero, so h
; is presumably always even and >= 2 - confirm against the callers.
; -----------------------------------------------------------------------
887 ; int dist22_mmx(unsigned char *blk1,unsigned char *blk2,int lx,int h);
889 ; eax = p1 (init: blk1)
890 ; ebx = p2 (init: blk2)
891 ; ecx = rowsleft (init: h)
892 ; edx = lx;
894 ; mm0 = distance accumulators (4 words)
895 ; mm1 = temp (final reduction only)
896 ; mm2 = temp (final reduction only)
897 ; mm3 = temp (final reduction only)
898 ; mm4 = 8 bytes of the p1 row
899 ; mm5 = 8 bytes of the p2 row, then their absolute differences
900 ; mm6 = 0
901 ; mm7 = temp
904 align 32
905 dist22_mmx:
906 push ebp ; save frame pointer
907 mov ebp, esp ; so the arguments can be addressed via ebp
909 push ebx ; Save the general registers used below; they are
910 push ecx ; popped in reverse order before returning
911 push edx ;
913 pxor mm0, mm0 ; zero accumulators
914 pxor mm6, mm6 ; mm6 stays zero: used to widen bytes to words
915 mov eax, [ebp+8] ; get p1
916 mov ebx, [ebp+12] ; get p2
917 mov edx, [ebp+16] ; get lx
919 mov ecx, [ebp+20] ; get rowsleft
921 jmp nextrow ; skip the alignment padding in front of the loop
922 align 32
923 nextrow:
924 movq mm4, [eax] ; load 8 bytes of p1
925 movq mm5, [ebx] ; load 8 bytes of p2
; Unsigned saturating subtraction in both directions: for every byte one
; of the two results is zero, so adding them yields |*p1 - *p2|.
927 movq mm7, mm4 ; mm5 = abs(*p1-*p2)
928 psubusb mm7, mm5
929 psubusb mm5, mm4
930 add eax, edx ; update pointer to next row
931 paddb mm5,mm7
933 ;; Add the mm5 bytes to the accumulators
934 movq mm7,mm5
935 punpcklbw mm7,mm6 ; low 4 difference bytes widened to words
936 paddw mm0, mm7
937 punpckhbw mm5,mm6 ; high 4 difference bytes widened to words
938 add ebx, edx ; update pointer to next row
939 paddw mm0, mm5
941 movq mm4, [eax] ; load 8 bytes of p1 (next row)
942 movq mm5, [ebx] ; load 8 bytes of p2 (next row)
944 movq mm7, mm4 ; mm5 = abs(*p1-*p2)
945 psubusb mm7, mm5
946 psubusb mm5, mm4
947 add eax, edx ; update pointer to next row
948 paddb mm5,mm7
950 ;; Add the mm5 bytes to the accumulators
951 movq mm7,mm5
952 punpcklbw mm7,mm6
953 add ebx, edx ; update pointer to next row
954 paddw mm0, mm7
955 punpckhbw mm5,mm6
956 sub ecx,2 ; two rows done; sets ZF when no rows remain
957 paddw mm0, mm5 ; MMX add: leaves the flags from sub untouched
960 jnz nextrow
962 ;; Sum the Accumulators
963 movq mm1, mm0
964 psrlq mm1, 16 ; low word of mm1 := W1
965 movq mm2, mm0
966 psrlq mm2, 32 ; low word of mm2 := W2
967 movq mm3, mm0
968 psrlq mm3, 48 ; low word of mm3 := W3
969 paddw mm0, mm1
970 paddw mm2, mm3
971 paddw mm0, mm2 ; low word of mm0 := W0+W1+W2+W3
973 movd eax, mm0 ; store return value
974 and eax, 0xffff ; only the low word holds the total
976 pop edx ; pop pop
977 pop ecx ; fizz fizz
978 pop ebx ; ia86 needs a fizz instruction
980 pop ebp ; restore frame pointer
982 emms ; clear mmx registers
983 ret ; we now return you to your regular programming
988 global dist44_mmx
; -----------------------------------------------------------------------
; dist44_mmx: sum of absolute differences between two blocks that are
; 4 bytes wide and qh rows high.
; Syntax: NASM, Intel operand order, 32-bit code.  Arguments are read
; from the stack at [ebp+8] .. [ebp+20].
; Exactly 4 bytes per row are loaded from each block (movd), so nothing
; beyond the 4 bytes that take part in the sum is read.
; Returns: eax = SAD reduced modulo 2^16.
; ebx, ecx, edx and esi are restored on exit; MMX state is cleared with
; emms.
; An earlier variant also accumulated a second ("right") block in mm1 and
; returned that sum in the upper 16 bits of eax; that path was disabled
; and only the left-block sum is computed.
; NOTE(review): rowsleft is decremented before it is tested, so qh is
; presumably always >= 1 - confirm against the callers.
; -----------------------------------------------------------------------
990 ; int dist44_mmx(unsigned char *blk1,unsigned char *blk2,int qlx,int qh);
992 ; eax = p1
993 ; ebx = p2
994 ; ecx = unused (saved and restored only)
995 ; edx = qlx;
996 ; esi = rowsleft
998 ; mm0 = distance accumulators (4 words)
1000 ; mm2 = 0
1002 ; mm4 = 4 bytes of p1 widened to words
1003 ; mm5 = 4 bytes of p2 widened to words
1004 ; mm6 = temp
1005 ; mm7 = temp
1007 align 32
1008 dist44_mmx:
1009 push ebp ; save frame pointer
1010 mov ebp, esp ; so the arguments can be addressed via ebp
1012 push ebx
1013 push ecx
1014 push edx
1015 push esi
1017 pxor mm0, mm0 ; zero accumulators
1019 pxor mm2, mm2 ; mm2 stays zero: used to widen bytes to words
1020 mov eax, [ebp+8] ; get p1
1021 mov ebx, [ebp+12] ; get p2
1022 mov edx, [ebp+16] ; get qlx
1023 mov esi, [ebp+20] ; get rowsleft
1024 jmp nextrowqd ; skip the alignment padding in front of the loop
1025 align 32
1026 nextrowqd:
1029 ;; Beware loop obfuscated by interleaving to try to
1030 ;; hide latencies...
1032 movd mm4, [eax] ; mm4 = the 4 bytes of this p1 row (upper half zeroed)
1033 movd mm5, [ebx] ; mm5 = the 4 bytes of this p2 row (upper half zeroed)
1035 punpcklbw mm4, mm2 ; widen p1 bytes to words
1036 punpcklbw mm5, mm2 ; widen p2 bytes to words
; Unsigned saturating subtraction in both directions: for every word one
; of the two results is zero, so their sum is |p1 - p2|.
1038 movq mm7, mm4
1039 movq mm6, mm5
1040 psubusw mm7, mm5
1041 psubusw mm6, mm4
1043 add eax, edx ; update a pointer to next row
1046 paddw mm7, mm6
1047 paddw mm0, mm7 ; Add absolute differences to the accumulators
1053 add ebx, edx ; update a pointer to next row
1054 sub esi, 1 ; rowsleft--; sets ZF when no rows remain
1061 jnz nextrowqd
1063 ;; Sum the accumulators
1065 movq mm4, mm0
1066 psrlq mm4, 32
1067 paddw mm0, mm4 ; low two words := W0+W2, W1+W3
1068 movq mm6, mm0
1069 psrlq mm6, 16
1070 paddw mm0, mm6 ; low word := W0+W1+W2+W3
1071 movd eax, mm0 ; store return value
1081 and eax, 0xffff ; only the low word holds the total
1085 pop esi
1086 pop edx
1087 pop ecx
1088 pop ebx
1090 pop ebp ; restore frame pointer
1092 emms ; clear mmx registers
1093 ret ; we now return you to your regular programming