4 ;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblocks
5 ;;;
(interpolated
, 1-pel
, 2*2 sub-sampled pel
and 4*4 sub-sampled pel
)
7 ; dist1_
* Original Copyright
(C
) 2000 Chris Atenasio
<chris@crud.net
>
8 ; Enhancements
and rest Copyright
(C
) 2000 Andrew Stevens
<as@comlab.ox.ac.uk
>
10 ;; Yes
, I tried prefetch-ing. It makes no difference
or makes
14 ; This program is free software; you can redistribute it
and/or
15 ; modify it under the terms of the GNU General Public License
16 ; as published by the Free Software Foundation; either version
2
17 ; of the License
, or (at your option
) any later version.
19 ; This program is distributed in the hope that it will
be useful
,
20 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ; MERCHANTABILITY
or FITNESS FOR
A PARTICULAR PURPOSE. See the
22 ; GNU General Public License for more details.
24 ; You should have received
a copy of the GNU General Public License
25 ; along with this program; if
not, write to the Free Software
26 ; Foundation
, Inc.
, 59 Temple Place
- Suite
330, Boston
, MA
02111-1307, USA.
34 ; int dist1_00
(char
*blk1
,char
*blk2
,int
lx,int h
,int distlim
);
35 ; distlim unused
- costs more to check than the savings of
36 ; aborting the computation early from time to time.
..
42 ; mm0
= distance accumulator
53 push ebp ; save frame pointer
60 pxor mm0
, mm0 ; zero accumulator
62 mov eax
, [ebp+
8] ; get p1
64 mov ebx
, [ebp+
12] ; get p2
65 mov edx
, [ebp+
16] ; get
lx
67 mov ecx
, [ebp+
20] ; get rowsleft
71 movq mm4
, [eax
] ; load first
8 bytes of p1
(row
1)
72 psadbw mm4
, [ebx
] ; compare to first
8 bytes of p2
(row
1)
73 movq mm5
, [eax+
8] ; load next
8 bytes of p1
(row
1)
74 add eax
, edx ; update pointer to next row
75 paddd mm0
, mm4 ; accumulate difference
77 psadbw mm5
, [ebx+
8] ; compare to next
8 bytes of p2
(row
1)
79 paddd mm0
, mm5 ; accumulate difference
82 movq mm6
, [eax
] ; load first
8 bytes of p1
(row
2)
83 psadbw mm6
, [ebx
] ; compare to first
8 bytes of p2
(row
2)
84 movq mm4
, [eax+
8] ; load next
8 bytes of p1
(row
2)
85 add eax
, edx ; update pointer to next row
86 paddd mm0
, mm6 ; accumulate difference
88 psadbw mm4
, [ebx+
8] ; compare to next
8 bytes of p2
(row
2)
90 paddd mm0
, mm4 ; accumulate difference
92 ;psubd mm2
, mm3 ; decrease rowsleft
93 ;movq mm5
, mm1 ; copy distlim
94 ;pcmpgtd mm5
, mm0 ; distlim
> dist?
95 ;pand mm2
, mm5 ; mask rowsleft with answer
96 ;movd ecx
, mm2 ; move rowsleft to ecx
98 ;
add eax
, edx ; update pointer to next row
101 ;test ecx
, ecx ; check rowsleft
105 movd eax
, mm0 ; store return value
118 global dist1_00_Ammxe
119 ;; This is
a special version that only does aligned accesses.
..
120 ;; Wonder if it
'll make it faster on a P-III
121 ;; ANSWER: NO, it's slower, hence no longer used.
123 ; int dist1_00(char *blk1,char *blk2,int lx,int h,int distlim);
124 ; distlim unused - costs more to check than the savings of
125 ; aborting the computation early from time to time...
131 ; mm0 = distance accumulator
133 ; mm2 = right shift to adjust for mis-align
134 ; mm3 = left shift to adjust for mis-align
142 push ebp ; save frame pointer
149 pxor mm0, mm0 ; zero accumulator
151 mov eax, [ebp+8] ; get p1
153 and ebx, 7 ; Misalignment!
155 jz near dist1_00_0misalign
156 sub eax, ebx ; Align eax
157 mov ecx, 8 ; ecx = 8-misalignment
159 shl ebx, 3 ; Convert into bit-shifts...
161 movd mm2, ebx ; mm2 = shift to start msb
162 movd mm3, ecx ; mm3 = shift to end lsb
164 mov ebx, [ebp+12] ; get p2
165 mov edx, [ebp+16] ; get lx
166 mov ecx, [ebp+20] ; get rowsleft
170 movq mm4, [eax] ; load first 8 bytes of aligned p1 (row 1)
171 movq mm5, [eax+8] ; load next 8 bytes of aligned p1 (row 1)
173 psrlq mm4, mm2 ; mm4 first 8 bytes of p1 proper
176 psadbw mm4, [ebx] ; compare to first 8 bytes of p2
178 movq mm7, [eax+16] ; load last 8 bytes of aligned p1
179 add eax, edx ; update pointer to next row
180 psrlq mm6, mm2 ; mm6 2nd 8 bytes of p1 proper
185 paddd mm0, mm4 ; accumulate difference
187 psadbw mm6, [ebx+8] ; compare to next 8 bytes of p2 (row 1)
189 paddd mm0, mm6 ; accumulate difference
194 movd eax, mm0 ; store return value
207 ; int dist1_01(char *blk1,char *blk2,int lx,int h);
214 ; mm0 = distance accumulator
217 ; mm3 = 2 (rows per loop)
232 pxor mm0, mm0 ; zero accumulator
234 mov eax, [ebp+8] ; get p1
235 mov ebx, [ebp+12] ; get p2
236 mov edx, [ebp+16] ; get lx
238 mov ecx, [ebp+20] ; get rowsleft
239 jmp nextrow01 ; snap to it
242 movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
243 pavgb mm4, [eax+1] ; Interpolate...
244 psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1)
245 paddd mm0, mm4 ; accumulate difference
247 movq mm5, [eax+8] ; load next 8 bytes of p1 (row 1)
248 pavgb mm5, [eax+9] ; Interpolate
249 psadbw mm5, [ebx+8] ; compare to next 8 bytes of p2 (row 1)
250 paddd mm0, mm5 ; accumulate difference
252 add eax, edx ; update pointer to next row
255 movq mm6, [eax] ; load first 8 bytes of p1 (row 2)
256 pavgb mm6, [eax+1] ; Interpolate
257 psadbw mm6, [ebx] ; compare to first 8 bytes of p2 (row 2)
258 paddd mm0, mm6 ; accumulate difference
260 movq mm7, [eax+8] ; load next 8 bytes of p1 (row 2)
262 psadbw mm7, [ebx+8] ; compare to next 8 bytes of p2 (row 2)
263 paddd mm0, mm7 ; accumulate difference
265 add eax, edx ; update pointer to next row
268 sub ecx, 2 ; check rowsleft
269 jnz nextrow01 ; rinse and repeat
271 movd eax, mm0 ; store return value
277 pop ebp ; restore frame pointer
279 emms ; clear mmx registers
280 ret ; we now return you to your regular programming
285 ; int dist1_10(char *blk1,char *blk2,int lx,int h);
293 ; mm0 = distance accumulator
295 ; mm3 = 2 (rows per loop)
303 push ebp ; save frame pointer
311 pxor mm0, mm0 ; zero accumulator
313 mov eax, [ebp+8] ; get p1
314 mov ebx, [ebp+12] ; get p2
315 mov edx, [ebp+16] ; get lx
318 mov ecx, [ebp+20] ; get rowsleft
319 jmp nextrow10 ; snap to it
322 movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
323 pavgb mm4, [edi] ; Interpolate...
324 psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1)
325 paddd mm0, mm4 ; accumulate difference
327 movq mm5, [eax+8] ; load next 8 bytes of p1 (row 1)
328 pavgb mm5, [edi+8] ; Interpolate
329 psadbw mm5, [ebx+8] ; compare to next 8 bytes of p2 (row 1)
330 paddd mm0, mm5 ; accumulate difference
332 add eax, edx ; update pointer to next row
336 movq mm6, [eax] ; load first 8 bytes of p1 (row 2)
337 pavgb mm6, [edi] ; Interpolate
338 psadbw mm6, [ebx] ; compare to first 8 bytes of p2 (row 2)
339 paddd mm0, mm6 ; accumulate difference
341 movq mm7, [eax+8] ; load next 8 bytes of p1 (row 2)
343 psadbw mm7, [ebx+8] ; compare to next 8 bytes of p2 (row 2)
344 paddd mm0, mm7 ; accumulate difference
346 psubd mm2, mm3 ; decrease rowsleft
348 add eax, edx ; update pointer to next row
352 sub ecx, 2 ; check rowsleft (we're doing
2 at
a time
)
353 jnz nextrow10 ; rinse
and repeat
355 movd eax
, mm0 ; store return value
362 pop ebp ; restore frame pointer
364 emms ; clear mmx registers
365 ret ; we now return you to your regular programming
370 ; int dist1_11
(char
*blk1
,char
*blk2
,int
lx,int h
);
379 ; mm0
= distance accumulator
381 ; mm3
= 2 (rows per loop
)
389 push ebp ; save frame pointer
390 mov ebp
, esp ; so that we can do this
392 push ebx ; save the pigs
393 push ecx ; make them squeal
394 push edx ; lets have pigs for every meal
397 pxor mm0
, mm0 ; zero accumulator
399 mov eax
, [ebp+
8] ; get p1
400 mov ebx
, [ebp+
12] ; get p2
401 mov edx
, [ebp+
16] ; get
lx
404 mov ecx
, [ebp+
20] ; get rowsleft
405 jmp nextrow11 ; snap to it
408 movq mm4
, [eax
] ; load first
8 bytes of p1
(row
1)
409 pavgb mm4
, [edi
] ; Interpolate.
..
413 psadbw mm4
, [ebx
] ; compare to first
8 bytes of p2
(row
1)
414 paddd mm0
, mm4 ; accumulate difference
416 movq mm6
, [eax+
8] ; load next
8 bytes of p1
(row
1)
417 pavgb mm6
, [edi+
8] ; Interpolate
421 psadbw mm6
, [ebx+
8] ; compare to next
8 bytes of p2
(row
1)
422 paddd mm0
, mm6 ; accumulate difference
424 add eax
, edx ; update pointer to next row
428 movq mm4
, [eax
] ; load first
8 bytes of p1
(row
2)
429 pavgb mm4
, [edi
] ; Interpolate.
..
433 psadbw mm4
, [ebx
] ; compare to first
8 bytes of p2
(row
2)
434 paddd mm0
, mm4 ; accumulate difference
436 movq mm6
, [eax+
8] ; load next
8 bytes of p1
(row
2)
437 pavgb mm6
, [edi+
8] ; Interpolate
441 psadbw mm6
, [ebx+
8] ; compare to next
8 bytes of p2
(row
2)
442 paddd mm0
, mm6 ; accumulate difference
444 add eax
, edx ; update pointer to next row
449 sub ecx
, 2 ; check rowsleft
450 jnz near nextrow11 ; rinse
and repeat
452 movd eax
, mm0 ; store return value
459 pop ebp ; restore frame pointer
461 emms ; clear mmx registers
462 ret ; we now return you to your regular programming
466 ; int dist22_mmxe
(unsigned char
*blk1
,unsigned char
*blk2
,int flx
,int fh
);
473 ; mm0
= distance accumulator
475 ; mm3
= 2 (rows per loop
)
483 push ebp ; save frame pointer
490 pxor mm0
, mm0 ; zero accumulator
492 mov eax
, [ebp+
8] ; get p1
493 mov ebx
, [ebp+
12] ; get p2
494 mov edx
, [ebp+
16] ; get
lx
500 movq mm4
, [eax
] ; load first
8 bytes of p1
(row
1)
501 add eax
, edx ; update pointer to next row
502 psadbw mm4
, [ebx
] ; compare to first
8 bytes of p2
(row
1)
504 paddd mm0
, mm4 ; accumulate difference
507 movq mm6
, [eax
] ; load first
8 bytes of p1
(row
2)
508 add eax
, edx ; update pointer to next row
509 psadbw mm6
, [ebx
] ; compare to first
8 bytes of p2
(row
2)
511 paddd mm0
, mm6 ; accumulate difference
534 ; int dist44_mmxe
(unsigned char
*blk1
,unsigned char
*blk2
,int qlx
,int qh
);
542 ; mm0
= distance accumulator left block p1
543 ; mm1
= distance accumulator right block p1
561 pxor mm0
, mm0 ; zero accumulator
564 mov eax
, [ebp+
8] ; get p1
565 mov ebx
, [ebp+
12] ; get p2
566 mov edx
, [ebp+
16] ; get qlx
568 mov esi
, [ebp+
20] ; get rowsleft
569 jmp nextrowqd ; snap to it
572 movq mm4
, [eax
] ; load
8 bytes of p1
(two blocks
!)
573 add eax
, edx ; update pointer to next row
575 mov ecx
, [ebx
] ; load
4 bytes of p2
576 punpcklbw mm4
, mm2 ; mm4
= bytes
0..3 p1 (spaced out)
578 punpcklbw mm5
, mm2 ; mm5
= bytes
0..3 p2 (spaced out)
579 psadbw mm4
, mm5 ; compare to left block
582 ; punpckhbw mm6
, mm2 ; mm6
= bytes
4..7 p1 (spaced out)
584 paddd mm0
, mm4 ; accumulate difference left block
586 ; psadbw mm6
,mm5 ; compare to right block
589 ; paddd mm1
, mm6 ; accumulate difference right block
604 pop ebp ; restore frame pointer
606 emms ; clear mmx registers
607 ret ; we now return you to your regular programming