1 /* motion.c, motion estimation */
3 /* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
6 * Disclaimer of Warranty
8 * These software programs are available to the user without any license fee or
9 * royalty on an "as is" basis. The MPEG Software Simulation Group disclaims
10 * any and all warranties, whether express, implied, or statuary, including any
11 * implied warranties or merchantability or of fitness for a particular
12 * purpose. In no event shall the copyright-holder be liable for any
13 * incidental, punitive, or consequential damages of any kind whatsoever
14 * arising from the use of these programs.
16 * This disclaimer of warranty extends to the user of these programs and user's
17 * customers, employees, agents, transferees, successors, and assigns.
19 * The MPEG Software Simulation Group does not represent or warrant that the
20 * programs furnished hereunder are free of infringement of any third-party
23 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
24 * are subject to royalty fees to patent holders. Many of these patents are
25 * general enough such that they are unavoidable regardless of implementation
34 #include "cpu_accel.h"
37 /* Macro-block Motion compensation results record */
39 typedef struct _blockcrd
{
46 blockxy pos
; // Half-pel co-ordinates of source block
47 int sad
; // Sum of absolute difference
49 uint8_t *blk
; // Source block data (in luminace data array)
50 int hx
, hy
; // Half-pel offsets
51 int fieldsel
; // 0 = top 1 = bottom
52 int fieldoff
; // Offset from start of frame data to first line
53 // of field. (top = 0, bottom = width );
56 typedef struct mb_motion mb_motion_s
;
61 uint8_t *mb
; // One pel
62 uint8_t *fmb
; // Two-pel subsampled
63 uint8_t *qmb
; // Four-pel subsampled
64 uint8_t *umb
; // U compoenent one-pel
65 uint8_t *vmb
; // V component one-pel
68 typedef struct subsampled_mb subsampled_mb_s
;
71 static void frame_ME (motion_engine_t
*engine
,
75 int i
, int j
, struct mbinfo
*mbi
);
77 static void field_ME (motion_engine_t
*engine
,
86 static void frame_estimate (motion_engine_t
*engine
,
89 subsampled_mb_s
*ssmb
,
95 int imins
[2][2], int jmins
[2][2]);
97 static void field_estimate (motion_engine_t
*engine
,
103 subsampled_mb_s
*ssmb
,
104 int i
, int j
, int sx
, int sy
, int ipflag
,
108 mb_motion_s
*bestsp
);
110 static void dpframe_estimate (motion_engine_t
*engine
,
111 pict_data_s
*picture
,
113 subsampled_mb_s
*ssmb
,
114 int i
, int j
, int iminf
[2][2], int jminf
[2][2],
116 int *imindmvp
, int *jmindmvp
,
119 static void dpfield_estimate (motion_engine_t
*engine
,
120 pict_data_s
*picture
,
125 int imins
, int jmins
,
129 static void fullsearch (motion_engine_t
*engine
,
130 uint8_t *org
, uint8_t *ref
,
131 subsampled_mb_s
*ssblk
,
132 int lx
, int i0
, int j0
,
133 int sx
, int sy
, int h
,
135 mb_motion_s
*motion
);
137 static void find_best_one_pel( motion_engine_t
*engine
,
138 uint8_t *org
, uint8_t *blk
,
146 static int build_sub22_mcomps( motion_engine_t
*engine
,
147 int i0
, int j0
, int ihigh
, int jhigh
,
149 uint8_t *s22org
, uint8_t *s22blk
,
150 int flx
, int fh
, int searched_sub44_size
);
153 static void find_best_one_pel_mmxe( motion_engine_t
*engine
,
154 uint8_t *org
, uint8_t *blk
,
163 static int build_sub22_mcomps_mmxe( motion_engine_t
*engine
, int i0
, int j0
, int ihigh
, int jhigh
,
165 uint8_t *s22org
, uint8_t *s22blk
,
166 int flx
, int fh
, int searched_sub44_size
);
167 static int (*pmblock_sub44_dists
)( uint8_t *blk
, uint8_t *ref
,
169 int ihigh
, int jhigh
,
170 int h
, int rowstride
,
172 mc_result_s
*resvec
);
175 static int unidir_pred_var( const mb_motion_s
*motion
,
176 uint8_t *mb
, int lx
, int h
);
177 static int bidir_pred_var( const mb_motion_s
*motion_f
,
178 const mb_motion_s
*motion_b
,
179 uint8_t *mb
, int lx
, int h
);
180 static int bidir_pred_sad( const mb_motion_s
*motion_f
,
181 const mb_motion_s
*motion_b
,
182 uint8_t *mb
, int lx
, int h
);
184 static int variance( uint8_t *mb
, int size
, int lx
);
186 static int dist22 ( uint8_t *blk1
, uint8_t *blk2
, int qlx
, int qh
);
188 static int dist44 ( uint8_t *blk1
, uint8_t *blk2
, int flx
, int fh
);
189 static int dist2_22( uint8_t *blk1
, uint8_t *blk2
,
191 static int bdist2_22( uint8_t *blk1f
, uint8_t *blk1b
,
196 static void (*pfind_best_one_pel
)( motion_engine_t
*engine
,
197 uint8_t *org
, uint8_t *blk
,
205 static int (*pbuild_sub22_mcomps
)( motion_engine_t
*engine
,
206 int i0
, int j0
, int ihigh
, int jhigh
,
208 uint8_t *s22org
, uint8_t *s22blk
,
209 int flx
, int fh
, int searched_sub44_size
);
211 static int (*pdist2_22
)( uint8_t *blk1
, uint8_t *blk2
,
213 static int (*pbdist2_22
)( uint8_t *blk1f
, uint8_t *blk1b
,
217 static int dist1_00( uint8_t *blk1
, uint8_t *blk2
, int lx
, int h
, int distlim
);
218 static int dist1_01(uint8_t *blk1
, uint8_t *blk2
, int lx
, int h
);
219 static int dist1_10(uint8_t *blk1
, uint8_t *blk2
, int lx
, int h
);
220 static int dist1_11(uint8_t *blk1
, uint8_t *blk2
, int lx
, int h
);
221 static int dist2 (uint8_t *blk1
, uint8_t *blk2
,
222 int lx
, int hx
, int hy
, int h
);
223 static int bdist2 (uint8_t *pf
, uint8_t *pb
,
224 uint8_t *p2
, int lx
, int hxf
, int hyf
, int hxb
, int hyb
, int h
);
225 static int bdist1 (uint8_t *pf
, uint8_t *pb
,
226 uint8_t *p2
, int lx
, int hxf
, int hyf
, int hxb
, int hyb
, int h
);
229 static int (*pdist22
) ( uint8_t *blk1
, uint8_t *blk2
, int flx
, int fh
);
230 static int (*pdist44
) ( uint8_t *blk1
, uint8_t *blk2
, int qlx
, int qh
);
231 static int (*pdist1_00
) ( uint8_t *blk1
, uint8_t *blk2
, int lx
, int h
, int distlim
);
232 static int (*pdist1_01
) ( uint8_t *blk1
, uint8_t *blk2
, int lx
, int h
);
233 static int (*pdist1_10
) ( uint8_t *blk1
, uint8_t *blk2
, int lx
, int h
);
234 static int (*pdist1_11
) ( uint8_t *blk1
, uint8_t *blk2
, int lx
, int h
);
236 static int (*pdist2
) (uint8_t *blk1
, uint8_t *blk2
,
237 int lx
, int hx
, int hy
, int h
);
240 static int (*pbdist2
) (uint8_t *pf
, uint8_t *pb
,
241 uint8_t *p2
, int lx
, int hxf
, int hyf
, int hxb
, int hyb
, int h
);
243 static int (*pbdist1
) (uint8_t *pf
, uint8_t *pb
,
244 uint8_t *p2
, int lx
, int hxf
, int hyf
, int hxb
, int hyb
, int h
);
248 static void check_mc( const mb_motion_s *motion, int rx, int ry, int i, int j, char *str )
251 if( motion->sad < 0 || motion->sad > 0x10000 )
253 printf( "SAD ooops %s\n", str );
256 if( motion->pos.x-i*2 < -rx || motion->pos.x-i*2 >= rx )
258 printf( "X MC ooops %s = %d [%d]\n", str, motion->pos.x-i*2,rx );
261 if( motion->pos.y-j*2 < -ry || motion->pos.y-j*2 >= ry )
263 printf( "Y MC ooops %s %d [%d]\n", str, motion->pos.y-j*2, ry );
268 static void init_mc( mb_motion_s *motion )
271 motion->pos.x = -1000;
272 motion->pos.y = -1000;
277 Compute the combined variance of luminance and chrominance information
278 for a particular non-intra macro block after unidirectional
279 motion compensation...
281 Note: results are scaled to give chrominance equal weight to
282 chrominance. The variance of the luminance portion is computed
283 at the time the motion compensation is computed.
285 TODO: Perhaps we should compute the whole thing in fullsearch not
286 seperate it. However, that would involve a lot of fiddling with
287 field_* and until its thoroughly debugged and tested I think I'll
288 leave that alone. Furthermore, it is unclear if its really worth
289 doing these computations for B *and* P frames.
291 TODO: BUG: ONLY works for 420 video...
295 static int unidir_chrom_var_sum( mb_motion_s
*lum_mc
,
297 subsampled_mb_s
*ssblk
,
302 /* N.b. MC co-ordinates are computed in half-pel units! */
303 int cblkoffset
= (lum_mc
->fieldoff
>>1) +
304 (lum_mc
->pos
.x
>>2) + (lum_mc
->pos
.y
>>2)*uvlx
;
306 return ((*pdist2_22
)( ref
[1] + cblkoffset
, ssblk
->umb
, uvlx
, uvh
) +
307 (*pdist2_22
)( ref
[2] + cblkoffset
, ssblk
->vmb
, uvlx
, uvh
))*2;
312 Compute the combined variance of luminance and chrominance information
313 for a particular non-intra macro block after unidirectional
314 motion compensation...
316 Note: results are scaled to give chrominance equal weight to
317 chrominance. The variance of the luminance portion is computed
318 at the time the motion compensation is computed.
320 Note: results scaled to give chrominance equal weight to chrominance.
322 TODO: BUG: ONLY works for 420 video...
324 NOTE: Currently unused but may be required if it turns out that taking
325 chrominance into account in B frames is needed.
329 int bidir_chrom_var_sum( mb_motion_s
*lum_mc_f
,
330 mb_motion_s
*lum_mc_b
,
333 subsampled_mb_s
*ssblk
,
338 /* N.b. MC co-ordinates are computed in half-pel units! */
339 int cblkoffset_f
= (lum_mc_f
->fieldoff
>>1) +
340 (lum_mc_f
->pos
.x
>>2) + (lum_mc_f
->pos
.y
>>2)*uvlx
;
341 int cblkoffset_b
= (lum_mc_b
->fieldoff
>>1) +
342 (lum_mc_b
->pos
.x
>>2) + (lum_mc_f
->pos
.y
>>2)*uvlx
;
345 (*pbdist2_22
)( ref_f
[1] + cblkoffset_f
, ref_b
[1] + cblkoffset_b
,
346 ssblk
->umb
, uvlx
, uvh
) +
347 (*pbdist2_22
)( ref_f
[2] + cblkoffset_f
, ref_b
[2] + cblkoffset_b
,
348 ssblk
->vmb
, uvlx
, uvh
))*2;
352 static int chrom_var_sum( subsampled_mb_s
*ssblk
, int h
, int lx
)
354 return (variance(ssblk
->umb
,(h
>>1),(lx
>>1)) +
355 variance(ssblk
->vmb
,(h
>>1),(lx
>>1))) * 2;
359 * frame picture motion estimation
361 * org: top left pel of source reference frame
362 * ref: top left pel of reconstructed reference frame
363 * ssmb: macroblock to be matched
364 * i,j: location of mb relative to ref (=center of search window)
365 * sx,sy: half widths of search window
366 * besfr: location and SAD of best frame prediction
367 * besttop: location of best field pred. for top field of mb
368 * bestbo : location of best field pred. for bottom field of mb
371 static void frame_estimate(motion_engine_t
*engine
,
374 subsampled_mb_s
*ssmb
,
375 int i
, int j
, int sx
, int sy
,
377 mb_motion_s
*besttop
,
378 mb_motion_s
*bestbot
,
383 subsampled_mb_s botssmb
;
384 mb_motion_s topfld_mc
;
385 mb_motion_s botfld_mc
;
387 botssmb
.mb
= ssmb
->mb
+width
;
388 botssmb
.fmb
= ssmb
->mb
+(width
>>1);
389 botssmb
.qmb
= ssmb
->qmb
+(width
>>2);
390 botssmb
.umb
= ssmb
->umb
+(width
>>1);
391 botssmb
.vmb
= ssmb
->vmb
+(width
>>1);
393 /* frame prediction */
394 fullsearch(engine
, org
,ref
,ssmb
,width
,i
,j
,sx
,sy
,16,width
,height
,
396 bestfr
->fieldsel
= 0;
397 bestfr
->fieldoff
= 0;
399 /* predict top field from top field */
400 fullsearch(engine
, org
,ref
,ssmb
,width
<<1,i
,j
>>1,sx
,sy
>>1,8,width
,height
>>1,
403 /* predict top field from bottom field */
404 fullsearch(engine
, org
+width
,ref
+width
,ssmb
, width
<<1,i
,j
>>1,sx
,sy
>>1,8,
405 width
,height
>>1, &botfld_mc
);
407 /* set correct field selectors... */
408 topfld_mc
.fieldsel
= 0;
409 botfld_mc
.fieldsel
= 1;
410 topfld_mc
.fieldoff
= 0;
411 botfld_mc
.fieldoff
= width
;
413 imins
[0][0] = topfld_mc
.pos
.x
;
414 jmins
[0][0] = topfld_mc
.pos
.y
;
415 imins
[1][0] = botfld_mc
.pos
.x
;
416 jmins
[1][0] = botfld_mc
.pos
.y
;
418 /* select prediction for top field */
419 if (topfld_mc
.sad
<=botfld_mc
.sad
)
421 *besttop
= topfld_mc
;
425 *besttop
= botfld_mc
;
428 /* predict bottom field from top field */
429 fullsearch(engine
, org
,ref
,&botssmb
,
430 width
<<1,i
,j
>>1,sx
,sy
>>1,8,width
,height
>>1,
433 /* predict bottom field from bottom field */
434 fullsearch(engine
, org
+width
,ref
+width
,&botssmb
,
435 width
<<1,i
,j
>>1,sx
,sy
>>1,8,width
,height
>>1,
438 /* set correct field selectors... */
439 topfld_mc
.fieldsel
= 0;
440 botfld_mc
.fieldsel
= 1;
441 topfld_mc
.fieldoff
= 0;
442 botfld_mc
.fieldoff
= width
;
444 imins
[0][1] = topfld_mc
.pos
.x
;
445 jmins
[0][1] = topfld_mc
.pos
.y
;
446 imins
[1][1] = botfld_mc
.pos
.x
;
447 jmins
[1][1] = botfld_mc
.pos
.y
;
449 /* select prediction for bottom field */
450 if (botfld_mc
.sad
<=topfld_mc
.sad
)
452 *bestbot
= botfld_mc
;
456 *bestbot
= topfld_mc
;
461 * field picture motion estimation subroutine
463 * toporg: address of original top reference field
464 * topref: address of reconstructed top reference field
465 * botorg: address of original bottom reference field
466 * botref: address of reconstructed bottom reference field
467 * ssmmb: macroblock to be matched
468 * i,j: location of mb (=center of search window)
469 * sx,sy: half width/height of search window
471 * bestfld: location and distance of best field prediction
472 * best8u: location of best 16x8 pred. for upper half of mb
473 * best8lp: location of best 16x8 pred. for lower half of mb
474 * bdestsp: location and distance of best same parity field
475 * prediction (needed for dual prime, only valid if
479 static void field_estimate (motion_engine_t
*engine
,
480 pict_data_s
*picture
,
485 subsampled_mb_s
*ssmb
,
486 int i
, int j
, int sx
, int sy
, int ipflag
,
487 mb_motion_s
*bestfld
,
493 mb_motion_s topfld_mc
;
494 mb_motion_s botfld_mc
;
497 subsampled_mb_s botssmb
;
499 botssmb
.mb
= ssmb
->mb
+width
;
500 botssmb
.umb
= ssmb
->umb
+(width
>>1);
501 botssmb
.vmb
= ssmb
->vmb
+(width
>>1);
502 botssmb
.fmb
= ssmb
->fmb
+(width
>>1);
503 botssmb
.qmb
= ssmb
->qmb
+(width
>>2);
505 /* if ipflag is set, predict from field of opposite parity only */
506 notop
= ipflag
&& (picture
->pict_struct
==TOP_FIELD
);
507 nobot
= ipflag
&& (picture
->pict_struct
==BOTTOM_FIELD
);
509 /* field prediction */
511 /* predict current field from top field */
513 topfld_mc
.sad
= dt
= 65536; /* infinity */
515 fullsearch(engine
, toporg
,topref
,ssmb
,width
<<1,
516 i
,j
,sx
,sy
>>1,16,width
,height
>>1,
519 /* predict current field from bottom field */
521 botfld_mc
.sad
= db
= 65536; /* infinity */
523 fullsearch(engine
, botorg
,botref
,ssmb
,width
<<1,
524 i
,j
,sx
,sy
>>1,16,width
,height
>>1,
527 /* Set correct field selectors */
528 topfld_mc
.fieldsel
= 0;
529 botfld_mc
.fieldsel
= 1;
530 topfld_mc
.fieldoff
= 0;
531 botfld_mc
.fieldoff
= width
;
533 /* same parity prediction (only valid if ipflag==0) */
534 if (picture
->pict_struct
==TOP_FIELD
)
543 /* select field prediction */
546 *bestfld
= topfld_mc
;
550 *bestfld
= botfld_mc
;
554 /* 16x8 motion compensation */
556 /* predict upper half field from top field */
558 topfld_mc
.sad
= dt
= 65536;
560 fullsearch(engine
, toporg
,topref
,ssmb
,width
<<1,
561 i
,j
,sx
,sy
>>1,8,width
,height
>>1,
564 /* predict upper half field from bottom field */
566 botfld_mc
.sad
= db
= 65536;
568 fullsearch(engine
, botorg
,botref
,ssmb
,width
<<1,
569 i
,j
,sx
,sy
>>1,8,width
,height
>>1,
573 /* Set correct field selectors */
574 topfld_mc
.fieldsel
= 0;
575 botfld_mc
.fieldsel
= 1;
576 topfld_mc
.fieldoff
= 0;
577 botfld_mc
.fieldoff
= width
;
579 /* select prediction for upper half field */
589 /* predict lower half field from top field */
591 N.b. For interlaced data width<<4 (width*16) takes us 8 rows
592 down in the same field.
593 Thus for the fast motion data (2*2
594 sub-sampled) we need to go 4 rows down in the same field.
595 This requires adding width*4 = (width<<2).
596 For the 4*4 sub-sampled motion data we need to go down 2 rows.
597 This requires adding width = width
601 topfld_mc
.sad
= dt
= 65536;
603 fullsearch(engine
, toporg
,topref
,&botssmb
,
605 i
,j
+8,sx
,sy
>>1,8,width
,height
>>1,
606 /* &imint,&jmint, &dt,*/ &topfld_mc
);
608 /* predict lower half field from bottom field */
610 botfld_mc
.sad
= db
= 65536;
612 fullsearch(engine
, botorg
,botref
,&botssmb
,width
<<1,
613 i
,j
+8,sx
,sy
>>1,8,width
,height
>>1,
614 /* &iminb,&jminb, &db,*/ &botfld_mc
);
616 /* Set correct field selectors */
617 topfld_mc
.fieldsel
= 0;
618 botfld_mc
.fieldsel
= 1;
619 topfld_mc
.fieldoff
= 0;
620 botfld_mc
.fieldoff
= width
;
622 /* select prediction for lower half field */
633 static void dpframe_estimate (motion_engine_t
*engine
,
634 pict_data_s
*picture
,
636 subsampled_mb_s
*ssmb
,
638 int i
, int j
, int iminf
[2][2], int jminf
[2][2],
639 mb_motion_s
*best_mc
,
640 int *imindmvp
, int *jmindmvp
,
643 int pref
,ppred
,delta_x
,delta_y
;
644 int is
,js
,it
,jt
,ib
,jb
,it0
,jt0
,ib0
,jb0
;
645 int imins
,jmins
,imint
,jmint
,iminb
,jminb
,imindmv
,jmindmv
;
648 /* Calculate Dual Prime distortions for 9 delta candidates
649 * for each of the four minimum field vectors
650 * Note: only for P pictures!
653 /* initialize minimum dual prime distortion to large value */
656 for (pref
=0; pref
<2; pref
++)
658 for (ppred
=0; ppred
<2; ppred
++)
660 /* convert Cartesian absolute to relative motion vector
661 * values (wrt current macroblock address (i,j)
663 is
= iminf
[pref
][ppred
] - (i
<<1);
664 js
= jminf
[pref
][ppred
] - (j
<<1);
668 /* vertical field shift adjustment */
674 /* mvxs and mvys scaling*/
677 if (picture
->topfirst
== ppred
)
679 /* second field: scale by 1/3 */
680 is
= (is
>=0) ? (is
+1)/3 : -((-is
+1)/3);
681 js
= (js
>=0) ? (js
+1)/3 : -((-js
+1)/3);
687 /* vector for prediction from field of opposite 'parity' */
688 if (picture
->topfirst
)
690 /* vector for prediction of top field from bottom field */
691 it0
= ((is
+(is
>0))>>1);
692 jt0
= ((js
+(js
>0))>>1) - 1;
694 /* vector for prediction of bottom field from top field */
695 ib0
= ((3*is
+(is
>0))>>1);
696 jb0
= ((3*js
+(js
>0))>>1) + 1;
700 /* vector for prediction of top field from bottom field */
701 it0
= ((3*is
+(is
>0))>>1);
702 jt0
= ((3*js
+(js
>0))>>1) - 1;
704 /* vector for prediction of bottom field from top field */
705 ib0
= ((is
+(is
>0))>>1);
706 jb0
= ((js
+(js
>0))>>1) + 1;
709 /* convert back to absolute half-pel field picture coordinates */
717 if (is
>= 0 && is
<= (width
-16)<<1 &&
718 js
>= 0 && js
<= (height
-16))
720 for (delta_y
=-1; delta_y
<=1; delta_y
++)
722 for (delta_x
=-1; delta_x
<=1; delta_x
++)
724 /* opposite field coordinates */
730 if (it
>= 0 && it
<= (width
-16)<<1 &&
731 jt
>= 0 && jt
<= (height
-16) &&
732 ib
>= 0 && ib
<= (width
-16)<<1 &&
733 jb
>= 0 && jb
<= (height
-16))
735 /* compute prediction error */
736 local_dist
= (*pbdist2
)(
737 ref
+ (is
>>1) + (width
<<1)*(js
>>1),
738 ref
+ width
+ (it
>>1) + (width
<<1)*(jt
>>1),
739 ssmb
->mb
, /* current mb location */
740 width
<<1, /* adjacent line distance */
741 is
&1, js
&1, it
&1, jt
&1, /* half-pel flags */
742 8); /* block height */
743 local_dist
+= (*pbdist2
)(
744 ref
+ width
+ (is
>>1) + (width
<<1)*(js
>>1),
745 ref
+ (ib
>>1) + (width
<<1)*(jb
>>1),
746 ssmb
->mb
+ width
, /* current mb location */
747 width
<<1, /* adjacent line distance */
748 is
&1, js
&1, ib
&1, jb
&1, /* half-pel flags */
749 8); /* block height */
751 /* update delta with least distortion vector */
752 if (local_dist
< vmc
)
765 } /* end delta x loop */
766 } /* end delta y loop */
771 /* TODO: This is now likely to be obsolete... */
772 /* Compute L1 error for decision purposes */
773 local_dist
= (*pbdist1
)(
774 ref
+ (imins
>>1) + (width
<<1)*(jmins
>>1),
775 ref
+ width
+ (imint
>>1) + (width
<<1)*(jmint
>>1),
778 imins
&1, jmins
&1, imint
&1, jmint
&1,
780 //printf("motion 1 %p\n", pbdist1);
781 local_dist
+= (*pbdist1
)(
782 ref
+ width
+ (imins
>>1) + (width
<<1)*(jmins
>>1),
783 ref
+ (iminb
>>1) + (width
<<1)*(jminb
>>1),
786 imins
&1, jmins
&1, iminb
&1, jminb
&1,
788 //printf("motion 2\n");
790 best_mc
->sad
= local_dist
;
791 best_mc
->pos
.x
= imins
;
792 best_mc
->pos
.y
= jmins
;
796 //printf("motion 2\n");
799 static void dpfield_estimate(motion_engine_t
*engine
,
800 pict_data_s
*picture
,
804 int i
, int j
, int imins
, int jmins
,
805 mb_motion_s
*bestdp_mc
,
810 uint8_t *sameref
, *oppref
;
811 int io0
,jo0
,io
,jo
,delta_x
,delta_y
,mvxs
,mvys
,mvxo0
,mvyo0
;
812 int imino
,jmino
,imindmv
,jmindmv
,vmc_dp
,local_dist
;
814 /* Calculate Dual Prime distortions for 9 delta candidates */
815 /* Note: only for P pictures! */
817 /* Assign opposite and same reference pointer */
818 if (picture
->pict_struct
==TOP_FIELD
)
829 /* convert Cartesian absolute to relative motion vector
830 * values (wrt current macroblock address (i,j)
832 mvxs
= imins
- (i
<<1);
833 mvys
= jmins
- (j
<<1);
835 /* vector for prediction from field of opposite 'parity' */
836 mvxo0
= (mvxs
+(mvxs
>0)) >> 1; /* mvxs / / */
837 mvyo0
= (mvys
+(mvys
>0)) >> 1; /* mvys / / 2*/
839 /* vertical field shift correction */
840 if (picture
->pict_struct
==TOP_FIELD
)
845 /* convert back to absolute coordinates */
846 io0
= mvxo0
+ (i
<<1);
847 jo0
= mvyo0
+ (j
<<1);
849 /* initialize minimum dual prime distortion to large value */
852 for (delta_y
= -1; delta_y
<= 1; delta_y
++)
854 for (delta_x
= -1; delta_x
<=1; delta_x
++)
856 /* opposite field coordinates */
860 if (io
>= 0 && io
<= (width
-16)<<1 &&
861 jo
>= 0 && jo
<= (height2
-16)<<1)
863 /* compute prediction error */
864 local_dist
= (*pbdist2
)(
865 sameref
+ (imins
>>1) + width2
*(jmins
>>1),
866 oppref
+ (io
>>1) + width2
*(jo
>>1),
867 mb
, /* current mb location */
868 width2
, /* adjacent line distance */
869 imins
&1, jmins
&1, io
&1, jo
&1, /* half-pel flags */
870 16); /* block height */
872 /* update delta with least distortion vector */
873 if (local_dist
< vmc_dp
)
882 } /* end delta x loop */
883 } /* end delta y loop */
885 /* Compute L1 error for decision purposes */
888 sameref
+ (imins
>>1) + width2
*(jmins
>>1),
889 oppref
+ (imino
>>1) + width2
*(jmino
>>1),
890 mb
, /* current mb location */
891 width2
, /* adjacent line distance */
892 imins
&1, jmins
&1, imino
&1, jmino
&1, /* half-pel flags */
893 16); /* block height */
895 bestdp_mc
->pos
.x
= imindmv
;
896 bestdp_mc
->pos
.x
= imindmv
;
902 * Vectors of motion compensations
906 Take a vector of motion compensations and repeatedly make passes
907 discarding all elements whose sad "weight" is above the current mean weight.
910 static void sub_mean_reduction( mc_result_s
*matches
, int len
,
912 int *newlen_res
, int *minweight_res
)
917 int min_weight
= 100000;
920 *minweight_res
= 100000;
928 for( i
= 0; i
< len
; ++i
)
929 weight_sum
+= matches
[i
].weight
;
930 mean_weight
= weight_sum
/ len
;
936 for( i
=0; i
< len
; ++i
)
938 if( matches
[i
].weight
<= mean_weight
)
942 min_weight
= matches
[i
].weight
;
944 matches
[j
] = matches
[i
];
952 *minweight_res
= mean_weight
;
956 Build a vector of the top 4*4 sub-sampled motion compensations in the box
957 (ilow,jlow) to (ihigh,jhigh).
959 The algorithm is as follows:
960 1. coarse matches on an 8*8 grid of positions are collected that fall below
961 a (very conservative) sad threshold (basically 50% more than moving average of
962 the mean sad of such matches).
964 2. The worse than-average matches are discarded.
966 3. The remaining coarse matches are expanded with the left/lower neighbouring
967 4*4 grid matches. Again only those better than a threshold (this time the mean
968 of the 8*8 grid matches are retained.
970 4. Multiple passes are made discarding worse than-average matches. The number of
971 passes is specified by the user. The default it 2 (leaving roughly 1/4 of the matches).
973 The net result is very fast and find good matches if they're to be found. I.e. the
974 penalty over exhaustive search is pretty low.
976 NOTE: The "discard below average" trick depends critically on having some variation in
977 the matches. The slight penalty imposed for distant matches (reasonably since the
978 motion vectors have to be encoded) is *vital* as otherwise pathologically bad
979 performance results on highly uniform images.
981 TODO: We should probably allow the user to eliminate the initial thinning of 8*8
982 grid matches if ultimate quality is demanded (e.g. for low bit-rate applications).
987 static int build_sub44_mcomps(motion_engine_t
*engine
,
988 int ilow
, int jlow
, int ihigh
, int jhigh
,
991 uint8_t *s44org
, uint8_t *s44blk
, int qlx
, int qh
)
1003 /*int rangex, rangey;
1004 static int rough_num_mcomps;
1005 static mc_result_s rough_mcomps[MAX_44_MATCHES];
1011 uint8_t *old_s44orgblk
;
1013 /* N.b. we may ignore the right hand block of the pair going over the
1014 right edge as we have carefully allocated the buffer oversize to ensure
1015 no memory faults. The later motion compensation calculations
1016 performed on the results of this pass will filter out
1017 out-of-range blocks...
1021 engine
->sub44_num_mcomps
= 0;
1023 threshold
= 6*null_mc_sad
/ (4*4*mc_44_red
);
1024 s44orgblk
= s44org
+(ilow
>>2)+qlx
*(jlow
>>2);
1026 /* Exhaustive search on 4*4 sub-sampled data. This is affordable because
1027 (a) it is only 16th of the size of the real 1-pel data
1028 (b) we ignore those matches with an sad above our threshold.
1032 /* Invariant: s44orgblk = s44org+(i>>2)+qlx*(j>>2) */
1033 s44orgblk
= s44org
+(ilow
>>2)+qlx
*(jlow
>>2);
1034 for( j
= jstrt
; j
<= jend
; j
+= 4 )
1036 old_s44orgblk
= s44orgblk
;
1037 for( i
= istrt
; i
<= iend
; i
+= 4 )
1039 s1
= ((*pdist44
)( s44orgblk
,s44blk
,qlx
,qh
) & 0xffff) + abs(i
-i0
) + abs(j
-j0
);
1040 if( s1
< threshold
)
1042 engine
->sub44_mcomps
[engine
->sub44_num_mcomps
].x
= i
;
1043 engine
->sub44_mcomps
[engine
->sub44_num_mcomps
].y
= j
;
1044 engine
->sub44_mcomps
[engine
->sub44_num_mcomps
].weight
= s1
;
1045 ++engine
->sub44_num_mcomps
;
1049 s44orgblk
= old_s44orgblk
+ qlx
;
1054 engine
->sub44_num_mcomps
1055 = (*pmblock_sub44_dists
)( s44orgblk
, s44blk
,
1060 engine
->sub44_mcomps
);
1063 /* If we're really pushing quality we reduce once otherwise twice. */
1065 sub_mean_reduction( engine
->sub44_mcomps
, engine
->sub44_num_mcomps
, 1+(mc_44_red
>1),
1066 &engine
->sub44_num_mcomps
, &mean_weight
);
1069 return engine
->sub44_num_mcomps
;
1073 /* Build a vector of the best 2*2 sub-sampled motion
1074 compensations using the best 4*4 matches as starting points. As
1075 with with the 4*4 matches We don't collect them densely as they're
1076 just search starting points for 1-pel search and ones that are 1 out
1077 should still give better than average matches...
1082 static int build_sub22_mcomps(motion_engine_t
*engine
,
1083 int i0
, int j0
, int ihigh
, int jhigh
,
1085 uint8_t *s22org
, uint8_t *s22blk
,
1086 int flx
, int fh
, int searched_sub44_size
)
1089 int threshold
= 6*null_mc_sad
/ (2 * 2*mc_22_red
);
1092 int ilim
= ihigh
-i0
;
1093 int jlim
= jhigh
-j0
;
1097 engine
->sub22_num_mcomps
= 0;
1098 for( k
= 0; k
< searched_sub44_size
; ++k
)
1101 matchrec
.x
= engine
->sub44_mcomps
[k
].x
;
1102 matchrec
.y
= engine
->sub44_mcomps
[k
].y
;
1104 s22orgblk
= s22org
+((matchrec
.y
+j0
)>>1)*flx
+((matchrec
.x
+i0
)>>1);
1105 for( i
= 0; i
< 4; ++i
)
1107 if( matchrec
.x
<= ilim
&& matchrec
.y
<= jlim
)
1109 s
= (*pdist22
)( s22orgblk
,s22blk
,flx
,fh
);
1112 engine
->sub22_mcomps
[engine
->sub22_num_mcomps
].x
= (int8_t)matchrec
.x
;
1113 engine
->sub22_mcomps
[engine
->sub22_num_mcomps
].y
= (int8_t)matchrec
.y
;
1114 engine
->sub22_mcomps
[engine
->sub22_num_mcomps
].weight
= s
;
1115 ++engine
->sub22_num_mcomps
;
1136 sub_mean_reduction( engine
->sub22_mcomps
, engine
->sub22_num_mcomps
, 2,
1137 &engine
->sub22_num_mcomps
, &min_weight
);
1138 return engine
->sub22_num_mcomps
;
1142 int build_sub22_mcomps_mmxe(motion_engine_t
*engine
,
1143 int i0
, int j0
, int ihigh
, int jhigh
,
1145 uint8_t *s22org
, uint8_t *s22blk
,
1146 int flx
, int fh
, int searched_sub44_size
)
1149 int threshold
= 6*null_mc_sad
/ (2 * 2*mc_22_red
);
1152 int ilim
= ihigh
-i0
;
1153 int jlim
= jhigh
-j0
;
1158 /* TODO: The calculation of the lstrow offset really belongs in
1160 int lstrow
=(fh
-1)*flx
;
1162 engine
->sub22_num_mcomps
= 0;
1163 for( k
= 0; k
< searched_sub44_size
; ++k
)
1166 matchrec
.x
= engine
->sub44_mcomps
[k
].x
;
1167 matchrec
.y
= engine
->sub44_mcomps
[k
].y
;
1169 s22orgblk
= s22org
+((matchrec
.y
+j0
)>>1)*flx
+((matchrec
.x
+i0
)>>1);
1170 mblockq_dist22_mmxe(s22orgblk
+lstrow
, s22blk
+lstrow
, flx
, fh
, resvec
);
1171 for( i
= 0; i
< 4; ++i
)
1173 if( matchrec
.x
<= ilim
&& matchrec
.y
<= jlim
)
1178 engine
->sub22_mcomps
[engine
->sub22_num_mcomps
].x
= (int8_t)matchrec
.x
;
1179 engine
->sub22_mcomps
[engine
->sub22_num_mcomps
].y
= (int8_t)matchrec
.y
;
1180 engine
->sub22_mcomps
[engine
->sub22_num_mcomps
].weight
= s
;
1181 ++engine
->sub22_num_mcomps
;
1199 sub_mean_reduction( engine
->sub22_mcomps
, engine
->sub22_num_mcomps
, 2,
1200 &engine
->sub22_num_mcomps
, &min_weight
);
1201 return engine
->sub22_num_mcomps
;
1207 Search for the best 1-pel match within 1-pel of a good 2*2-pel match.
1208 TODO: Its a bit silly to cart around absolute M/C co-ordinates that
1209 eventually get turned into relative ones anyway...
1213 static void find_best_one_pel(motion_engine_t
*engine
,
1214 uint8_t *org
, uint8_t *blk
,
1226 blockxy minpos
= res
->pos
;
1234 init_search
= searched_size
;
1235 init_size
= engine
->sub22_num_mcomps
;
1236 for( k
= 0; k
< searched_size
; ++k
)
1238 matchrec
.x
= i0
+ engine
->sub22_mcomps
[k
].x
;
1239 matchrec
.y
= j0
+ engine
->sub22_mcomps
[k
].y
;
1240 orgblk
= org
+ matchrec
.x
+lx
*matchrec
.y
;
1241 penalty
= abs(matchrec
.x
)+abs(matchrec
.y
);
1243 for( i
= 0; i
< 4; ++i
)
1245 if( matchrec
.x
<= xmax
&& matchrec
.y
<= ymax
)
1248 d
= penalty
+(*pdist1_00
)(orgblk
,blk
,lx
,h
, dmin
);
1270 res
->blk
= org
+ minpos
.x
+lx
*minpos
.y
;
1276 void find_best_one_pel_mmxe(motion_engine_t
*engine
,
1277 uint8_t *org
, uint8_t *blk
,
1289 blockxy minpos
= res
->pos
;
1299 init_search
= searched_size
;
1300 init_size
= engine
->sub22_num_mcomps
;
1301 for( k
= 0; k
< searched_size
; ++k
)
1303 matchrec
.x
= i0
+ engine
->sub22_mcomps
[k
].x
;
1304 matchrec
.y
= j0
+ engine
->sub22_mcomps
[k
].y
;
1305 orgblk
= org
+ matchrec
.x
+lx
*matchrec
.y
;
1306 penalty
= abs(matchrec
.x
)+abs(matchrec
.y
);
1309 mblockq_dist1_mmxe(orgblk
,blk
,lx
,h
, resvec
);
1311 for( i
= 0; i
< 4; ++i
)
1313 if( matchrec
.x
<= xmax
&& matchrec
.y
<= ymax
)
1316 d
= penalty
+resvec
[i
];
1338 res
->blk
= org
+ minpos
.x
+lx
*minpos
.y
;
1345 * full search block matching
1347 * A.Stevens 2000: This is now a big misnomer. The search is now a hierarchical/sub-sampling
1348 * search not a full search. However, experiments have shown it is always close to
1349 * optimal and almost always very close or optimal.
1351 * blk: top left pel of (16*h) block
1352 * s22blk: top element of fast motion compensation block corresponding to blk
1353 * h: height of block
1354 * lx: distance (in bytes) of vertically adjacent pels in ref,blk
1355 * org: top left pel of source reference picture
1356 * ref: top left pel of reconstructed reference picture
1357 * i0,j0: center of search window
1358 * sx,sy: half widths of search window
1359 * xmax,ymax: right/bottom limits of search area
1360 * iminp,jminp: pointers to where the result is stored
1361 * result is given as half pel offset from ref(0,0)
1362 * i.e. NOT relative to (i0,j0)
1366 static void fullsearch(motion_engine_t
*engine
,
1369 subsampled_mb_s
*ssblk
,
1370 int lx
, int i0
, int j0
,
1371 int sx
, int sy
, int h
,
1373 /* int *iminp, int *jminp, int *sadminp, */
1378 /* int imin, jmin, dmin */
1379 int i
,j
,ilow
,ihigh
,jlow
,jhigh
;
1382 /* NOTE: Surprisingly, the initial motion compensation search
1383 works better when the original image not the reference (reconstructed)
1386 uint8_t *s22org
= (uint8_t*)(org
+fsubsample_offset
);
1387 uint8_t *s44org
= (uint8_t*)(org
+qsubsample_offset
);
1395 /* xmax and ymax into more useful form... */
1400 /* The search radii are *always* multiples of 4 to avoid messiness
1401 in the initial 4*4 pel search. This is handled by the
1402 parameter checking/processing code in readparmfile() */
1404 /* Create a distance-order mcomps of possible motion compensations
1405 based on the fast estimation data - 4*4 pel sums (4*4
1406 sub-sampled) rather than actual pel's. 1/16 the size... */
1408 jlow
= jlow
< 0 ? 0 : jlow
;
1410 jhigh
= jhigh
> ymax
? ymax
: jhigh
;
1412 ilow
= ilow
< 0 ? 0 : ilow
;
1414 ihigh
= ihigh
> xmax
? xmax
: ihigh
;
1417 Very rarely this may fail to find matchs due to all the good
1418 looking ones being over threshold. hence we make sure we
1419 fall back to a 0 motion compensation in this case.
1421 The sad for the 0 motion compensation is also very useful as
1422 a basis for setting thresholds for rejecting really dud 4*4
1423 and 2*2 sub-sampled matches.
1425 best
.sad
= (*pdist1_00
)(ref
+ i0
+ j0
* lx
,
1433 engine
->sub44_num_mcomps
= build_sub44_mcomps(engine
,
1434 ilow
, jlow
, ihigh
, jhigh
,
1438 ssblk
->qmb
, qlx
, qh
);
1441 /* Now create a distance-ordered mcomps of possible motion
1442 compensations based on the fast estimation data - 2*2 pel sums
1443 using the best fraction of the 4*4 estimates However we cover
1444 only coarsely... on 4-pel boundaries... */
1446 engine
->sub22_num_mcomps
= (*pbuild_sub22_mcomps
)( engine
, i0
, j0
, ihigh
, jhigh
,
1448 s22org
, ssblk
->fmb
, flx
, fh
,
1449 engine
->sub44_num_mcomps
);
1452 /* Now choose best 1-pel match from what approximates (not exact
1453 due to the pre-processing trick with the mean) the top 1/2 of
1458 (*pfind_best_one_pel
)( engine
, ref
, ssblk
->mb
, engine
->sub22_num_mcomps
,
1460 ilow
, jlow
, xmax
, ymax
,
1463 /* Final polish: half-pel search of best candidate against
1464 reconstructed image.
1472 ilow
= best
.pos
.x
- (best
.pos
.x
>(ilow
<<1));
1473 ihigh
= best
.pos
.x
+ (best
.pos
.x
<((ihigh
)<<1));
1474 jlow
= best
.pos
.y
- (best
.pos
.y
>(jlow
<<1));
1475 jhigh
= best
.pos
.y
+ (best
.pos
.y
<((jhigh
)<<1));
1477 for (j
=jlow
; j
<=jhigh
; j
++)
1479 for (i
=ilow
; i
<=ihigh
; i
++)
1481 orgblk
= ref
+(i
>>1)+((j
>>1)*lx
);
1485 d
= (*pdist1_11
)(orgblk
,ssblk
->mb
,lx
,h
);
1487 d
= (*pdist1_01
)(orgblk
,ssblk
->mb
,lx
,h
);
1492 d
= (*pdist1_10
)(orgblk
,ssblk
->mb
,lx
,h
);
1494 d
= (*pdist1_00
)(orgblk
,ssblk
->mb
,lx
,h
,best
.sad
);
1507 best
.var
= (*pdist2
)(best
.blk
, ssblk
->mb
, lx
, best
.hx
, best
.hy
, h
);
1512 * total absolute difference between two (16*h) blocks
1513 * including optional half pel interpolation of blk1 (hx,hy)
1514 * blk1,blk2: addresses of top left pels of both blocks
1515 * lx: distance (in bytes) of vertically adjacent pels
1516 * hx,hy: flags for horizontal and/or vertical interpolation
1517 * h: height of block (usually 8 or 16)
1518 * distlim: bail out if sum exceeds this value
1521 /* A.Stevens 2000: New version for highly pipelined CPUs where branching is
1522 costly. Really it sucks that C doesn't define a stdlib abs that could
1523 be realised as a compiler intrinsic using appropriate CPU instructions.
1524 That 1970's heritage...
1528 static int dist1_00(uint8_t *blk1
,uint8_t *blk2
,
1529 int lx
, int h
,int distlim
)
1542 #define pipestep(o) v = p1[o]-p2[o]; s+= abs(v);
1543 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1544 pipestep(4); pipestep(5); pipestep(6); pipestep(7);
1545 pipestep(8); pipestep(9); pipestep(10); pipestep(11);
1546 pipestep(12); pipestep(13); pipestep(14); pipestep(15);
1558 static int dist1_01(uint8_t *blk1
,uint8_t *blk2
,
1571 for (i
=0; i
<16; i
++)
1574 v
= ((unsigned int)(p1
[i
]+p1
[i
+1])>>1) - p2
[i
];
1576 v = ((p1[i]>>1)+(p1[i+1]>>1)>>1) - (p2[i]>>1);
1586 static int dist1_10(uint8_t *blk1
,uint8_t *blk2
,
1589 uint8_t *p1
,*p1a
,*p2
;
1600 for (i
=0; i
<16; i
++)
1602 v
= ((unsigned int)(p1
[i
]+p1a
[i
])>>1) - p2
[i
];
1613 static int dist1_11(uint8_t *blk1
,uint8_t *blk2
,
1616 uint8_t *p1
,*p1a
,*p2
;
1628 for (i
=0; i
<16; i
++)
1630 v
= ((unsigned int)(p1
[i
]+p1
[i
+1]+p1a
[i
]+p1a
[i
+1])>>2) - p2
[i
];
1640 /* USED only during debugging...
1641 void check_fast_motion_data(uint8_t *blk, char *label )
1646 uint8_t *start_s22blk, *start_s44blk;
1648 unsigned int mismatch;
1650 start_s22blk = (blk+height*width);
1651 start_s44blk = (blk+height*width+(height>>1)*(width>>1));
1653 if (pict_struct==FRAME_PICTURE)
1655 nextfieldline = width;
1659 nextfieldline = 2*width;
1668 for( i = 0; i < nextfieldline/2; ++i )
1670 mismatch |= ((p[0] + p[1] + p[nextfieldline] + p[nextfieldline+1])>>2) != *pb;
1678 printf( "Mismatch detected check %s for buffer %08x\n", label, blk );
1685 Append fast motion estimation data to original luminance
1686 data. N.b. memory allocation for luminance data allows space
1687 for this information...
1690 void fast_motion_data(uint8_t *blk
, int picture_struct
)
1695 uint8_t *start_s22blk
, *start_s44blk
;
1696 uint16_t *start_rowblk
, *start_colblk
;
1699 #ifdef TEST_RCSEARCH
1700 uint16_t *pc
, *pr
,*p
;
1703 int down16
= width
*16;
1705 uint16_t rowsums
[2048];
1706 uint16_t colsums
[2048]; /* TODO: BUG: should resize with width */
1710 /* In an interlaced field the "next" line is 2 width's down
1711 rather than 1 width down */
1713 if (picture_struct
==FRAME_PICTURE
)
1715 nextfieldline
= width
;
1719 nextfieldline
= 2*width
;
1722 start_s22blk
= blk
+fsubsample_offset
;
1723 start_s44blk
= blk
+qsubsample_offset
;
1724 start_rowblk
= (uint16_t *)blk
+rowsums_offset
;
1725 start_colblk
= (uint16_t *)blk
+colsums_offset
;
1727 nb
= (blk
+nextfieldline
);
1728 /* Sneaky stuff here... we can do lines in both fields at once */
1729 pb
= (uint8_t *) start_s22blk
;
1731 while( nb
< start_s22blk
)
1733 for( i
= 0; i
< nextfieldline
/4; ++i
) /* We're doing 4 pels horizontally at once */
1735 /* TODO: A.Stevens this has to be the most word-length dependent
1736 code in the world. Better than MMX assembler though I guess... */
1737 pb
[0] = (b
[0]+b
[1]+nb
[0]+nb
[1])>>2;
1738 pb
[1] = (b
[2]+b
[3]+nb
[2]+nb
[3])>>2;
1744 nb
= b
+ nextfieldline
;
1748 /* Now create the 4*4 sub-sampled data from the 2*2
1749 N.b. the 2*2 sub-sampled motion data preserves the interlace structure of the
1750 original. Albeit half as many lines and pixels...
1753 nextfieldline
= nextfieldline
>> 1;
1757 nb
= (start_s22blk
+nextfieldline
);
1759 while( nb
< start_s44blk
)
1761 for( i
= 0; i
< nextfieldline
/4; ++i
)
1763 /* TODO: BRITTLE: A.Stevens - this only works for uint8_t = uint8_t */
1764 qb
[0] = (b
[0]+b
[1]+nb
[0]+nb
[1])>>2;
1765 qb
[1] = (b
[2]+b
[3]+nb
[2]+nb
[3])>>2;
1771 nb
= b
+ nextfieldline
;
1774 #ifdef TEST_RCSEARCH
1775 /* TODO: BUG: THIS CODE DOES NOT YET ALLOW FOR INTERLACED FIELDS.... */
1778 Initial row sums....
1781 for(j
= 0; j
< height
; ++j
)
1784 for( i
= 0; i
< 16; ++ i
)
1788 rowsums
[j
] = rowsum
;
1795 for( i
= 0; i
< width
; ++i
)
1800 for( j
= 0; j
< 16; ++j
)
1802 for( i
= 0; i
< width
; ++i
)
1809 /* Now fill in the row/column sum tables...
1810 Note: to allow efficient construction of sum/col differences for a
1811 given position row sums are held in a *column major* array
1812 (changing y co-ordinate makes for small index changes)
1813 the col sums are held in a *row major* array
1818 for(j
= 0; j
<32; ++j
)
1821 rowsum
= rowsums
[j
];
1822 for( i
= 0; i
< width
-16; ++i
)
1826 colsums
[i
] = (colsums
[i
] + pb
[down16
] )-pb
[0];
1827 rowsum
= (rowsum
+ pb
[16]) - pb
[0];
1831 pb
+= 16; /* move pb on to next row... rememember we only did width-16! */
1838 static int dist22( uint8_t *s22blk1
, uint8_t *s22blk2
,int flx
,int fh
)
1840 uint8_t *p1
= s22blk1
;
1841 uint8_t *p2
= s22blk2
;
1845 for( j
= 0; j
< fh
; ++j
)
1848 #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff)
1849 pipestep(0); pipestep(1);
1850 pipestep(2); pipestep(3);
1851 pipestep(4); pipestep(5);
1852 pipestep(6); pipestep(7);
1864 Sum absolute differences for 4*4 sub-sampled data.
1866 TODO: currently assumes only 16*16 or 16*8 motion compensation will be used...
1867 I.e. 4*4 or 4*2 sub-sampled blocks will be compared.
1871 static int dist44( uint8_t *s44blk1
, uint8_t *s44blk2
,int qlx
,int qh
)
1873 register uint8_t *p1
= s44blk1
;
1874 register uint8_t *p2
= s44blk2
;
1878 /* #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff) */
1879 #define pipestep(o) diff = p1[o]-p2[o]; s += diff < 0 ? -diff : diff;
1880 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1883 p1
+= qlx
; p2
+= qlx
;
1884 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1887 p1
+= qlx
; p2
+= qlx
;
1888 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1889 p1
+= qlx
; p2
+= qlx
;
1890 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1899 * total squared difference between two (8*h) blocks of 2*2 sub-sampled pels
1900 * blk1,blk2: addresses of top left pels of both blocks
1901 * lx: distance (in bytes) of vertically adjacent pels
1902 * h: height of block (usually 8 or 16)
1905 static int dist2_22(uint8_t *blk1
, uint8_t *blk2
, int lx
, int h
)
1907 uint8_t *p1
= blk1
, *p2
= blk2
;
1923 /* total squared difference between bidirection prediction of (8*h)
1924 * blocks of 2*2 sub-sampled pels and reference
1925 * blk1f, blk1b,blk2: addresses of top left
1927 * lx: distance (in bytes) of vertically adjacent
1929 * h: height of block (usually 4 or 8)
1932 static int bdist2_22(uint8_t *blk1f
, uint8_t *blk1b
, uint8_t *blk2
,
1935 uint8_t *p1f
= blk1f
,*p1b
= blk1b
,*p2
= blk2
;
1942 v
= ((p1f
[i
]+p1b
[i
]+1)>>1) - p2
[i
];
1953 * total squared difference between two (16*h) blocks
1954 * including optional half pel interpolation of blk1 (hx,hy)
1955 * blk1,blk2: addresses of top left pels of both blocks
1956 * lx: distance (in bytes) of vertically adjacent pels
1957 * hx,hy: flags for horizontal and/or vertical interpolation
1958 * h: height of block (usually 8 or 16)
1962 static int dist2(blk1
,blk2
,lx
,hx
,hy
,h
)
1963 uint8_t *blk1
,*blk2
;
1966 uint8_t *p1
,*p1a
,*p2
;
1976 for (i
=0; i
<16; i
++)
1987 for (i
=0; i
<16; i
++)
1989 v
= ((unsigned int)(p1
[i
]+p1
[i
+1]+1)>>1) - p2
[i
];
2000 for (i
=0; i
<16; i
++)
2002 v
= ((unsigned int)(p1
[i
]+p1a
[i
]+1)>>1) - p2
[i
];
2010 else /* if (hx && hy) */
2015 for (i
=0; i
<16; i
++)
2017 v
= ((unsigned int)(p1
[i
]+p1
[i
+1]+p1a
[i
]+p1a
[i
+1]+2)>>2) - p2
[i
];
2031 * absolute difference error between a (16*h) block and a bidirectional
2034 * p2: address of top left pel of block
2035 * pf,hxf,hyf: address and half pel flags of forward ref. block
2036 * pb,hxb,hyb: address and half pel flags of backward ref. block
2037 * h: height of block
2038 * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
2042 static int bdist1(pf
,pb
,p2
,lx
,hxf
,hyf
,hxb
,hyb
,h
)
2043 uint8_t *pf
,*pb
,*p2
;
2044 int lx
,hxf
,hyf
,hxb
,hyb
,h
;
2046 uint8_t *pfa
,*pfb
,*pfc
,*pba
,*pbb
,*pbc
;
2062 for (i
=0; i
<16; i
++)
2064 v
= ((((unsigned int)(*pf
++ + *pfa
++ + *pfb
++ + *pfc
++ + 2)>>2) +
2065 ((unsigned int)(*pb
++ + *pba
++ + *pbb
++ + *pbc
++ + 2)>>2) + 1)>>1)
2084 * squared error between a (16*h) block and a bidirectional
2087 * p2: address of top left pel of block
2088 * pf,hxf,hyf: address and half pel flags of forward ref. block
2089 * pb,hxb,hyb: address and half pel flags of backward ref. block
2090 * h: height of block
2091 * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
2095 static int bdist2(pf
,pb
,p2
,lx
,hxf
,hyf
,hxb
,hyb
,h
)
2096 uint8_t *pf
,*pb
,*p2
;
2097 int lx
,hxf
,hyf
,hxb
,hyb
,h
;
2099 uint8_t *pfa
,*pfb
,*pfc
,*pba
,*pbb
,*pbc
;
2115 for (i
=0; i
<16; i
++)
2117 v
= ((((unsigned int)(*pf
++ + *pfa
++ + *pfb
++ + *pfc
++ + 2)>>2) +
2118 ((unsigned int)(*pb
++ + *pba
++ + *pbb
++ + *pbc
++ + 2)>>2) + 1)>>1)
2138 * variance of a (size*size) block, multiplied by 256
2139 * p: address of top left pel of block
2140 * lx: distance (in bytes) of vertically adjacent pels
2142 static int variance(uint8_t *p
, int size
, int lx
)
2145 unsigned int v
,s
,s2
;
2149 for (j
=0; j
<size
; j
++)
2151 for (i
=0; i
<size
; i
++)
2159 return s2
- (s
*s
)/(size
*size
);
2163 Compute the variance of the residual of uni-directionally motion
2167 static int unidir_pred_var( const mb_motion_s
*motion
,
2172 return (*pdist2
)(motion
->blk
, mb
, lx
, motion
->hx
, motion
->hy
, h
);
2177 Compute the variance of the residual of bi-directionally motion
2181 static int bidir_pred_var( const mb_motion_s
*motion_f
,
2182 const mb_motion_s
*motion_b
,
2186 return (*pbdist2
)( motion_f
->blk
, motion_b
->blk
,
2188 motion_f
->hx
, motion_f
->hy
,
2189 motion_b
->hx
, motion_b
->hy
,
2194 Compute SAD for bi-directionally motion compensated blocks...
2197 static int bidir_pred_sad( const mb_motion_s
*motion_f
,
2198 const mb_motion_s
*motion_b
,
2202 return (*pbdist1
)(motion_f
->blk
, motion_b
->blk
,
2204 motion_f
->hx
, motion_f
->hy
,
2205 motion_b
->hx
, motion_b
->hy
,
2209 static void frame_ME(motion_engine_t
*engine
,
2210 pict_data_s
*picture
,
2216 mb_motion_s framef_mc
;
2217 mb_motion_s frameb_mc
;
2218 mb_motion_s dualpf_mc
;
2219 mb_motion_s topfldf_mc
;
2220 mb_motion_s botfldf_mc
;
2221 mb_motion_s topfldb_mc
;
2222 mb_motion_s botfldb_mc
;
2225 int vmc
,vmcf
,vmcr
,vmci
;
2226 int vmcfieldf
,vmcfieldr
,vmcfieldi
;
2227 subsampled_mb_s ssmb
;
2228 int imins
[2][2],jmins
[2][2];
2229 int imindmv
,jmindmv
,vmc_dp
;
2230 //printf("frame_ME 1\n");
2233 /* A.Stevens fast motion estimation data is appended to actual
2234 luminance information
2236 ssmb
.mb
= mc
->cur
[0] + mb_row_start
+ i
;
2237 ssmb
.umb
= (uint8_t*)(mc
->cur
[1] + (i
>>1) + (mb_row_start
>>2));
2238 ssmb
.vmb
= (uint8_t*)(mc
->cur
[2] + (i
>>1) + (mb_row_start
>>2));
2239 ssmb
.fmb
= (uint8_t*)(mc
->cur
[0] + fsubsample_offset
+
2240 ((i
>>1) + (mb_row_start
>>2)));
2241 ssmb
.qmb
= (uint8_t*)(mc
->cur
[0] + qsubsample_offset
+
2242 (i
>>2) + (mb_row_start
>>4));
2244 /* Compute variance MB as a measure of Intra-coding complexity
2245 We include chrominance information here, scaled to compensate
2246 for sub-sampling. Silly MPEG forcing chrom/lum to have same
2249 var
= variance(ssmb
.mb
,16,width
);
2251 //printf("motion %d\n", picture->pict_type);
2252 if (picture
->pict_type
==I_TYPE
)
2254 mbi
->mb_type
= MB_INTRA
;
2256 else if (picture
->pict_type
==P_TYPE
)
2258 /* For P pictures we take into account chrominance. This
2259 provides much better performance at scene changes */
2260 var
+= chrom_var_sum(&ssmb
,16,width
);
2262 if (picture
->frame_pred_dct
)
2264 fullsearch(engine
, mc
->oldorg
[0],mc
->oldref
[0],&ssmb
,
2265 width
,i
,j
,mc
->sxf
,mc
->syf
,16,width
,height
,
2267 framef_mc
.fieldoff
= 0;
2268 vmc
= framef_mc
.var
+
2269 unidir_chrom_var_sum( &framef_mc
, mc
->oldref
, &ssmb
, width
, 16 );
2270 mbi
->motion_type
= MC_FRAME
;
2274 //printf("frame_ME 2 %p %p\n", mc->oldorg[0], mc->oldref[0]);
2275 frame_estimate(engine
, mc
->oldorg
[0],
2287 //printf("frame_ME 3\n");
2288 vmcf
= framef_mc
.var
+
2289 unidir_chrom_var_sum( &framef_mc
, mc
->oldref
, &ssmb
, width
, 16 );
2292 unidir_chrom_var_sum( &topfldf_mc
, mc
->oldref
, &ssmb
, (width
<<1), 8 ) +
2294 unidir_chrom_var_sum( &botfldf_mc
, mc
->oldref
, &ssmb
, (width
<<1), 8 );
2295 /* DEBUG DP currently disabled... */
2298 // dpframe_estimate(engine, picture,mc->oldref[0],&ssmb,
2299 // i,j>>1,imins,jmins,
2301 // &imindmv,&jmindmv, &vmc_dp);
2304 /* NOTE: Typically M =3 so DP actually disabled... */
2305 /* select between dual prime, frame and field prediction */
2306 // if ( M==1 && vmc_dp<vmcf && vmc_dp<vmcfieldf)
2308 // mbi->motion_type = MC_DMV;
2309 // /* No chrominance squared difference measure yet.
2310 // Assume identical to luminance */
2311 // vmc = vmc_dp + vmc_dp;
2314 if ( vmcf
< vmcfieldf
)
2316 mbi
->motion_type
= MC_FRAME
;
2322 mbi
->motion_type
= MC_FIELD
;
2328 /* select between intra or non-intra coding:
2330 * selection is based on intra block variance (var) vs.
2331 * prediction error variance (vmc)
2333 * Used to be: blocks with small prediction error are always
2334 * coded non-intra even if variance is smaller (is this reasonable?
2336 * TODO: A.Stevens Jul 2000
2337 * The bbmpeg guys have found this to be *unreasonable*.
2338 * I'm not sure I buy their solution using vmc*2. It is probabably
2339 * the vmc>= 9*256 test that is suspect.
2344 if (vmc
>var
/*&& vmc>=(3*3)*16*16*2*/ )
2346 mbi
->mb_type
= MB_INTRA
;
2352 /* select between MC / No-MC
2354 * use No-MC if var(No-MC) <= 1.25*var(MC)
2355 * (i.e slightly biased towards No-MC)
2357 * blocks with small prediction error are always coded as No-MC
2358 * (requires no motion vectors, allows skipping)
2360 v0
= (*pdist2
)(mc
->oldref
[0]+i
+width
*j
,ssmb
.mb
,width
,0,0,16);
2366 mbi
->mb_type
= MB_FORWARD
;
2367 if (mbi
->motion_type
==MC_FRAME
)
2369 mbi
->MV
[0][0][0] = framef_mc
.pos
.x
- (i
<<1);
2370 mbi
->MV
[0][0][1] = framef_mc
.pos
.y
- (j
<<1);
2372 else if (mbi
->motion_type
==MC_DMV
)
2374 /* these are FRAME vectors */
2375 /* same parity vector */
2376 mbi
->MV
[0][0][0] = dualpf_mc
.pos
.x
- (i
<<1);
2377 mbi
->MV
[0][0][1] = (dualpf_mc
.pos
.y
<<1) - (j
<<1);
2379 /* opposite parity vector */
2380 mbi
->dmvector
[0] = imindmv
;
2381 mbi
->dmvector
[1] = jmindmv
;
2385 /* these are FRAME vectors */
2386 mbi
->MV
[0][0][0] = topfldf_mc
.pos
.x
- (i
<<1);
2387 mbi
->MV
[0][0][1] = (topfldf_mc
.pos
.y
<<1) - (j
<<1);
2388 mbi
->MV
[1][0][0] = botfldf_mc
.pos
.x
- (i
<<1);
2389 mbi
->MV
[1][0][1] = (botfldf_mc
.pos
.y
<<1) - (j
<<1);
2390 mbi
->mv_field_sel
[0][0] = topfldf_mc
.fieldsel
;
2391 mbi
->mv_field_sel
[1][0] = botfldf_mc
.fieldsel
;
2400 mbi
->motion_type
= MC_FRAME
;
2401 mbi
->MV
[0][0][0] = 0;
2402 mbi
->MV
[0][0][1] = 0;
2406 else /* if (pict_type==B_TYPE) */
2408 if (picture
->frame_pred_dct
)
2410 var
= variance(ssmb
.mb
,16,width
);
2412 fullsearch(engine
, mc
->oldorg
[0],mc
->oldref
[0],&ssmb
,
2413 width
,i
,j
,mc
->sxf
,mc
->syf
,
2417 framef_mc
.fieldoff
= 0;
2418 vmcf
= framef_mc
.var
;
2421 fullsearch(engine
, mc
->neworg
[0],mc
->newref
[0],&ssmb
,
2422 width
,i
,j
,mc
->sxb
,mc
->syb
,
2425 frameb_mc
.fieldoff
= 0;
2426 vmcr
= frameb_mc
.var
;
2428 /* interpolated (bidirectional) */
2430 vmci
= bidir_pred_var( &framef_mc
, &frameb_mc
, ssmb
.mb
, width
, 16 );
2434 /* select between forward/backward/interpolated prediction:
2435 * use the one with smallest mean sqaured prediction error
2437 if (vmcf
<=vmcr
&& vmcf
<=vmci
)
2440 mbi
->mb_type
= MB_FORWARD
;
2442 else if (vmcr
<=vmci
)
2445 mbi
->mb_type
= MB_BACKWARD
;
2450 mbi
->mb_type
= MB_FORWARD
|MB_BACKWARD
;
2453 mbi
->motion_type
= MC_FRAME
;
2457 /* forward prediction */
2458 frame_estimate(engine
, mc
->oldorg
[0],mc
->oldref
[0],&ssmb
,
2459 i
,j
,mc
->sxf
,mc
->syf
,
2466 /* backward prediction */
2467 frame_estimate(engine
, mc
->neworg
[0],mc
->newref
[0],&ssmb
,
2468 i
,j
,mc
->sxb
,mc
->syb
,
2474 vmcf
= framef_mc
.var
;
2475 vmcr
= frameb_mc
.var
;
2476 vmci
= bidir_pred_var( &framef_mc
, &frameb_mc
, ssmb
.mb
, width
, 16 );
2478 vmcfieldf
= topfldf_mc
.var
+ botfldf_mc
.var
;
2479 vmcfieldr
= topfldb_mc
.var
+ botfldb_mc
.var
;
2480 vmcfieldi
= bidir_pred_var( &topfldf_mc
, &topfldb_mc
, ssmb
.mb
,
2482 bidir_pred_var( &botfldf_mc
, &botfldb_mc
, ssmb
.mb
,
2485 /* select prediction type of minimum distance from the
2486 * six candidates (field/frame * forward/backward/interpolated)
2488 if (vmci
<vmcfieldi
&& vmci
<vmcf
&& vmci
<vmcfieldf
2489 && vmci
<vmcr
&& vmci
<vmcfieldr
)
2491 /* frame, interpolated */
2492 mbi
->mb_type
= MB_FORWARD
|MB_BACKWARD
;
2493 mbi
->motion_type
= MC_FRAME
;
2496 else if ( vmcfieldi
<vmcf
&& vmcfieldi
<vmcfieldf
2497 && vmcfieldi
<vmcr
&& vmcfieldi
<vmcfieldr
)
2499 /* field, interpolated */
2500 mbi
->mb_type
= MB_FORWARD
|MB_BACKWARD
;
2501 mbi
->motion_type
= MC_FIELD
;
2504 else if (vmcf
<vmcfieldf
&& vmcf
<vmcr
&& vmcf
<vmcfieldr
)
2506 /* frame, forward */
2507 mbi
->mb_type
= MB_FORWARD
;
2508 mbi
->motion_type
= MC_FRAME
;
2512 else if ( vmcfieldf
<vmcr
&& vmcfieldf
<vmcfieldr
)
2514 /* field, forward */
2515 mbi
->mb_type
= MB_FORWARD
;
2516 mbi
->motion_type
= MC_FIELD
;
2519 else if (vmcr
<vmcfieldr
)
2521 /* frame, backward */
2522 mbi
->mb_type
= MB_BACKWARD
;
2523 mbi
->motion_type
= MC_FRAME
;
2528 /* field, backward */
2529 mbi
->mb_type
= MB_BACKWARD
;
2530 mbi
->motion_type
= MC_FIELD
;
2536 /* select between intra or non-intra coding:
2538 * selection is based on intra block variance (var) vs.
2539 * prediction error variance (vmc)
2541 * Used to be: blocks with small prediction error are always
2542 * coded non-intra even if variance is smaller (is this reasonable?
2544 * TODO: A.Stevens Jul 2000
2545 * The bbmpeg guys have found this to be *unreasonable*.
2546 * I'm not sure I buy their solution using vmc*2 in the first comparison.
2547 * It is probabably the vmc>= 9*256 test that is suspect.
2550 if (vmc
>var
&& vmc
>=9*256)
2551 mbi
->mb_type
= MB_INTRA
;
2555 if (mbi
->motion_type
==MC_FRAME
)
2558 mbi
->MV
[0][0][0] = framef_mc
.pos
.x
- (i
<<1);
2559 mbi
->MV
[0][0][1] = framef_mc
.pos
.y
- (j
<<1);
2561 mbi
->MV
[0][1][0] = frameb_mc
.pos
.x
- (i
<<1);
2562 mbi
->MV
[0][1][1] = frameb_mc
.pos
.y
- (j
<<1);
2566 /* these are FRAME vectors */
2568 mbi
->MV
[0][0][0] = topfldf_mc
.pos
.x
- (i
<<1);
2569 mbi
->MV
[0][0][1] = (topfldf_mc
.pos
.y
<<1) - (j
<<1);
2570 mbi
->MV
[1][0][0] = botfldf_mc
.pos
.x
- (i
<<1);
2571 mbi
->MV
[1][0][1] = (botfldf_mc
.pos
.y
<<1) - (j
<<1);
2572 mbi
->mv_field_sel
[0][0] = topfldf_mc
.fieldsel
;
2573 mbi
->mv_field_sel
[1][0] = botfldf_mc
.fieldsel
;
2575 mbi
->MV
[0][1][0] = topfldb_mc
.pos
.x
- (i
<<1);
2576 mbi
->MV
[0][1][1] = (topfldb_mc
.pos
.y
<<1) - (j
<<1);
2577 mbi
->MV
[1][1][0] = botfldb_mc
.pos
.x
- (i
<<1);
2578 mbi
->MV
[1][1][1] = (botfldb_mc
.pos
.y
<<1) - (j
<<1);
2579 mbi
->mv_field_sel
[0][1] = topfldb_mc
.fieldsel
;
2580 mbi
->mv_field_sel
[1][1] = botfldb_mc
.fieldsel
;
2589 * motion estimation for field pictures
2591 * mbi: pointer to macroblock info structure
2592 * secondfield: indicates second field of a frame (in P fields this means
2593 * that reference field of opposite parity is in curref instead
2595 * ipflag: indicates a P type field which is the second field of a frame
2596 * in which the first field is I type (this restricts predictions
2597 * to be based only on the opposite parity (=I) field)
2601 * mb_type: 0, MB_INTRA, MB_FORWARD, MB_BACKWARD, MB_FORWARD|MB_BACKWARD
2602 * MV[][][]: motion vectors (field format)
2603 * mv_field_sel: top/bottom field
2604 * motion_type: MC_FIELD, MC_16X8
2606 * uses global vars: pict_type, pict_struct
2608 static void field_ME(motion_engine_t
*engine
,
2609 pict_data_s
*picture
,
2614 int secondfield
, int ipflag
)
2617 uint8_t *toporg
, *topref
, *botorg
, *botref
;
2618 subsampled_mb_s ssmb
;
2619 mb_motion_s fields_mc
, dualp_mc
;
2620 mb_motion_s fieldf_mc
, fieldb_mc
;
2621 mb_motion_s field8uf_mc
, field8lf_mc
;
2622 mb_motion_s field8ub_mc
, field8lb_mc
;
2623 int var
,vmc
,v0
,dmc
,dmcfieldi
,dmcfield
,dmcfieldf
,dmcfieldr
,dmc8i
;
2630 /* Fast motion data sits at the end of the luminance buffer */
2631 ssmb
.mb
= mc
->cur
[0] + i
+ w2
*j
;
2632 ssmb
.umb
= mc
->cur
[1] + ((i
>>1)+(w2
>>1)*(j
>>1));
2633 ssmb
.vmb
= mc
->cur
[2] + ((i
>>1)+(w2
>>1)*(j
>>1));
2634 ssmb
.fmb
= mc
->cur
[0] + fsubsample_offset
+((i
>>1)+(w2
>>1)*(j
>>1));
2635 ssmb
.qmb
= mc
->cur
[0] + qsubsample_offset
+ (i
>>2)+(w2
>>2)*(j
>>2);
2637 if (picture
->pict_struct
==BOTTOM_FIELD
)
2640 ssmb
.umb
+= (width
>> 1);
2641 ssmb
.vmb
+= (width
>> 1);
2642 ssmb
.fmb
+= (width
>> 1);
2643 ssmb
.qmb
+= (width
>> 2);
2646 var
= variance(ssmb
.mb
,16,w2
) +
2647 ( variance(ssmb
.umb
,8,(width
>>1)) + variance(ssmb
.vmb
,8,(width
>>1)))*2;
2649 if (picture
->pict_type
==I_TYPE
)
2650 mbi
->mb_type
= MB_INTRA
;
2651 else if (picture
->pict_type
==P_TYPE
)
2653 toporg
= mc
->oldorg
[0];
2654 topref
= mc
->oldref
[0];
2655 botorg
= mc
->oldorg
[0] + width
;
2656 botref
= mc
->oldref
[0] + width
;
2660 /* opposite parity field is in same frame */
2661 if (picture
->pict_struct
==TOP_FIELD
)
2663 /* current is top field */
2664 botorg
= mc
->cur
[0] + width
;
2665 botref
= mc
->curref
[0] + width
;
2669 /* current is bottom field */
2670 toporg
= mc
->cur
[0];
2671 topref
= mc
->curref
[0];
2675 field_estimate(engine
,
2677 toporg
,topref
,botorg
,botref
,&ssmb
,
2678 i
,j
,mc
->sxf
,mc
->syf
,ipflag
,
2683 dmcfield
= fieldf_mc
.sad
;
2684 dmc8f
= field8uf_mc
.sad
+ field8lf_mc
.sad
;
2686 // if (M==1 && !ipflag) /* generic condition which permits Dual Prime */
2688 // dpfield_estimate(engine,
2690 // topref,botref,ssmb.mb,i,j,imins,jmins,
2693 // dmc_dp = dualp_mc.sad;
2696 // /* select between dual prime, field and 16x8 prediction */
2697 // if (M==1 && !ipflag && dmc_dp<dmc8f && dmc_dp<dmcfield)
2699 // /* Dual Prime prediction */
2700 // mbi->motion_type = MC_DMV;
2701 // dmc = dualp_mc.sad;
2708 /* 16x8 prediction */
2709 mbi
->motion_type
= MC_16X8
;
2710 /* upper and lower half blocks */
2711 vmc
= unidir_pred_var( &field8uf_mc
, ssmb
.mb
, w2
, 8);
2712 vmc
+= unidir_pred_var( &field8lf_mc
, ssmb
.mb
, w2
, 8);
2716 /* field prediction */
2717 mbi
->motion_type
= MC_FIELD
;
2718 vmc
= unidir_pred_var( &fieldf_mc
, ssmb
.mb
, w2
, 16 );
2721 /* select between intra and non-intra coding */
2723 if ( vmc
>var
&& vmc
>=9*256)
2724 mbi
->mb_type
= MB_INTRA
;
2727 /* zero MV field prediction from same parity ref. field
2728 * (not allowed if ipflag is set)
2731 v0
= (*pdist2
)(((picture
->pict_struct
==BOTTOM_FIELD
)?botref
:topref
) + i
+ w2
*j
,
2733 if (ipflag
|| (4*v0
>5*vmc
&& v0
>=9*256))
2736 mbi
->mb_type
= MB_FORWARD
;
2737 if (mbi
->motion_type
==MC_FIELD
)
2739 mbi
->MV
[0][0][0] = fieldf_mc
.pos
.x
- (i
<<1);
2740 mbi
->MV
[0][0][1] = fieldf_mc
.pos
.y
- (j
<<1);
2741 mbi
->mv_field_sel
[0][0] = fieldf_mc
.fieldsel
;
2743 else if (mbi
->motion_type
==MC_DMV
)
2745 /* same parity vector */
2746 mbi
->MV
[0][0][0] = imins
- (i
<<1);
2747 mbi
->MV
[0][0][1] = jmins
- (j
<<1);
2749 /* opposite parity vector */
2750 mbi
->dmvector
[0] = dualp_mc
.pos
.x
;
2751 mbi
->dmvector
[1] = dualp_mc
.pos
.y
;
2755 mbi
->MV
[0][0][0] = field8uf_mc
.pos
.x
- (i
<<1);
2756 mbi
->MV
[0][0][1] = field8uf_mc
.pos
.y
- (j
<<1);
2757 mbi
->MV
[1][0][0] = field8lf_mc
.pos
.x
- (i
<<1);
2758 mbi
->MV
[1][0][1] = field8lf_mc
.pos
.y
- ((j
+8)<<1);
2759 mbi
->mv_field_sel
[0][0] = field8uf_mc
.fieldsel
;
2760 mbi
->mv_field_sel
[1][0] = field8lf_mc
.fieldsel
;
2768 mbi
->motion_type
= MC_FIELD
;
2769 mbi
->MV
[0][0][0] = 0;
2770 mbi
->MV
[0][0][1] = 0;
2771 mbi
->mv_field_sel
[0][0] = (picture
->pict_struct
==BOTTOM_FIELD
);
2775 else /* if (pict_type==B_TYPE) */
2777 /* forward prediction */
2778 field_estimate(engine
,
2780 mc
->oldorg
[0],mc
->oldref
[0],
2781 mc
->oldorg
[0]+width
,mc
->oldref
[0]+width
,&ssmb
,
2782 i
,j
,mc
->sxf
,mc
->syf
,0,
2787 dmcfieldf
= fieldf_mc
.sad
;
2788 dmc8f
= field8uf_mc
.sad
+ field8lf_mc
.sad
;
2790 /* backward prediction */
2791 field_estimate(engine
,
2793 mc
->neworg
[0],mc
->newref
[0],mc
->neworg
[0]+width
,mc
->newref
[0]+width
,
2795 i
,j
,mc
->sxb
,mc
->syb
,0,
2800 dmcfieldr
= fieldb_mc
.sad
;
2801 dmc8r
= field8ub_mc
.sad
+ field8lb_mc
.sad
;
2803 /* calculate distances for bidirectional prediction */
2805 dmcfieldi
= bidir_pred_sad( &fieldf_mc
, &fieldb_mc
, ssmb
.mb
, w2
, 16);
2807 /* 16x8 upper and lower half blocks */
2808 dmc8i
= bidir_pred_sad( &field8uf_mc
, &field8ub_mc
, ssmb
.mb
, w2
, 16 );
2809 dmc8i
+= bidir_pred_sad( &field8lf_mc
, &field8lb_mc
, ssmb
.mb
, w2
, 16 );
2811 /* select prediction type of minimum distance */
2812 if (dmcfieldi
<dmc8i
&& dmcfieldi
<dmcfieldf
&& dmcfieldi
<dmc8f
2813 && dmcfieldi
<dmcfieldr
&& dmcfieldi
<dmc8r
)
2815 /* field, interpolated */
2816 mbi
->mb_type
= MB_FORWARD
|MB_BACKWARD
;
2817 mbi
->motion_type
= MC_FIELD
;
2818 vmc
= bidir_pred_var( &fieldf_mc
, &fieldb_mc
, ssmb
.mb
, w2
, 16);
2820 else if (dmc8i
<dmcfieldf
&& dmc8i
<dmc8f
2821 && dmc8i
<dmcfieldr
&& dmc8i
<dmc8r
)
2823 /* 16x8, interpolated */
2824 mbi
->mb_type
= MB_FORWARD
|MB_BACKWARD
;
2825 mbi
->motion_type
= MC_16X8
;
2827 /* upper and lower half blocks */
2828 vmc
= bidir_pred_var( &field8uf_mc
, &field8ub_mc
, ssmb
.mb
, w2
, 8);
2829 vmc
+= bidir_pred_var( &field8lf_mc
, &field8lb_mc
, ssmb
.mb
, w2
, 8);
2831 else if (dmcfieldf
<dmc8f
&& dmcfieldf
<dmcfieldr
&& dmcfieldf
<dmc8r
)
2833 /* field, forward */
2834 mbi
->mb_type
= MB_FORWARD
;
2835 mbi
->motion_type
= MC_FIELD
;
2836 vmc
= unidir_pred_var( &fieldf_mc
, ssmb
.mb
, w2
, 16);
2838 else if (dmc8f
<dmcfieldr
&& dmc8f
<dmc8r
)
2841 mbi
->mb_type
= MB_FORWARD
;
2842 mbi
->motion_type
= MC_16X8
;
2844 /* upper and lower half blocks */
2845 vmc
= unidir_pred_var( &field8uf_mc
, ssmb
.mb
, w2
, 8);
2846 vmc
+= unidir_pred_var( &field8lf_mc
, ssmb
.mb
, w2
, 8);
2848 else if (dmcfieldr
<dmc8r
)
2850 /* field, backward */
2851 mbi
->mb_type
= MB_BACKWARD
;
2852 mbi
->motion_type
= MC_FIELD
;
2853 vmc
= unidir_pred_var( &fieldb_mc
, ssmb
.mb
, w2
, 16 );
2857 /* 16x8, backward */
2858 mbi
->mb_type
= MB_BACKWARD
;
2859 mbi
->motion_type
= MC_16X8
;
2861 /* upper and lower half blocks */
2862 vmc
= unidir_pred_var( &field8ub_mc
, ssmb
.mb
, w2
, 8);
2863 vmc
+= unidir_pred_var( &field8lb_mc
, ssmb
.mb
, w2
, 8);
2867 /* select between intra and non-intra coding */
2868 if (vmc
>var
&& vmc
>=9*256)
2869 mbi
->mb_type
= MB_INTRA
;
2873 if (mbi
->motion_type
==MC_FIELD
)
2876 mbi
->MV
[0][0][0] = fieldf_mc
.pos
.x
- (i
<<1);
2877 mbi
->MV
[0][0][1] = fieldf_mc
.pos
.y
- (j
<<1);
2878 mbi
->mv_field_sel
[0][0] = fieldf_mc
.fieldsel
;
2880 mbi
->MV
[0][1][0] = fieldb_mc
.pos
.x
- (i
<<1);
2881 mbi
->MV
[0][1][1] = fieldb_mc
.pos
.y
- (j
<<1);
2882 mbi
->mv_field_sel
[0][1] = fieldb_mc
.fieldsel
;
2887 mbi
->MV
[0][0][0] = field8uf_mc
.pos
.x
- (i
<<1);
2888 mbi
->MV
[0][0][1] = field8uf_mc
.pos
.y
- (j
<<1);
2889 mbi
->mv_field_sel
[0][0] = field8uf_mc
.fieldsel
;
2890 mbi
->MV
[1][0][0] = field8lf_mc
.pos
.x
- (i
<<1);
2891 mbi
->MV
[1][0][1] = field8lf_mc
.pos
.y
- ((j
+8)<<1);
2892 mbi
->mv_field_sel
[1][0] = field8lf_mc
.fieldsel
;
2894 mbi
->MV
[0][1][0] = field8ub_mc
.pos
.x
- (i
<<1);
2895 mbi
->MV
[0][1][1] = field8ub_mc
.pos
.y
- (j
<<1);
2896 mbi
->mv_field_sel
[0][1] = field8ub_mc
.fieldsel
;
2897 mbi
->MV
[1][1][0] = field8lb_mc
.pos
.x
- (i
<<1);
2898 mbi
->MV
[1][1][1] = field8lb_mc
.pos
.y
- ((j
+8)<<1);
2899 mbi
->mv_field_sel
[1][1] = field8lb_mc
.fieldsel
;
2909 Initialise motion compensation - currently purely selection of which
2910 versions of the various low level computation routines to use
2916 int cpucap
= cpu_accel();
2918 if( cpucap
== 0 ) /* No MMX/SSE etc support available */
2922 pdist1_00
= dist1_00
;
2923 pdist1_01
= dist1_01
;
2924 pdist1_10
= dist1_10
;
2925 pdist1_11
= dist1_11
;
2929 pdist2_22
= dist2_22
;
2930 pbdist2_22
= bdist2_22
;
2931 pfind_best_one_pel
= find_best_one_pel
;
2932 pbuild_sub22_mcomps
= build_sub22_mcomps
;
2935 else if(cpucap
& ACCEL_X86_MMXEXT
) /* AMD MMX or SSE... */
2937 if(verbose
) fprintf( stderr
, "SETTING EXTENDED MMX for MOTION!\n");
2938 pdist22
= dist22_mmxe
;
2939 pdist44
= dist44_mmxe
;
2940 pdist1_00
= dist1_00_mmxe
;
2941 pdist1_01
= dist1_01_mmxe
;
2942 pdist1_10
= dist1_10_mmxe
;
2943 pdist1_11
= dist1_11_mmxe
;
2944 pbdist1
= bdist1_mmx
;
2946 pbdist2
= bdist2_mmx
;
2947 pdist2_22
= dist2_22_mmx
;
2948 pbdist2_22
= bdist2_22_mmx
;
2949 pfind_best_one_pel
= find_best_one_pel_mmxe
;
2950 pbuild_sub22_mcomps
= build_sub22_mcomps_mmxe
;
2951 pmblock_sub44_dists
= mblock_sub44_dists_mmxe
;
2954 else if(cpucap
& ACCEL_X86_MMX
) /* Ordinary MMX CPU */
2956 if(verbose
) fprintf( stderr
, "SETTING MMX for MOTION!\n");
2957 pdist22
= dist22_mmx
;
2958 pdist44
= dist44_mmx
;
2959 pdist1_00
= dist1_00_mmx
;
2960 pdist1_01
= dist1_01_mmx
;
2961 pdist1_10
= dist1_10_mmx
;
2962 pdist1_11
= dist1_11_mmx
;
2963 pbdist1
= bdist1_mmx
;
2965 pbdist2
= bdist2_mmx
;
2966 pdist2_22
= dist2_22_mmx
;
2967 pbdist2_22
= bdist2_22_mmx
;
2968 pfind_best_one_pel
= find_best_one_pel
;
2969 pbuild_sub22_mcomps
= build_sub22_mcomps
;
2970 pmblock_sub44_dists
= mblock_sub44_dists_mmx
;
2976 void motion_engine_loop(motion_engine_t
*engine
)
2978 while(!engine
->done
)
2980 pthread_mutex_lock(&(engine
->input_lock
));
2984 pict_data_s
*picture
= engine
->pict_data
;
2985 motion_comp_s
*mc_data
= engine
->motion_comp
;
2986 int secondfield
= engine
->secondfield
;
2987 int ipflag
= engine
->ipflag
;
2988 mbinfo_s
*mbi
= picture
->mbinfo
+ (engine
->start_row
/ 16) * (width
/ 16);
2990 int mb_row_incr
; /* Offset increment to go down 1 row of mb's */
2993 if (picture
->pict_struct
== FRAME_PICTURE
)
2995 mb_row_incr
= 16*width
;
2996 mb_row_start
= engine
->start_row
/ 16 * mb_row_incr
;
2997 /* loop through all macroblocks of a frame picture */
2998 for (j
=engine
->start_row
; j
< engine
->end_row
; j
+=16)
3000 for (i
=0; i
<width
; i
+=16)
3002 frame_ME(engine
, picture
, mc_data
, mb_row_start
, i
,j
, mbi
);
3005 mb_row_start
+= mb_row_incr
;
3010 mb_row_incr
= (16 * 2) * width
;
3011 mb_row_start
= engine
->start_row
/ 16 * mb_row_incr
;
3012 /* loop through all macroblocks of a field picture */
3013 for (j
=engine
->start_row
; j
< engine
->end_row
; j
+=16)
3015 for (i
=0; i
<width
; i
+=16)
3017 field_ME(engine
, picture
, mc_data
, mb_row_start
, i
,j
,
3018 mbi
,secondfield
,ipflag
);
3021 mb_row_start
+= mb_row_incr
;
3026 pthread_mutex_unlock(&(engine
->output_lock
));
3031 void motion_estimation(pict_data_s
*picture
,
3032 motion_comp_s
*mc_data
,
3033 int secondfield
, int ipflag
)
3037 for(i
= 0; i
< processors
; i
++)
3039 motion_engines
[i
].motion_comp
= mc_data
;
3041 motion_engines
[i
].pict_data
= picture
;
3043 motion_engines
[i
].secondfield
= secondfield
;
3044 motion_engines
[i
].ipflag
= ipflag
;
3045 pthread_mutex_unlock(&(motion_engines
[i
].input_lock
));
3048 /* Wait for completion */
3049 for(i
= 0; i
< processors
; i
++)
3051 pthread_mutex_lock(&(motion_engines
[i
].output_lock
));
3055 void start_motion_engines()
3058 int rows_per_processor
= (int)((float)height2
/ 16 / processors
+ 0.5);
3059 int current_row
= 0;
3060 pthread_attr_t attr
;
3061 pthread_mutexattr_t mutex_attr
;
3063 pthread_mutexattr_init(&mutex_attr
);
3064 pthread_attr_init(&attr
);
3065 motion_engines
= calloc(1, sizeof(motion_engine_t
) * processors
);
3066 for(i
= 0; i
< processors
; i
++)
3068 motion_engines
[i
].start_row
= current_row
* i
* 16;
3069 current_row
+= rows_per_processor
;
3070 if(current_row
> height2
/ 16) current_row
= height2
/ 16;
3071 motion_engines
[i
].end_row
= current_row
* 16;
3072 pthread_mutex_init(&(motion_engines
[i
].input_lock
), &mutex_attr
);
3073 pthread_mutex_lock(&(motion_engines
[i
].input_lock
));
3074 pthread_mutex_init(&(motion_engines
[i
].output_lock
), &mutex_attr
);
3075 pthread_mutex_lock(&(motion_engines
[i
].output_lock
));
3076 motion_engines
[i
].done
= 0;
3077 pthread_create(&(motion_engines
[i
].tid
),
3079 (void*)motion_engine_loop
,
3080 &motion_engines
[i
]);
3084 void stop_motion_engines()
3087 for(i
= 0; i
< processors
; i
++)
3089 motion_engines
[i
].done
= 1;
3090 pthread_mutex_unlock(&(motion_engines
[i
].input_lock
));
3091 pthread_join(motion_engines
[i
].tid
, 0);
3092 pthread_mutex_destroy(&(motion_engines
[i
].input_lock
));
3093 pthread_mutex_destroy(&(motion_engines
[i
].output_lock
));
3095 free(motion_engines
);