quicktime/ffmpeg/libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18  *
  19  */
  20
  21 /**
  22  * @file h264.c
  23  * H.264 / AVC / MPEG4 part10 codec.
  24  * @author Michael Niedermayer <michaelni@gmx.at>
  25  */
  26
  27 #include "common.h"
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264data.h"
  32 #include "golomb.h"
  33
  34 #include "cabac.h"
  35
  36 #undef NDEBUG
  37 #include <assert.h>
  38
  /*
   * Rename two identifiers for the remainder of this file: every later use of
   * `interlaced_dct` or `mb_intra` expands to the long name on the right.
   * NOTE(review): presumably these are MpegEncContext members that must not be
   * relied on by the H.264 code, so an accidental use is easy to spot - confirm.
   */
  39 #define interlaced_dct interlaced_dct_is_a_bad_name
  40 #define mb_intra mb_intra_isnt_initalized_see_mb_type
  41
     /* NOTE(review): presumably the block indices used for the luma / chroma DC
      * coefficient blocks; their users are outside this chunk - confirm. */
  42 #define LUMA_DC_BLOCK_INDEX   25
  43 #define CHROMA_DC_BLOCK_INDEX 26
  44
     /* NOTE(review): presumably the lookup widths (in bits) of the static VLC
      * tables declared further down; the code that builds and reads those tables
      * is outside this chunk - confirm. */
  45 #define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
  46 #define COEFF_TOKEN_VLC_BITS           8
  47 #define TOTAL_ZEROS_VLC_BITS           9
  48 #define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
  49 #define RUN_VLC_BITS                   3
  50 #define RUN7_VLC_BITS                  6
  51
     /* Number of elements of H264Context.sps_buffer[] and H264Context.pps_buffer[]. */
  52 #define MAX_SPS_COUNT 32
  53 #define MAX_PPS_COUNT 256
  54
     /* Number of elements of H264Context.mmco[]. */
  55 #define MAX_MMCO_COUNT 66
  56
  57 /**
  58  * Sequence parameter set
  59  */
  60 typedef struct SPS{
  61
  62     int profile_idc;
  63     int level_idc;
  64     int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
  65     int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
  66     int poc_type;                      ///< pic_order_cnt_type
  67     int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4
  68     int delta_pic_order_always_zero_flag;
  69     int offset_for_non_ref_pic;
  70     int offset_for_top_to_bottom_field;
  71     int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
  72     int ref_frame_count;               ///< num_ref_frames
  73     int gaps_in_frame_num_allowed_flag;
  74     int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
  75     int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
  76     int frame_mbs_only_flag;
  77     int mb_aff;                        ///<mb_adaptive_frame_field_flag
  78     int direct_8x8_inference_flag;
  79     int crop;                   ///< frame_cropping_flag
  80     int crop_left;              ///< frame_cropping_rect_left_offset
  81     int crop_right;             ///< frame_cropping_rect_right_offset
  82     int crop_top;               ///< frame_cropping_rect_top_offset
  83     int crop_bottom;            ///< frame_cropping_rect_bottom_offset
  84     int vui_parameters_present_flag;
  85     AVRational sar;
  86     int timing_info_present_flag;
  87     uint32_t num_units_in_tick;
  88     uint32_t time_scale;
  89     int fixed_frame_rate_flag;
  90     short offset_for_ref_frame[256]; //FIXME dyn aloc?
  91     int bitstream_restriction_flag;
  92     int num_reorder_frames;
  93 }SPS;
  94
  95 /**
  96  * Picture parameter set
  97  */
  98 typedef struct PPS{
  99     int sps_id;
 100     int cabac;                  ///< entropy_coding_mode_flag
 101     int pic_order_present;      ///< pic_order_present_flag
 102     int slice_group_count;      ///< num_slice_groups_minus1 + 1
 103     int mb_slice_group_map_type;
 104     int ref_count[2];           ///< num_ref_idx_l0/1_active_minus1 + 1
 105     int weighted_pred;          ///< weighted_pred_flag
 106     int weighted_bipred_idc;
 107     int init_qp;                ///< pic_init_qp_minus26 + 26
 108     int init_qs;                ///< pic_init_qs_minus26 + 26
 109     int chroma_qp_index_offset;
 110     int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
 111     int constrained_intra_pred; ///< constrained_intra_pred_flag
 112     int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
 113     int transform_8x8_mode;     ///< transform_8x8_mode_flag
 114 }PPS;
 115
 116 /**
 117  * Memory management control operation opcode.
 118  */
 119 typedef enum MMCOOpcode{
 120     MMCO_END=0,
 121     MMCO_SHORT2UNUSED,
 122     MMCO_LONG2UNUSED,
 123     MMCO_SHORT2LONG,
 124     MMCO_SET_MAX_LONG,
 125     MMCO_RESET,
 126     MMCO_LONG,
 127 } MMCOOpcode;
 128
 129 /**
 130  * Memory management control operation.
 131  */
 132 typedef struct MMCO{
 133     MMCOOpcode opcode;
 134     int short_frame_num;
 135     int long_index;
 136 } MMCO;
 137
 138 /**
 139  * H264Context
      *
      * Codec state used throughout this file. The generic MpegEncContext is
      * embedded as the first member; the remaining members are H.264 specific.
      * intra4x4_pred_mode_cache, non_zero_count_cache, mv_cache and ref_cache,
      * together with the top/left neighbour members, are per-macroblock scratch
      * state written by fill_caches().
 140  */
 141 typedef struct H264Context{
 142     MpegEncContext s;
 143     int nal_ref_idc;
 144     int nal_unit_type;
         // NOTE(review): presumably the values nal_unit_type can take - confirm against the NAL parser
 145 #define NAL_SLICE               1
 146 #define NAL_DPA                 2
 147 #define NAL_DPB                 3
 148 #define NAL_DPC                 4
 149 #define NAL_IDR_SLICE           5
 150 #define NAL_SEI                 6
 151 #define NAL_SPS                 7
 152 #define NAL_PPS                 8
 153 #define NAL_PICTURE_DELIMITER   9
 154 #define NAL_FILTER_DATA         10
 155     uint8_t *rbsp_buffer;
 156     int rbsp_buffer_size;
 157
 158     /**
 159       * Used to parse AVC variant of h264
 160       */
 161     int is_avc; ///< this flag is != 0 if codec is avc1
 162     int got_avcC; ///< flag used to parse avcC data only once
 163     int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
 164
 165     int chroma_qp; //QPc
 166
 167     int prev_mb_skipped; //FIXME remove (IMHO not used)
 168
 169     //prediction stuff
 170     int chroma_pred_mode;
 171     int intra16x16_pred_mode;
 172
 173     int top_mb_xy;     ///< index of the macroblock used as top neighbour, set by fill_caches()
 174     int left_mb_xy[2]; ///< indices of the macroblocks used as left neighbours, set by fill_caches()
 175
 176     int8_t intra4x4_pred_mode_cache[5*8];
 177     int8_t (*intra4x4_pred_mode)[8];
 178     void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
 179     void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
 180     void (*pred8x8  [4+3])(uint8_t *src, int stride);
 181     void (*pred16x16[4+3])(uint8_t *src, int stride);
         /*
          * Bit masks written by fill_caches() for intra macroblocks: it starts
          * from an "everything available" pattern and clears bits for
          * neighbours that may not be used.
          * NOTE(review): presumably one bit per 4x4 block - confirm against the users.
          */
 182     unsigned int topleft_samples_available;
 183     unsigned int top_samples_available;
 184     unsigned int topright_samples_available;
 185     unsigned int left_samples_available;
 186     uint8_t (*top_borders[2])[16+2*8];
 187     uint8_t left_border[2*(17+2*9)];
 188
 189     /**
 190      * non zero coeff count cache.
 191      * is 64 if not available.
          * (fill_caches() stores 0 instead of 64 for an unavailable neighbour
          * when pps.cabac is set and the current macroblock is not intra)
 192      */
 193     uint8_t non_zero_count_cache[6*8] __align8;
 194     uint8_t (*non_zero_count)[16];
 195
 196     /**
 197      * Motion vector cache.
 198      */
 199     int16_t mv_cache[2][5*8][2] __align8;
 200     int8_t ref_cache[2][5*8] __align8;
         // special ref_cache values; fill_caches() stores them for neighbours that are not inter coded / not available
 201 #define LIST_NOT_USED -1 //FIXME rename?
 202 #define PART_NOT_AVAILABLE -2
 203
 204     /**
 205      * is 1 if the specific list MV&references are set to 0,0,-2.
 206      */
 207     int mv_cache_clean[2];
 208
 209     /**
 210      * number of neighbors (top and/or left) that used 8x8 dct
 211      */
 212     int neighbor_transform_size;
 213
 214     /**
 215      * block_offset[ 0..23] for frame macroblocks
 216      * block_offset[24..47] for field macroblocks
 217      */
 218     int block_offset[2*(16+8)];
 219
 220     uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
 221     uint32_t *mb2b8_xy;
 222     int b_stride; //FIXME use s->b4_stride
 223     int b8_stride;
 224
 225     int halfpel_flag;
 226     int thirdpel_flag;
 227
 228     int unknown_svq3_flag;
 229     int next_slice_index;
 230
 231     SPS sps_buffer[MAX_SPS_COUNT];
 232     SPS sps; ///< current sps
 233
 234     PPS pps_buffer[MAX_PPS_COUNT];
 235     /**
 236      * current pps
 237      */
 238     PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
 239
 240     uint16_t (*dequant4_coeff)[16]; // FIXME quant matrices should be per SPS or PPS
 241     uint16_t (*dequant8_coeff)[64];
 242
 243     int slice_num;
 244     uint8_t *slice_table_base;
 245     uint8_t *slice_table;      ///< slice_table_base + mb_stride + 1
 246     int slice_type;
 247     int slice_type_fixed;
 248
 249     //interlacing specific flags
 250     int mb_aff_frame;
 251     int mb_field_decoding_flag;
 252
 253     int sub_mb_type[4];
 254
 255     //POC stuff
 256     int poc_lsb;
 257     int poc_msb;
 258     int delta_poc_bottom;
 259     int delta_poc[2];
 260     int frame_num;
 261     int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
 262     int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
 263     int frame_num_offset;         ///< for POC type 2
 264     int prev_frame_num_offset;    ///< for POC type 2
 265     int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2
 266
 267     /**
 268      * frame_num for frames or 2*frame_num for field pics.
 269      */
 270     int curr_pic_num;
 271
 272     /**
 273      * max_frame_num or 2*max_frame_num for field pics.
 274      */
 275     int max_pic_num;
 276
 277     //Weighted pred stuff
 278     int use_weight;
 279     int use_weight_chroma;
 280     int luma_log2_weight_denom;
 281     int chroma_log2_weight_denom;
 282     int luma_weight[2][16];
 283     int luma_offset[2][16];
 284     int chroma_weight[2][16][2];
 285     int chroma_offset[2][16][2];
 286     int implicit_weight[16][16];
 287
 288     //deblock
 289     int deblocking_filter;         ///< disable_deblocking_filter_idc with 1<->0
 290     int slice_alpha_c0_offset;
 291     int slice_beta_offset;
 292
 293     int redundant_pic_count;
 294
 295     int direct_spatial_mv_pred;
 296     int dist_scale_factor[16];
 297     int map_col_to_list0[2][16];
 298
 299     /**
 300      * num_ref_idx_l0/1_active_minus1 + 1
 301      */
 302     int ref_count[2];// FIXME split for AFF
 303     Picture *short_ref[32];
 304     Picture *long_ref[32];
 305     Picture default_ref_list[2][32];
 306     Picture ref_list[2][32]; //FIXME size?
 307     Picture field_ref_list[2][32]; //FIXME size?
 308     Picture *delayed_pic[16]; //FIXME size?
 309     Picture *delayed_output_pic;
 310
 311     /**
 312      * memory management control operations buffer.
 313      */
 314     MMCO mmco[MAX_MMCO_COUNT];
 315     int mmco_index;
 316
 317     int long_ref_count;  ///< number of actual long term references
 318     int short_ref_count; ///< number of actual short term references
 319
 320     //data partitioning
 321     GetBitContext intra_gb;
 322     GetBitContext inter_gb;
 323     GetBitContext *intra_gb_ptr;
 324     GetBitContext *inter_gb_ptr;
 325
 326     DCTELEM mb[16*24] __align8;
 327
 328     /**
 329      * Cabac
 330      */
 331     CABACContext cabac;
 332     uint8_t      cabac_state[460];
 333     int          cabac_init_idc;
 334
 335     /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
 336     uint16_t     *cbp_table;
 337     int top_cbp;  ///< derived from cbp_table by fill_caches(), only when pps.cabac is set
 338     int left_cbp; ///< derived from cbp_table by fill_caches(), only when pps.cabac is set
 339     /* chroma_pred_mode for i4x4 or i16x16, else 0 */
 340     uint8_t     *chroma_pred_mode_table;
 341     int         last_qscale_diff;
 342     int16_t     (*mvd_table[2])[2];
 343     int16_t     mvd_cache[2][5*8][2] __align8;
 344     uint8_t     *direct_table;
 345     uint8_t     direct_cache[5*8];
 346
 347     uint8_t zigzag_scan[16];
 348     uint8_t field_scan[16];
 349     const uint8_t *zigzag_scan_q0;
 350     const uint8_t *field_scan_q0;
 351 }H264Context;
 352
 /*
  * File-local VLC tables. They are only declared here; the code that builds
  * and reads them is outside this chunk.
  * NOTE(review): judging by the names these serve the coeff_token, total_zeros
  * and run_before syntax elements - confirm against the table setup code.
  */
 353 static VLC coeff_token_vlc[4];
 354 static VLC chroma_dc_coeff_token_vlc;
 355
 356 static VLC total_zeros_vlc[15];
 357 static VLC chroma_dc_total_zeros_vlc[3];
 358
 359 static VLC run_vlc[6];
 360 static VLC run7_vlc;
 361
     /*
      * Forward declarations of static functions; their definitions are not
      * visible in this chunk and must match these prototypes.
      */
 362 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
 363 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
 364 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
 365
 366 static inline uint32_t pack16to32(int a, int b){
 367 #ifdef WORDS_BIGENDIAN
 368    return (b&0xFFFF) + (a<<16);
 369 #else
 370    return (a&0xFFFF) + (b<<16);
 371 #endif
 372 }
 373
 374 /**
 375  * fill a rectangle.
 376  * @param h height of the rectangle, should be a constant
 377  * @param w width of the rectangle, should be a constant
 378  * @param size the size of val (1 or 4), should be a constant
 379  */
 380 static inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ //FIXME ensure this IS inlined
 381     uint8_t *p= (uint8_t*)vp;
 382     assert(size==1 || size==4);
 383
 384     w      *= size;
 385     stride *= size;
 386
 387     assert((((int)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
 388     assert((stride&(w-1))==0);
 389 //FIXME check what gcc generates for 64 bit on x86 and possibly write a 32 bit ver of it
 390     if(w==2 && h==2){
 391         *(uint16_t*)(p + 0)=
 392         *(uint16_t*)(p + stride)= size==4 ? val : val*0x0101;
 393     }else if(w==2 && h==4){
 394         *(uint16_t*)(p + 0*stride)=
 395         *(uint16_t*)(p + 1*stride)=
 396         *(uint16_t*)(p + 2*stride)=
 397         *(uint16_t*)(p + 3*stride)= size==4 ? val : val*0x0101;
 398     }else if(w==4 && h==1){
 399         *(uint32_t*)(p + 0*stride)= size==4 ? val : val*0x01010101;
 400     }else if(w==4 && h==2){
 401         *(uint32_t*)(p + 0*stride)=
 402         *(uint32_t*)(p + 1*stride)= size==4 ? val : val*0x01010101;
 403     }else if(w==4 && h==4){
 404         *(uint32_t*)(p + 0*stride)=
 405         *(uint32_t*)(p + 1*stride)=
 406         *(uint32_t*)(p + 2*stride)=
 407         *(uint32_t*)(p + 3*stride)= size==4 ? val : val*0x01010101;
 408     }else if(w==8 && h==1){
 409         *(uint32_t*)(p + 0)=
 410         *(uint32_t*)(p + 4)= size==4 ? val : val*0x01010101;
 411     }else if(w==8 && h==2){
 412         *(uint32_t*)(p + 0 + 0*stride)=
 413         *(uint32_t*)(p + 4 + 0*stride)=
 414         *(uint32_t*)(p + 0 + 1*stride)=
 415         *(uint32_t*)(p + 4 + 1*stride)=  size==4 ? val : val*0x01010101;
 416     }else if(w==8 && h==4){
 417         *(uint64_t*)(p + 0*stride)=
 418         *(uint64_t*)(p + 1*stride)=
 419         *(uint64_t*)(p + 2*stride)=
 420         *(uint64_t*)(p + 3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
 421     }else if(w==16 && h==2){
 422         *(uint64_t*)(p + 0+0*stride)=
 423         *(uint64_t*)(p + 8+0*stride)=
 424         *(uint64_t*)(p + 0+1*stride)=
 425         *(uint64_t*)(p + 8+1*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
 426     }else if(w==16 && h==4){
 427         *(uint64_t*)(p + 0+0*stride)=
 428         *(uint64_t*)(p + 8+0*stride)=
 429         *(uint64_t*)(p + 0+1*stride)=
 430         *(uint64_t*)(p + 8+1*stride)=
 431         *(uint64_t*)(p + 0+2*stride)=
 432         *(uint64_t*)(p + 8+2*stride)=
 433         *(uint64_t*)(p + 0+3*stride)=
 434         *(uint64_t*)(p + 8+3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
 435     }else
 436         assert(0);
 437 }
 438
 439 static inline void fill_caches(H264Context *h, int mb_type, int for_deblock){
 440     MpegEncContext * const s = &h->s;
 441     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 442     int topleft_xy, top_xy, topright_xy, left_xy[2];
 443     int topleft_type, top_type, topright_type, left_type[2];
 444     int left_block[8];
 445     int i;
 446
 447     //FIXME deblocking can skip fill_caches much of the time with multiple slices too.
 448     // the actual condition is whether we're on the edge of a slice,
 449     // and even then the intra and nnz parts are unnecessary.
 450     if(for_deblock && h->slice_num == 1)
 451         return;
 452
 453     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 454
 455     top_xy     = mb_xy  - s->mb_stride;
 456     topleft_xy = top_xy - 1;
 457     topright_xy= top_xy + 1;
 458     left_xy[1] = left_xy[0] = mb_xy-1;
 459     left_block[0]= 0;
 460     left_block[1]= 1;
 461     left_block[2]= 2;
 462     left_block[3]= 3;
 463     left_block[4]= 7;
 464     left_block[5]= 10;
 465     left_block[6]= 8;
 466     left_block[7]= 11;
 467     if(h->mb_aff_frame){
 468         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 469         const int top_pair_xy      = pair_xy     - s->mb_stride;
 470         const int topleft_pair_xy  = top_pair_xy - 1;
 471         const int topright_pair_xy = top_pair_xy + 1;
 472         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 473         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 474         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 475         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 476         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 477         const int bottom = (s->mb_y & 1);
 478         tprintf("fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 479         if (bottom
 480                 ? !curr_mb_frame_flag // bottom macroblock
 481                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 482                 ) {
 483             top_xy -= s->mb_stride;
 484         }
 485         if (bottom
 486                 ? !curr_mb_frame_flag // bottom macroblock
 487                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 488                 ) {
 489             topleft_xy -= s->mb_stride;
 490         }
 491         if (bottom
 492                 ? !curr_mb_frame_flag // bottom macroblock
 493                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 494                 ) {
 495             topright_xy -= s->mb_stride;
 496         }
 497         if (left_mb_frame_flag != curr_mb_frame_flag) {
 498             left_xy[1] = left_xy[0] = pair_xy - 1;
 499             if (curr_mb_frame_flag) {
 500                 if (bottom) {
 501                     left_block[0]= 2;
 502                     left_block[1]= 2;
 503                     left_block[2]= 3;
 504                     left_block[3]= 3;
 505                     left_block[4]= 8;
 506                     left_block[5]= 11;
 507                     left_block[6]= 8;
 508                     left_block[7]= 11;
 509                 } else {
 510                     left_block[0]= 0;
 511                     left_block[1]= 0;
 512                     left_block[2]= 1;
 513                     left_block[3]= 1;
 514                     left_block[4]= 7;
 515                     left_block[5]= 10;
 516                     left_block[6]= 7;
 517                     left_block[7]= 10;
 518                 }
 519             } else {
 520                 left_xy[1] += s->mb_stride;
 521                 //left_block[0]= 0;
 522                 left_block[1]= 2;
 523                 left_block[2]= 0;
 524                 left_block[3]= 2;
 525                 //left_block[4]= 7;
 526                 left_block[5]= 10;
 527                 left_block[6]= 7;
 528                 left_block[7]= 10;
 529             }
 530         }
 531     }
 532
 533     h->top_mb_xy = top_xy;
 534     h->left_mb_xy[0] = left_xy[0];
 535     h->left_mb_xy[1] = left_xy[1];
 536     if(for_deblock){
 537         topleft_type = h->slice_table[topleft_xy ] < 255 ? s->current_picture.mb_type[topleft_xy] : 0;
 538         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 539         topright_type= h->slice_table[topright_xy] < 255 ? s->current_picture.mb_type[topright_xy]: 0;
 540         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 541         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 542     }else{
 543         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 544         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 545         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 546         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 547         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 548     }
 549
 550     if(IS_INTRA(mb_type)){
 551         h->topleft_samples_available=
 552         h->top_samples_available=
 553         h->left_samples_available= 0xFFFF;
 554         h->topright_samples_available= 0xEEEA;
 555
 556         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 557             h->topleft_samples_available= 0xB3FF;
 558             h->top_samples_available= 0x33FF;
 559             h->topright_samples_available= 0x26EA;
 560         }
 561         for(i=0; i<2; i++){
 562             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 563                 h->topleft_samples_available&= 0xDF5F;
 564                 h->left_samples_available&= 0x5F5F;
 565             }
 566         }
 567
 568         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 569             h->topleft_samples_available&= 0x7FFF;
 570
 571         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 572             h->topright_samples_available&= 0xFBFF;
 573
 574         if(IS_INTRA4x4(mb_type)){
 575             if(IS_INTRA4x4(top_type)){
 576                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 577                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 578                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 579                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 580             }else{
 581                 int pred;
 582                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 583                     pred= -1;
 584                 else{
 585                     pred= 2;
 586                 }
 587                 h->intra4x4_pred_mode_cache[4+8*0]=
 588                 h->intra4x4_pred_mode_cache[5+8*0]=
 589                 h->intra4x4_pred_mode_cache[6+8*0]=
 590                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 591             }
 592             for(i=0; i<2; i++){
 593                 if(IS_INTRA4x4(left_type[i])){
 594                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 595                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 596                 }else{
 597                     int pred;
 598                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 599                         pred= -1;
 600                     else{
 601                         pred= 2;
 602                     }
 603                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 604                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 605                 }
 606             }
 607         }
 608     }
 609
 610
 611 /*
 612 0 . T T. T T T T
 613 1 L . .L . . . .
 614 2 L . .L . . . .
 615 3 . T TL . . . .
 616 4 L . .L . . . .
 617 5 L . .. . . . .
 618 */
 619 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
 620     if(top_type){
 621         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 622         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 623         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 624         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 625
 626         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 627         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 628
 629         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 630         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 631
 632     }else{
 633         h->non_zero_count_cache[4+8*0]=
 634         h->non_zero_count_cache[5+8*0]=
 635         h->non_zero_count_cache[6+8*0]=
 636         h->non_zero_count_cache[7+8*0]=
 637
 638         h->non_zero_count_cache[1+8*0]=
 639         h->non_zero_count_cache[2+8*0]=
 640
 641         h->non_zero_count_cache[1+8*3]=
 642         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 643
 644     }
 645
 646     for (i=0; i<2; i++) {
 647         if(left_type[i]){
 648             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 649             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 650             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 651             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 652         }else{
 653             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 654             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 655             h->non_zero_count_cache[0+8*1 +   8*i]=
 656             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 657         }
 658     }
 659
 660     if( h->pps.cabac ) {
 661         // top_cbp
 662         if(top_type) {
 663             h->top_cbp = h->cbp_table[top_xy];
 664         } else if(IS_INTRA(mb_type)) {
 665             h->top_cbp = 0x1C0;
 666         } else {
 667             h->top_cbp = 0;
 668         }
 669         // left_cbp
 670         if (left_type[0]) {
 671             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 672         } else if(IS_INTRA(mb_type)) {
 673             h->left_cbp = 0x1C0;
 674         } else {
 675             h->left_cbp = 0;
 676         }
 677         if (left_type[0]) {
 678             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 679         }
 680         if (left_type[1]) {
 681             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 682         }
 683     }
 684
 685 #if 1
 686     //FIXME direct mb can skip much of this
 687     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 688         int list;
 689         for(list=0; list<1+(h->slice_type==B_TYPE); list++){
 690             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 691                 /*if(!h->mv_cache_clean[list]){
 692                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 693                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 694                     h->mv_cache_clean[list]= 1;
 695                 }*/
 696                 continue;
 697             }
 698             h->mv_cache_clean[list]= 0;
 699
 700             if(IS_INTER(top_type)){
 701                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 702                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 703                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 704                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 705                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 706                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 707                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 708                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 709                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 710                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 711             }else{
 712                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 713                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 714                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 715                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 716                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 717             }
 718
 719             //FIXME unify cleanup or sth
 720             if(IS_INTER(left_type[0])){
 721                 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 722                 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
 723                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
 724                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]];
 725                 h->ref_cache[list][scan8[0] - 1 + 0*8]=
 726                 h->ref_cache[list][scan8[0] - 1 + 1*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0]>>1)];
 727             }else{
 728                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 0*8]=
 729                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0;
 730                 h->ref_cache[list][scan8[0] - 1 + 0*8]=
 731                 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 732             }
 733
 734             if(IS_INTER(left_type[1])){
 735                 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 736                 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
 737                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
 738                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]];
 739                 h->ref_cache[list][scan8[0] - 1 + 2*8]=
 740                 h->ref_cache[list][scan8[0] - 1 + 3*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[2]>>1)];
 741             }else{
 742                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 2*8]=
 743                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 3*8]= 0;
 744                 h->ref_cache[list][scan8[0] - 1 + 2*8]=
 745                 h->ref_cache[list][scan8[0] - 1 + 3*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 746                 assert((!left_type[0]) == (!left_type[1]));
 747             }
 748
 749             if(for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred))
 750                 continue;
 751
 752             if(IS_INTER(topleft_type)){
 753                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 754                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 755                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 756                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 757             }else{
 758                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 759                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 760             }
 761
 762             if(IS_INTER(topright_type)){
 763                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 764                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 765                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 766                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 767             }else{
 768                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 769                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 770             }
 771
 772
 773             h->ref_cache[list][scan8[5 ]+1] =
 774             h->ref_cache[list][scan8[7 ]+1] =
 775             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 776             h->ref_cache[list][scan8[4 ]] =
 777             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 778             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 779             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 780             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 781             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 782             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 783
 784             if( h->pps.cabac ) {
 785                 /* XXX beurk, Load mvd */
 786                 if(IS_INTER(topleft_type)){
 787                     const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 788                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy];
 789                 }else{
 790                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= 0;
 791                 }
 792
 793                 if(IS_INTER(top_type)){
 794                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 795                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 796                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 797                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 798                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 799                 }else{
 800                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 801                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 802                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 803                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 804                 }
 805                 if(IS_INTER(left_type[0])){
 806                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 807                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 808                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 809                 }else{
 810                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 811                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 812                 }
 813                 if(IS_INTER(left_type[1])){
 814                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 815                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 816                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 817                 }else{
 818                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 819                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 820                 }
 821                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 822                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 823                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 824                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 825                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 826
 827                 if(h->slice_type == B_TYPE){
 828                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 829
 830                     if(IS_DIRECT(top_type)){
 831                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 832                     }else if(IS_8X8(top_type)){
 833                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 834                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 835                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 836                     }else{
 837                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 838                     }
 839
 840                     //FIXME interlacing
 841                     if(IS_DIRECT(left_type[0])){
 842                         h->direct_cache[scan8[0] - 1 + 0*8]=
 843                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 844                     }else if(IS_8X8(left_type[0])){
 845                         int b8_xy = h->mb2b8_xy[left_xy[0]] + 1;
 846                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[b8_xy];
 847                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[b8_xy + h->b8_stride];
 848                     }else{
 849                         h->direct_cache[scan8[0] - 1 + 0*8]=
 850                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 851                     }
 852                 }
 853             }
 854         }
 855     }
 856 #endif
 857
 858     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 859 }
 860
 861 static inline void write_back_intra_pred_mode(H264Context *h){
 862     MpegEncContext * const s = &h->s;
 863     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 864
 865     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 866     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 867     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 868     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 869     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 870     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 871     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 872 }
 873
 874 /**
 875  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 876  */
 877 static inline int check_intra4x4_pred_mode(H264Context *h){
 878     MpegEncContext * const s = &h->s;
 879     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 880     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 881     int i;
 882
 883     if(!(h->top_samples_available&0x8000)){
 884         for(i=0; i<4; i++){
 885             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 886             if(status<0){
 887                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 888                 return -1;
 889             } else if(status){
 890                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 891             }
 892         }
 893     }
 894
 895     if(!(h->left_samples_available&0x8000)){
 896         for(i=0; i<4; i++){
 897             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 898             if(status<0){
 899                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 900                 return -1;
 901             } else if(status){
 902                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 903             }
 904         }
 905     }
 906
 907     return 0;
 908 } //FIXME cleanup like next
 909
 910 /**
 911  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 912  */
 913 static inline int check_intra_pred_mode(H264Context *h, int mode){
 914     MpegEncContext * const s = &h->s;
 915     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 916     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 917
 918     if(mode < 0 || mode > 6) {
 919         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 920         return -1;
 921     }
 922
 923     if(!(h->top_samples_available&0x8000)){
 924         mode= top[ mode ];
 925         if(mode<0){
 926             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 927             return -1;
 928         }
 929     }
 930
 931     if(!(h->left_samples_available&0x8000)){
 932         mode= left[ mode ];
 933         if(mode<0){
 934             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 935             return -1;
 936         }
 937     }
 938
 939     return mode;
 940 }
 941
 942 /**
 943  * gets the predicted intra4x4 prediction mode.
 944  */
 945 static inline int pred_intra_mode(H264Context *h, int n){
 946     const int index8= scan8[n];
 947     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 948     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 949     const int min= FFMIN(left, top);
 950
 951     tprintf("mode:%d %d min:%d\n", left ,top, min);
 952
 953     if(min<0) return DC_PRED;
 954     else      return min;
 955 }
 956
 957 static inline void write_back_non_zero_count(H264Context *h){
 958     MpegEncContext * const s = &h->s;
 959     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 960
 961     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 962     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 963     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 964     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 965     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 966     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 967     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 968
 969     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 970     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 971     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 972
 973     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 974     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 975     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 976 }
 977
 978 /**
 979  * gets the predicted number of non zero coefficients.
 980  * @param n block index
 981  */
 982 static inline int pred_non_zero_count(H264Context *h, int n){
 983     const int index8= scan8[n];
 984     const int left= h->non_zero_count_cache[index8 - 1];
 985     const int top = h->non_zero_count_cache[index8 - 8];
 986     int i= left + top;
 987
 988     if(i<64) i= (i+1)>>1;
 989
 990     tprintf("pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 991
 992     return i&31;
 993 }
 994
 995 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 996     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 997
 998     if(topright_ref != PART_NOT_AVAILABLE){
 999         *C= h->mv_cache[list][ i - 8 + part_width ];
1000         return topright_ref;
1001     }else{
1002         tprintf("topright MV not available\n");
1003
1004         *C= h->mv_cache[list][ i - 8 - 1 ];
1005         return h->ref_cache[list][ i - 8 - 1 ];
1006     }
1007 }
1008
1009 /**
1010  * gets the predicted MV.
1011  * @param n the block index
1012  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
1013  * @param mx the x component of the predicted motion vector
1014  * @param my the y component of the predicted motion vector
1015  */
1016 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
1017     const int index8= scan8[n];
1018     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
1019     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
1020     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
1021     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
1022     const int16_t * C;
1023     int diagonal_ref, match_count;
1024
1025     assert(part_width==1 || part_width==2 || part_width==4);
1026
1027 /* mv_cache
1028   B . . A T T T T
1029   U . . L . . , .
1030   U . . L . . . .
1031   U . . L . . , .
1032   . . . L . . . .
1033 */
1034
1035     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
1036     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
1037     tprintf("pred_motion match_count=%d\n", match_count);
1038     if(match_count > 1){ //most common
1039         *mx= mid_pred(A[0], B[0], C[0]);
1040         *my= mid_pred(A[1], B[1], C[1]);
1041     }else if(match_count==1){
1042         if(left_ref==ref){
1043             *mx= A[0];
1044             *my= A[1];
1045         }else if(top_ref==ref){
1046             *mx= B[0];
1047             *my= B[1];
1048         }else{
1049             *mx= C[0];
1050             *my= C[1];
1051         }
1052     }else{
1053         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
1054             *mx= A[0];
1055             *my= A[1];
1056         }else{
1057             *mx= mid_pred(A[0], B[0], C[0]);
1058             *my= mid_pred(A[1], B[1], C[1]);
1059         }
1060     }
1061
1062     tprintf("pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
1063 }
1064
1065 /**
1066  * gets the directionally predicted 16x8 MV.
1067  * @param n the block index
1068  * @param mx the x component of the predicted motion vector
1069  * @param my the y component of the predicted motion vector
1070  */
1071 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1072     if(n==0){
1073         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
1074         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
1075
1076         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
1077
1078         if(top_ref == ref){
1079             *mx= B[0];
1080             *my= B[1];
1081             return;
1082         }
1083     }else{
1084         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
1085         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
1086
1087         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1088
1089         if(left_ref == ref){
1090             *mx= A[0];
1091             *my= A[1];
1092             return;
1093         }
1094     }
1095
1096     //RARE
1097     pred_motion(h, n, 4, list, ref, mx, my);
1098 }
1099
1100 /**
1101  * gets the directionally predicted 8x16 MV.
1102  * @param n the block index
1103  * @param mx the x component of the predicted motion vector
1104  * @param my the y component of the predicted motion vector
1105  */
1106 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1107     if(n==0){
1108         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
1109         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
1110
1111         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1112
1113         if(left_ref == ref){
1114             *mx= A[0];
1115             *my= A[1];
1116             return;
1117         }
1118     }else{
1119         const int16_t * C;
1120         int diagonal_ref;
1121
1122         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
1123
1124         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
1125
1126         if(diagonal_ref == ref){
1127             *mx= C[0];
1128             *my= C[1];
1129             return;
1130         }
1131     }
1132
1133     //RARE
1134     pred_motion(h, n, 2, list, ref, mx, my);
1135 }
1136
1137 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
1138     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
1139     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
1140
1141     tprintf("pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
1142
1143     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
1144        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
1145        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
1146
1147         *mx = *my = 0;
1148         return;
1149     }
1150
1151     pred_motion(h, 0, 4, 0, 0, mx, my);
1152
1153     return;
1154 }
1155
1156 static inline void direct_dist_scale_factor(H264Context * const h){
1157     const int poc = h->s.current_picture_ptr->poc;
1158     const int poc1 = h->ref_list[1][0].poc;
1159     int i;
1160     for(i=0; i<h->ref_count[0]; i++){
1161         int poc0 = h->ref_list[0][i].poc;
1162         int td = clip(poc1 - poc0, -128, 127);
1163         if(td == 0 /* FIXME || pic0 is a long-term ref */){
1164             h->dist_scale_factor[i] = 256;
1165         }else{
1166             int tb = clip(poc - poc0, -128, 127);
1167             int tx = (16384 + (ABS(td) >> 1)) / td;
1168             h->dist_scale_factor[i] = clip((tb*tx + 32) >> 6, -1024, 1023);
1169         }
1170     }
1171 }
1172 static inline void direct_ref_list_init(H264Context * const h){
1173     MpegEncContext * const s = &h->s;
1174     Picture * const ref1 = &h->ref_list[1][0];
1175     Picture * const cur = s->current_picture_ptr;
1176     int list, i, j;
1177     if(cur->pict_type == I_TYPE)
1178         cur->ref_count[0] = 0;
1179     if(cur->pict_type != B_TYPE)
1180         cur->ref_count[1] = 0;
1181     for(list=0; list<2; list++){
1182         cur->ref_count[list] = h->ref_count[list];
1183         for(j=0; j<h->ref_count[list]; j++)
1184             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1185     }
1186     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1187         return;
1188     for(list=0; list<2; list++){
1189         for(i=0; i<ref1->ref_count[list]; i++){
1190             const int poc = ref1->ref_poc[list][i];
1191             h->map_col_to_list0[list][i] = PART_NOT_AVAILABLE;
1192             for(j=0; j<h->ref_count[list]; j++)
1193                 if(h->ref_list[list][j].poc == poc){
1194                     h->map_col_to_list0[list][i] = j;
1195                     break;
1196                 }
1197         }
1198     }
1199 }
1200
1201 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1202     MpegEncContext * const s = &h->s;
1203     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1204     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1205     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1206     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1207     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1208     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1209     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1210     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1211     const int is_b8x8 = IS_8X8(*mb_type);
1212     int sub_mb_type;
1213     int i8, i4;
1214
1215     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1216         /* FIXME save sub mb types from previous frames (or derive from MVs)
1217          * so we know exactly what block size to use */
1218         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1219         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1220     }else if(!is_b8x8 && (IS_16X16(mb_type_col) || IS_INTRA(mb_type_col))){
1221         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1222         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1223     }else{
1224         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1225         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1226     }
1227     if(!is_b8x8)
1228         *mb_type |= MB_TYPE_DIRECT2;
1229
1230     tprintf("mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1231
1232     if(h->direct_spatial_mv_pred){
1233         int ref[2];
1234         int mv[2][2];
1235         int list;
1236
1237         /* ref = min(neighbors) */
1238         for(list=0; list<2; list++){
1239             int refa = h->ref_cache[list][scan8[0] - 1];
1240             int refb = h->ref_cache[list][scan8[0] - 8];
1241             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1242             if(refc == -2)
1243                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1244             ref[list] = refa;
1245             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1246                 ref[list] = refb;
1247             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1248                 ref[list] = refc;
1249             if(ref[list] < 0)
1250                 ref[list] = -1;
1251         }
1252
1253         if(ref[0] < 0 && ref[1] < 0){
1254             ref[0] = ref[1] = 0;
1255             mv[0][0] = mv[0][1] =
1256             mv[1][0] = mv[1][1] = 0;
1257         }else{
1258             for(list=0; list<2; list++){
1259                 if(ref[list] >= 0)
1260                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1261                 else
1262                     mv[list][0] = mv[list][1] = 0;
1263             }
1264         }
1265
1266         if(ref[1] < 0){
1267             *mb_type &= ~MB_TYPE_P0L1;
1268             sub_mb_type &= ~MB_TYPE_P0L1;
1269         }else if(ref[0] < 0){
1270             *mb_type &= ~MB_TYPE_P0L0;
1271             sub_mb_type &= ~MB_TYPE_P0L0;
1272         }
1273
1274         if(IS_16X16(*mb_type)){
1275             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref[0], 1);
1276             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, ref[1], 1);
1277             if(!IS_INTRA(mb_type_col)
1278                && (   l1ref0[0] == 0 && ABS(l1mv0[0][0]) <= 1 && ABS(l1mv0[0][1]) <= 1
1279                    || l1ref0[0]  < 0 && l1ref1[0] == 0 && ABS(l1mv1[0][0]) <= 1 && ABS(l1mv1[0][1]) <= 1)){
1280                 if(ref[0] > 0)
1281                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1282                 else
1283                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1284                 if(ref[1] > 0)
1285                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1286                 else
1287                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1288             }else{
1289                 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1290                 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1291             }
1292         }else{
1293             for(i8=0; i8<4; i8++){
1294                 const int x8 = i8&1;
1295                 const int y8 = i8>>1;
1296
1297                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1298                     continue;
1299                 h->sub_mb_type[i8] = sub_mb_type;
1300
1301                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1302                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1303                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref[0], 1);
1304                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, ref[1], 1);
1305
1306                 /* col_zero_flag */
1307                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1308                                               || l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0)){
1309                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1310                     for(i4=0; i4<4; i4++){
1311                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1312                         if(ABS(mv_col[0]) <= 1 && ABS(mv_col[1]) <= 1){
1313                             if(ref[0] == 0)
1314                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1315                             if(ref[1] == 0)
1316                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1317                         }
1318                     }
1319                 }
1320             }
1321         }
1322     }else{ /* direct temporal mv pred */
1323         if(IS_16X16(*mb_type)){
1324             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1325             if(IS_INTRA(mb_type_col)){
1326                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
1327                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1328                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1329             }else{
1330                 const int ref0 = l1ref0[0] >= 0 ? h->map_col_to_list0[0][l1ref0[0]]
1331                                                 : h->map_col_to_list0[1][l1ref1[0]];
1332                 const int dist_scale_factor = h->dist_scale_factor[ref0];
1333                 const int16_t *mv_col = l1mv0[0];
1334                 int mv_l0[2];
1335                 mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8;
1336                 mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
1337                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref0, 1);
1338                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0],mv_l0[1]), 4);
1339                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]), 4);
1340             }
1341         }else{
1342             for(i8=0; i8<4; i8++){
1343                 const int x8 = i8&1;
1344                 const int y8 = i8>>1;
1345                 int ref0, dist_scale_factor;
1346
1347                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1348                     continue;
1349                 h->sub_mb_type[i8] = sub_mb_type;
1350                 if(IS_INTRA(mb_type_col)){
1351                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1352                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1353                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1354                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1355                     continue;
1356                 }
1357
1358                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1359                 if(ref0 >= 0)
1360                     ref0 = h->map_col_to_list0[0][ref0];
1361                 else
1362                     ref0 = h->map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1363                 dist_scale_factor = h->dist_scale_factor[ref0];
1364
1365                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1366                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1367                 for(i4=0; i4<4; i4++){
1368                     const int16_t *mv_col = l1mv0[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1369                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1370                     mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8;
1371                     mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
1372                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1373                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1374                 }
1375             }
1376         }
1377     }
1378 }
1379
1380 static inline void write_back_motion(H264Context *h, int mb_type){
1381     MpegEncContext * const s = &h->s;
1382     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1383     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1384     int list;
1385
1386     for(list=0; list<2; list++){
1387         int y;
1388         if(!USES_LIST(mb_type, list)){
1389             if(1){ //FIXME skip or never read if mb_type doesn't use it
1390                 for(y=0; y<4; y++){
1391                     *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]=
1392                     *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= 0;
1393                 }
1394                 if( h->pps.cabac ) {
1395                     /* FIXME needed ? */
1396                     for(y=0; y<4; y++){
1397                         *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]=
1398                         *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= 0;
1399                     }
1400                 }
1401                 for(y=0; y<2; y++){
1402                     s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]=
1403                     s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= LIST_NOT_USED;
1404                 }
1405             }
1406             continue;
1407         }
1408
1409         for(y=0; y<4; y++){
1410             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1411             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1412         }
1413         if( h->pps.cabac ) {
1414             for(y=0; y<4; y++){
1415                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1416                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1417             }
1418         }
1419         for(y=0; y<2; y++){
1420             s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+0 + 16*y];
1421             s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+2 + 16*y];
1422         }
1423     }
1424
1425     if(h->slice_type == B_TYPE && h->pps.cabac){
1426         if(IS_8X8(mb_type)){
1427             h->direct_table[b8_xy+1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1428             h->direct_table[b8_xy+0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1429             h->direct_table[b8_xy+1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1430         }
1431     }
1432 }
1433
1434 /**
1435  * Decodes a network abstraction layer unit.
1436  * @param consumed is the number of bytes used as input
1437  * @param length is the length of the array
1438  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1439  * @returns decoded bytes, might be src+1 if no escapes
1440  */
1441 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1442     int i, si, di;
1443     uint8_t *dst;
1444
1445 //    src[0]&0x80;              //forbidden bit
1446     h->nal_ref_idc= src[0]>>5;
1447     h->nal_unit_type= src[0]&0x1F;
1448
1449     src++; length--;
1450 #if 0
1451     for(i=0; i<length; i++)
1452         printf("%2X ", src[i]);
1453 #endif
1454     for(i=0; i+1<length; i+=2){
1455         if(src[i]) continue;
1456         if(i>0 && src[i-1]==0) i--;
1457         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1458             if(src[i+2]!=3){
1459                 /* startcode, so we must be past the end */
1460                 length=i;
1461             }
1462             break;
1463         }
1464     }
1465
1466     if(i>=length-1){ //no escaped 0
1467         *dst_length= length;
1468         *consumed= length+1; //+1 for the header
1469         return src;
1470     }
1471
1472     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1473     dst= h->rbsp_buffer;
1474
1475 //printf("decoding esc\n");
1476     si=di=0;
1477     while(si<length){
1478         //remove escapes (very rare 1:2^22)
1479         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1480             if(src[si+2]==3){ //escape
1481                 dst[di++]= 0;
1482                 dst[di++]= 0;
1483                 si+=3;
1484                 continue;
1485             }else //next start code
1486                 break;
1487         }
1488
1489         dst[di++]= src[si++];
1490     }
1491
1492     *dst_length= di;
1493     *consumed= si + 1;//+1 for the header
1494 //FIXME store exact number of bits in the getbitcontext (its needed for decoding)
1495     return dst;
1496 }
1497
1498 #if 0
1499 /**
1500  * @param src the data which should be escaped
1501  * @param dst the target buffer, dst+1 == src is allowed as a special case
1502  * @param length the length of the src data
1503  * @param dst_length the length of the dst array
1504  * @returns length of escaped data in bytes or -1 if an error occured
1505  */
1506 static int encode_nal(H264Context *h, uint8_t *dst, uint8_t *src, int length, int dst_length){
1507     int i, escape_count, si, di;
1508     uint8_t *temp;
1509
1510     assert(length>=0);
1511     assert(dst_length>0);
1512
1513     dst[0]= (h->nal_ref_idc<<5) + h->nal_unit_type;
1514
1515     if(length==0) return 1;
1516
1517     escape_count= 0;
1518     for(i=0; i<length; i+=2){
1519         if(src[i]) continue;
1520         if(i>0 && src[i-1]==0)
1521             i--;
1522         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1523             escape_count++;
1524             i+=2;
1525         }
1526     }
1527
1528     if(escape_count==0){
1529         if(dst+1 != src)
1530             memcpy(dst+1, src, length);
1531         return length + 1;
1532     }
1533
1534     if(length + escape_count + 1> dst_length)
1535         return -1;
1536
1537     //this should be damn rare (hopefully)
1538
1539     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length + escape_count);
1540     temp= h->rbsp_buffer;
1541 //printf("encoding esc\n");
1542
1543     si= 0;
1544     di= 0;
1545     while(si < length){
1546         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1547             temp[di++]= 0; si++;
1548             temp[di++]= 0; si++;
1549             temp[di++]= 3;
1550             temp[di++]= src[si++];
1551         }
1552         else
1553             temp[di++]= src[si++];
1554     }
1555     memcpy(dst+1, temp, length+escape_count);
1556
1557     assert(di == length+escape_count);
1558
1559     return di + 1;
1560 }
1561
1562 /**
1563  * write 1,10,100,1000,... for alignment, yes its exactly inverse to mpeg4
1564  */
1565 static void encode_rbsp_trailing(PutBitContext *pb){
1566     int length;
1567     put_bits(pb, 1, 1);
1568     length= (-put_bits_count(pb))&7;
1569     if(length) put_bits(pb, length, 0);
1570 }
1571 #endif
1572
/**
 * Identifies the exact end of the bitstream.
 * Looks for the lowest set bit in the byte at src.
 * @param src the last byte of the bitstream
 * @return the length of the trailing (1 if bit 0 is set ... 8 if only bit 7
 *         is set), or 0 if damaged (no bit set)
 */
static int decode_rbsp_trailing(uint8_t *src){
    const int last= *src;
    int bit;

    tprintf("rbsp trailing %X\n", last);

    for(bit=0; bit<8; bit++){
        if((last>>bit)&1)
            return bit+1;
    }
    return 0;
}
1589
1590 /**
1591  * idct tranforms the 16 dc values and dequantize them.
1592  * @param qp quantization parameter
1593  */
1594 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp){
1595     const int qmul= dequant_coeff[qp][0];
1596 #define stride 16
1597     int i;
1598     int temp[16]; //FIXME check if this is a good idea
1599     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1600     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1601
1602 //memset(block, 64, 2*256);
1603 //return;
1604     for(i=0; i<4; i++){
1605         const int offset= y_offset[i];
1606         const int z0= block[offset+stride*0] + block[offset+stride*4];
1607         const int z1= block[offset+stride*0] - block[offset+stride*4];
1608         const int z2= block[offset+stride*1] - block[offset+stride*5];
1609         const int z3= block[offset+stride*1] + block[offset+stride*5];
1610
1611         temp[4*i+0]= z0+z3;
1612         temp[4*i+1]= z1+z2;
1613         temp[4*i+2]= z1-z2;
1614         temp[4*i+3]= z0-z3;
1615     }
1616
1617     for(i=0; i<4; i++){
1618         const int offset= x_offset[i];
1619         const int z0= temp[4*0+i] + temp[4*2+i];
1620         const int z1= temp[4*0+i] - temp[4*2+i];
1621         const int z2= temp[4*1+i] - temp[4*3+i];
1622         const int z3= temp[4*1+i] + temp[4*3+i];
1623
1624         block[stride*0 +offset]= ((z0 + z3)*qmul + 2)>>2; //FIXME think about merging this into decode_resdual
1625         block[stride*2 +offset]= ((z1 + z2)*qmul + 2)>>2;
1626         block[stride*8 +offset]= ((z1 - z2)*qmul + 2)>>2;
1627         block[stride*10+offset]= ((z0 - z3)*qmul + 2)>>2;
1628     }
1629 }
1630
1631 #if 0
1632 /**
1633  * dct tranforms the 16 dc values.
1634  * @param qp quantization parameter ??? FIXME
1635  */
1636 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1637 //    const int qmul= dequant_coeff[qp][0];
1638     int i;
1639     int temp[16]; //FIXME check if this is a good idea
1640     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1641     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1642
1643     for(i=0; i<4; i++){
1644         const int offset= y_offset[i];
1645         const int z0= block[offset+stride*0] + block[offset+stride*4];
1646         const int z1= block[offset+stride*0] - block[offset+stride*4];
1647         const int z2= block[offset+stride*1] - block[offset+stride*5];
1648         const int z3= block[offset+stride*1] + block[offset+stride*5];
1649
1650         temp[4*i+0]= z0+z3;
1651         temp[4*i+1]= z1+z2;
1652         temp[4*i+2]= z1-z2;
1653         temp[4*i+3]= z0-z3;
1654     }
1655
1656     for(i=0; i<4; i++){
1657         const int offset= x_offset[i];
1658         const int z0= temp[4*0+i] + temp[4*2+i];
1659         const int z1= temp[4*0+i] - temp[4*2+i];
1660         const int z2= temp[4*1+i] - temp[4*3+i];
1661         const int z3= temp[4*1+i] + temp[4*3+i];
1662
1663         block[stride*0 +offset]= (z0 + z3)>>1;
1664         block[stride*2 +offset]= (z1 + z2)>>1;
1665         block[stride*8 +offset]= (z1 - z2)>>1;
1666         block[stride*10+offset]= (z0 - z3)>>1;
1667     }
1668 }
1669 #endif
1670
1671 #undef xStride
1672 #undef stride
1673
1674 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp){
1675     const int qmul= dequant_coeff[qp][0];
1676     const int stride= 16*2;
1677     const int xStride= 16;
1678     int a,b,c,d,e;
1679
1680     a= block[stride*0 + xStride*0];
1681     b= block[stride*0 + xStride*1];
1682     c= block[stride*1 + xStride*0];
1683     d= block[stride*1 + xStride*1];
1684
1685     e= a-b;
1686     a= a+b;
1687     b= c-d;
1688     c= c+d;
1689
1690     block[stride*0 + xStride*0]= ((a+c)*qmul + 0)>>1;
1691     block[stride*0 + xStride*1]= ((e+b)*qmul + 0)>>1;
1692     block[stride*1 + xStride*0]= ((a-c)*qmul + 0)>>1;
1693     block[stride*1 + xStride*1]= ((e-b)*qmul + 0)>>1;
1694 }
1695
1696 #if 0
1697 static void chroma_dc_dct_c(DCTELEM *block){
1698     const int stride= 16*2;
1699     const int xStride= 16;
1700     int a,b,c,d,e;
1701
1702     a= block[stride*0 + xStride*0];
1703     b= block[stride*0 + xStride*1];
1704     c= block[stride*1 + xStride*0];
1705     d= block[stride*1 + xStride*1];
1706
1707     e= a-b;
1708     a= a+b;
1709     b= c-d;
1710     c= c+d;
1711
1712     block[stride*0 + xStride*0]= (a+c);
1713     block[stride*0 + xStride*1]= (e+b);
1714     block[stride*1 + xStride*0]= (a-c);
1715     block[stride*1 + xStride*1]= (e-b);
1716 }
1717 #endif
1718
1719 /**
1720  * gets the chroma qp.
1721  */
1722 static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
1723
1724     return chroma_qp[clip(qscale + chroma_qp_index_offset, 0, 51)];
1725 }
1726
1727
1728 #if 0
1729 static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int stride){
1730     int i;
1731     //FIXME try int temp instead of block
1732
1733     for(i=0; i<4; i++){
1734         const int d0= src1[0 + i*stride] - src2[0 + i*stride];
1735         const int d1= src1[1 + i*stride] - src2[1 + i*stride];
1736         const int d2= src1[2 + i*stride] - src2[2 + i*stride];
1737         const int d3= src1[3 + i*stride] - src2[3 + i*stride];
1738         const int z0= d0 + d3;
1739         const int z3= d0 - d3;
1740         const int z1= d1 + d2;
1741         const int z2= d1 - d2;
1742
1743         block[0 + 4*i]=   z0 +   z1;
1744         block[1 + 4*i]= 2*z3 +   z2;
1745         block[2 + 4*i]=   z0 -   z1;
1746         block[3 + 4*i]=   z3 - 2*z2;
1747     }
1748
1749     for(i=0; i<4; i++){
1750         const int z0= block[0*4 + i] + block[3*4 + i];
1751         const int z3= block[0*4 + i] - block[3*4 + i];
1752         const int z1= block[1*4 + i] + block[2*4 + i];
1753         const int z2= block[1*4 + i] - block[2*4 + i];
1754
1755         block[0*4 + i]=   z0 +   z1;
1756         block[1*4 + i]= 2*z3 +   z2;
1757         block[2*4 + i]=   z0 -   z1;
1758         block[3*4 + i]=   z3 - 2*z2;
1759     }
1760 }
1761 #endif
1762
1763 //FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close
1764 //FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
1765 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int seperate_dc){
1766     int i;
1767     const int * const quant_table= quant_coeff[qscale];
1768     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1769     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1770     const unsigned int threshold2= (threshold1<<1);
1771     int last_non_zero;
1772
1773     if(seperate_dc){
1774         if(qscale<=18){
1775             //avoid overflows
1776             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1777             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1778             const unsigned int dc_threshold2= (dc_threshold1<<1);
1779
1780             int level= block[0]*quant_coeff[qscale+18][0];
1781             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1782                 if(level>0){
1783                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1784                     block[0]= level;
1785                 }else{
1786                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1787                     block[0]= -level;
1788                 }
1789 //                last_non_zero = i;
1790             }else{
1791                 block[0]=0;
1792             }
1793         }else{
1794             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1795             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1796             const unsigned int dc_threshold2= (dc_threshold1<<1);
1797
1798             int level= block[0]*quant_table[0];
1799             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1800                 if(level>0){
1801                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1802                     block[0]= level;
1803                 }else{
1804                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1805                     block[0]= -level;
1806                 }
1807 //                last_non_zero = i;
1808             }else{
1809                 block[0]=0;
1810             }
1811         }
1812         last_non_zero= 0;
1813         i=1;
1814     }else{
1815         last_non_zero= -1;
1816         i=0;
1817     }
1818
1819     for(; i<16; i++){
1820         const int j= scantable[i];
1821         int level= block[j]*quant_table[j];
1822
1823 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1824 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1825         if(((unsigned)(level+threshold1))>threshold2){
1826             if(level>0){
1827                 level= (bias + level)>>QUANT_SHIFT;
1828                 block[j]= level;
1829             }else{
1830                 level= (bias - level)>>QUANT_SHIFT;
1831                 block[j]= -level;
1832             }
1833             last_non_zero = i;
1834         }else{
1835             block[j]=0;
1836         }
1837     }
1838
1839     return last_non_zero;
1840 }
1841
/**
 * 4x4 vertical prediction, every row becomes a copy of the 4 samples above
 * the block.
 * @param src top left sample of the block, accessed as 32bit words
 * @param topright unused
 * @param stride distance between 2 rows
 */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t above= *(uint32_t*)(src-stride);
    int row;

    for(row=0; row<4; row++)
        *(uint32_t*)(src+row*stride)= above;
}
1849
/**
 * 4x4 horizontal prediction, every row is filled with the sample left of it.
 * @param src top left sample of the block, accessed as 32bit words
 * @param topright unused
 * @param stride distance between 2 rows
 */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    int row;

    for(row=0; row<4; row++){
        //unsigned multiply, sample*0x01010101 overflows a signed int for samples >= 128
        *(uint32_t*)(src+row*stride)= src[-1+row*stride]*0x01010101U;
    }
}
1856
/**
 * 4x4 dc prediction, the block is filled with the rounded mean of the 4
 * samples above and the 4 samples left of it.
 * @param src top left sample of the block, accessed as 32bit words
 * @param topright unused
 * @param stride distance between 2 rows
 */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                        + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
    //unsigned multiply, dc*0x01010101 overflows a signed int for dc >= 128
    const uint32_t fill= dc*0x01010101U;
    int row;

    for(row=0; row<4; row++)
        *(uint32_t*)(src+row*stride)= fill;
}
1866
/**
 * 4x4 dc prediction from the left edge only, the block is filled with the
 * rounded mean of the 4 samples left of it.
 * @param src top left sample of the block, accessed as 32bit words
 * @param topright unused
 * @param stride distance between 2 rows
 */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
    //unsigned multiply, dc*0x01010101 overflows a signed int for dc >= 128
    const uint32_t fill= dc*0x01010101U;
    int row;

    for(row=0; row<4; row++)
        *(uint32_t*)(src+row*stride)= fill;
}
1875
/**
 * 4x4 dc prediction from the top edge only, the block is filled with the
 * rounded mean of the 4 samples above it.
 * @param src top left sample of the block, accessed as 32bit words
 * @param topright unused
 * @param stride distance between 2 rows
 */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
    //unsigned multiply, dc*0x01010101 overflows a signed int for dc >= 128
    const uint32_t fill= dc*0x01010101U;
    int row;

    for(row=0; row<4; row++)
        *(uint32_t*)(src+row*stride)= fill;
}
1884
/**
 * 4x4 dc prediction without any neighbours, the block is filled with 128.
 * @param src top left sample of the block, accessed as 32bit words
 * @param topright unused
 * @param stride distance between 2 rows
 */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t grey= 128U*0x01010101U;
    int row;

    for(row=0; row<4; row++)
        *(uint32_t*)(src+row*stride)= grey;
}
1891
1892
/*
 * Neighbour loading helpers of the 4x4 predictors. Each one declares 4
 * constants and needs src, stride or topright of the surrounding function.
 * The last line of a macro carries no line continuation, so the definition
 * does not depend on being followed by an empty line.
 */

//t4..t7: the 4 samples at topright
#define LOAD_TOP_RIGHT_EDGE\
    const int t4= topright[0];\
    const int t5= topright[1];\
    const int t6= topright[2];\
    const int t7= topright[3];

//l0..l3: the 4 samples left of the block, top to bottom
#define LOAD_LEFT_EDGE\
    const int l0= src[-1+0*stride];\
    const int l1= src[-1+1*stride];\
    const int l2= src[-1+2*stride];\
    const int l3= src[-1+3*stride];

//t0..t3: the 4 samples above the block, left to right
#define LOAD_TOP_EDGE\
    const int t0= src[ 0-1*stride];\
    const int t1= src[ 1-1*stride];\
    const int t2= src[ 2-1*stride];\
    const int t3= src[ 3-1*stride];
1910
/**
 * 4x4 diagonal down right prediction.
 * Every sample is the (1,2,1)/4 filtered value of 3 consecutive neighbours
 * on the edge running from the bottom of the left column over the top left
 * corner to the end of the top row; samples on one diagonal are equal.
 * @param src top left sample of the block
 * @param topright unused
 * @param stride distance between 2 rows
 */
static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
    int edge[9]; //l3 l2 l1 l0 lt t0 t1 t2 t3
    int x, y;

    //fetch all neighbours before the block is written
    for(y=0; y<4; y++)
        edge[3-y]= src[-1+y*stride];
    for(x=-1; x<4; x++)
        edge[5+x]= src[x-stride];

    for(y=0; y<4; y++){
        for(x=0; x<4; x++){
            const int * const e= edge + 3 + x - y;

            src[x+y*stride]= (e[0] + 2*e[1] + e[2] + 2)>>2;
        }
    }
}
1933
/**
 * 4x4 diagonal down left prediction.
 * Every sample is the (1,2,1)/4 filtered value of 3 consecutive samples of
 * the row above the block, which is continued by topright; the bottom right
 * sample weights the last neighbour 3 times instead.
 * @param src top left sample of the block
 * @param topright the 4 samples continuing the row above the block
 * @param stride distance between 2 rows
 */
static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
    int top[8]; //t0..t7
    int x, y;

    //fetch all neighbours before the block is written
    for(x=0; x<4; x++){
        top[  x]= src[x-stride];
        top[4+x]= topright[x];
    }

    for(y=0; y<4; y++){
        for(x=0; x<4; x++){
            const int k= x+y;

            if(k<6)
                src[x+y*stride]= (top[k] + top[k+2] + 2*top[k+1] + 2)>>2;
            else
                src[x+y*stride]= (top[6] + 3*top[7] + 2)>>2;
        }
    }
}
1956
/**
 * 4x4 vertical right prediction from the top left corner, the row above and
 * the upper 3 samples left of the block.
 * Rows 0 and 1 are 2 tap and 3 tap filtered top neighbours, rows 2 and 3
 * repeat them shifted right by 1 behind a filtered left neighbour.
 * @param src top left sample of the block
 * @param topright unused
 * @param stride distance between 2 rows
 */
static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-1*stride];
    const int t0= src[ 0-1*stride];
    const int t1= src[ 1-1*stride];
    const int t2= src[ 2-1*stride];
    const int t3= src[ 3-1*stride];
    const int l0= src[-1+0*stride];
    const int l1= src[-1+1*stride];
    const int l2= src[-1+2*stride];
    //2 tap averages used by rows 0 and 2
    const int avg0= (lt + t0 + 1)>>1;
    const int avg1= (t0 + t1 + 1)>>1;
    const int avg2= (t1 + t2 + 1)>>1;
    const int avg3= (t2 + t3 + 1)>>1;
    //3 tap values used by rows 1 and 3
    const int flt0= (l0 + 2*lt + t0 + 2)>>2;
    const int flt1= (lt + 2*t0 + t1 + 2)>>2;
    const int flt2= (t0 + 2*t1 + t2 + 2)>>2;
    const int flt3= (t1 + 2*t2 + t3 + 2)>>2;
    uint8_t *row= src;

    row[0]= avg0; row[1]= avg1; row[2]= avg2; row[3]= avg3;
    row+= stride;
    row[0]= flt0; row[1]= flt1; row[2]= flt2; row[3]= flt3;
    row+= stride;
    row[0]= (lt + 2*l0 + l1 + 2)>>2;
    row[1]= avg0; row[2]= avg1; row[3]= avg2;
    row+= stride;
    row[0]= (l0 + 2*l1 + l2 + 2)>>2;
    row[1]= flt0; row[2]= flt1; row[3]= flt2;
}
1980
/**
 * 4x4 vertical left prediction from the row above the block, which is
 * continued by topright.
 * Even rows hold 2 tap averages and odd rows (1,2,1)/4 filtered values of
 * the top neighbours; rows 2 and 3 start 1 neighbour further right.
 * @param src top left sample of the block
 * @param topright the samples continuing the row above the block, the first
 *        3 are used
 * @param stride distance between 2 rows
 */
static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
    int top[7]; //t0..t6
    int x, y;

    //fetch all neighbours before the block is written
    for(x=0; x<4; x++)
        top[x]= src[x-stride];
    for(x=0; x<3; x++)
        top[4+x]= topright[x];

    for(y=0; y<4; y++){
        const int * const t= top + (y>>1);

        for(x=0; x<4; x++){
            if(y&1)
                src[x+y*stride]= (t[x] + 2*t[x+1] + t[x+2] + 2)>>2;
            else
                src[x+y*stride]= (t[x] + t[x+1] + 1)>>1;
        }
    }
}
2003
/**
 * 4x4 horizontal up prediction from the 4 samples left of the block.
 * Averages and 3 tap filtered values of the left neighbours alternate along
 * each row, every row starts 1 neighbour further down and positions past the
 * last neighbour repeat it.
 * @param src top left sample of the block
 * @param topright unused
 * @param stride distance between 2 rows
 */
static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
    const int l0= src[-1+0*stride];
    const int l1= src[-1+1*stride];
    const int l2= src[-1+2*stride];
    const int l3= src[-1+3*stride];
    //the values in the order in which they appear along a row
    const int v0= (l0 + l1 + 1)>>1;
    const int v1= (l0 + 2*l1 + l2 + 2)>>2;
    const int v2= (l1 + l2 + 1)>>1;
    const int v3= (l1 + 2*l2 + l3 + 2)>>2;
    const int v4= (l2 + l3 + 1)>>1;
    const int v5= (l2 + 2*l3 + l3 + 2)>>2;
    uint8_t *row= src;

    row[0]= v0; row[1]= v1; row[2]= v2; row[3]= v3;
    row+= stride;
    row[0]= v2; row[1]= v3; row[2]= v4; row[3]= v5;
    row+= stride;
    row[0]= v4; row[1]= v5; row[2]= l3; row[3]= l3;
    row+= stride;
    row[0]= l3; row[1]= l3; row[2]= l3; row[3]= l3;
}
2024
/**
 * 4x4 horizontal down prediction from the top left corner, the 4 samples
 * left of the block and the first 3 samples above it.
 * Each row starts with an average and a 3 tap filtered value of the left
 * neighbours, followed by the first 2 samples of the row above it.
 * @param src top left sample of the block
 * @param topright unused
 * @param stride distance between 2 rows
 */
static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-1*stride];
    const int t0= src[ 0-1*stride];
    const int t1= src[ 1-1*stride];
    const int t2= src[ 2-1*stride];
    const int l0= src[-1+0*stride];
    const int l1= src[-1+1*stride];
    const int l2= src[-1+2*stride];
    const int l3= src[-1+3*stride];
    //the pair of values which opens each row
    const int avg0= (lt + l0 + 1)>>1;
    const int flt0= (l0 + 2*lt + t0 + 2)>>2;
    const int avg1= (l0 + l1 + 1)>>1;
    const int flt1= (lt + 2*l0 + l1 + 2)>>2;
    const int avg2= (l1 + l2+ 1)>>1;
    const int flt2= (l0 + 2*l1 + l2 + 2)>>2;
    uint8_t *row= src;

    row[0]= avg0; row[1]= flt0;
    row[2]= (lt + 2*t0 + t1 + 2)>>2;
    row[3]= (t0 + 2*t1 + t2 + 2)>>2;
    row+= stride;
    row[0]= avg1; row[1]= flt1; row[2]= avg0; row[3]= flt0;
    row+= stride;
    row[0]= avg2; row[1]= flt2; row[2]= avg1; row[3]= flt1;
    row+= stride;
    row[0]= (l2 + l3 + 1)>>1;
    row[1]= (l1 + 2*l2 + l3 + 2)>>2;
    row[2]= avg2; row[3]= flt2;
}
2048
/**
 * 16x16 vertical prediction, every row becomes a copy of the 16 samples
 * above the block.
 * @param src top left sample of the block, accessed as 32bit words
 * @param stride distance between 2 rows
 */
static void pred16x16_vertical_c(uint8_t *src, int stride){
    const uint32_t * const above= (uint32_t*)(src-stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    const uint32_t w2= above[2];
    const uint32_t w3= above[3];
    int row;

    for(row=0; row<16; row++){
        uint32_t * const line= (uint32_t*)(src+row*stride);

        line[0]= w0;
        line[1]= w1;
        line[2]= w2;
        line[3]= w3;
    }
}
2063
/**
 * 16x16 horizontal prediction, every row is filled with the sample left of
 * it.
 * @param src top left sample of the block, accessed as 32bit words
 * @param stride distance between 2 rows
 */
static void pred16x16_horizontal_c(uint8_t *src, int stride){
    int row;

    for(row=0; row<16; row++){
        uint32_t * const line= (uint32_t*)(src+row*stride);
        //unsigned multiply, sample*0x01010101 overflows a signed int for samples >= 128
        const uint32_t fill= src[-1+row*stride]*0x01010101U;

        line[0]= fill;
        line[1]= fill;
        line[2]= fill;
        line[3]= fill;
    }
}
2074
/**
 * 16x16 dc prediction, the block is filled with the rounded mean of the 16
 * samples left of it and the 16 samples above it.
 * @param src top left sample of the block, accessed as 32bit words
 * @param stride distance between 2 rows
 */
static void pred16x16_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t fill;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }

    //unsigned multiply, 0x01010101*mean overflows a signed int for a mean >= 128
    fill= 0x01010101U*((sum + 16)>>5);

    for(i=0; i<16; i++){
        uint32_t * const line= (uint32_t*)(src+i*stride);

        line[0]= fill;
        line[1]= fill;
        line[2]= fill;
        line[3]= fill;
    }
}
2095
/**
 * 16x16 dc prediction from the left edge only, the block is filled with the
 * rounded mean of the 16 samples left of it.
 * @param src top left sample of the block, accessed as 32bit words
 * @param stride distance between 2 rows
 */
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t fill;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    //unsigned multiply, 0x01010101*mean overflows a signed int for a mean >= 128
    fill= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        uint32_t * const line= (uint32_t*)(src+i*stride);

        line[0]= fill;
        line[1]= fill;
        line[2]= fill;
        line[3]= fill;
    }
}
2112
/**
 * 16x16 dc prediction from the top edge only, the block is filled with the
 * rounded mean of the 16 samples above it.
 * @param src top left sample of the block, accessed as 32bit words
 * @param stride distance between 2 rows
 */
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t fill;

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }
    //unsigned multiply, 0x01010101*mean overflows a signed int for a mean >= 128
    fill= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        uint32_t * const line= (uint32_t*)(src+i*stride);

        line[0]= fill;
        line[1]= fill;
        line[2]= fill;
        line[3]= fill;
    }
}
2128
/**
 * 16x16 dc prediction without any neighbours, the block is filled with 128.
 * @param src top left sample of the block, accessed as 32bit words
 * @param stride distance between 2 rows
 */
static void pred16x16_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int row, word;

    for(row=0; row<16; row++){
        uint32_t * const line= (uint32_t*)(src+row*stride);

        for(word=0; word<4; word++)
            line[word]= grey;
    }
}
2139
2140 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
2141   int i, j, k;
2142   int a;
2143   uint8_t *cm = cropTbl + MAX_NEG_CROP;
2144   const uint8_t * const src0 = src+7-stride;
2145   const uint8_t *src1 = src+8*stride-1;
2146   const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
2147   int H = src0[1] - src0[-1];
2148   int V = src1[0] - src2[ 0];
2149   for(k=2; k<=8; ++k) {
2150     src1 += stride; src2 -= stride;
2151     H += k*(src0[k] - src0[-k]);
2152     V += k*(src1[0] - src2[ 0]);
2153   }
2154   if(svq3){
2155     H = ( 5*(H/4) ) / 16;
2156     V = ( 5*(V/4) ) / 16;
2157
2158     /* required for 100% accuracy */
2159     i = H; H = V; V = i;
2160   }else{
2161     H = ( 5*H+32 ) >> 6;
2162     V = ( 5*V+32 ) >> 6;
2163   }
2164
2165   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2166   for(j=16; j>0; --j) {
2167     int b = a;
2168     a += V;
2169     for(i=-16; i<0; i+=4) {
2170       src[16+i] = cm[ (b    ) >> 5 ];
2171       src[17+i] = cm[ (b+  H) >> 5 ];
2172       src[18+i] = cm[ (b+2*H) >> 5 ];
2173       src[19+i] = cm[ (b+3*H) >> 5 ];
2174       b += 4*H;
2175     }
2176     src += stride;
2177   }
2178 }
2179
/**
 * 16x16 plane prediction, this is pred16x16_plane_compat_c() with the svq3
 * variant switched off.
 * @param src top left sample of the block
 * @param stride distance between 2 rows
 */
static void pred16x16_plane_c(uint8_t *src, int stride){
    const int svq3= 0;

    pred16x16_plane_compat_c(src, stride, svq3);
}
2183
/**
 * 8x8 vertical prediction, every row becomes a copy of the 8 samples above
 * the block.
 * @param src top left sample of the block, accessed as 32bit words
 * @param stride distance between 2 rows
 */
static void pred8x8_vertical_c(uint8_t *src, int stride){
    const uint32_t * const above= (uint32_t*)(src-stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    int row;

    for(row=0; row<8; row++){
        uint32_t * const line= (uint32_t*)(src+row*stride);

        line[0]= w0;
        line[1]= w1;
    }
}
2194
/**
 * 8x8 horizontal prediction, every row is filled with the sample left of it.
 * @param src top left sample of the block, accessed as 32bit words
 * @param stride distance between 2 rows
 */
static void pred8x8_horizontal_c(uint8_t *src, int stride){
    int row;

    for(row=0; row<8; row++){
        uint32_t * const line= (uint32_t*)(src+row*stride);
        //unsigned multiply, sample*0x01010101 overflows a signed int for samples >= 128
        const uint32_t fill= src[-1+row*stride]*0x01010101U;

        line[0]= fill;
        line[1]= fill;
    }
}
2203
/**
 * 8x8 dc prediction without any neighbours, the block is filled with 128.
 * @param src top left sample of the block, accessed as 32bit words
 * @param stride distance between 2 rows
 */
static void pred8x8_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int row;

    for(row=0; row<8; row++){
        uint32_t * const line= (uint32_t*)(src+row*stride);

        line[0]= grey;
        line[1]= grey;
    }
}
2212
/**
 * 8x8 dc prediction from the left edge only.
 * The upper 4 rows are filled with the rounded mean of the upper 4 left
 * neighbours and the lower 4 rows with the mean of the lower 4.
 * @param src top left sample of the block, accessed as 32bit words
 * @param stride distance between 2 rows
 */
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int upper_sum= 0, lower_sum= 0;
    uint32_t upper_fill, lower_fill;

    for(i=0;i<4; i++){
        upper_sum+= src[-1+i*stride];
        lower_sum+= src[-1+(i+4)*stride];
    }
    //unsigned multiplies, 0x01010101*mean overflows a signed int for a mean >= 128
    upper_fill= 0x01010101U*((upper_sum + 2)>>2);
    lower_fill= 0x01010101U*((lower_sum + 2)>>2);

    for(i=0; i<8; i++){
        uint32_t * const line= (uint32_t*)(src+i*stride);
        const uint32_t fill= i<4 ? upper_fill : lower_fill;

        line[0]= fill;
        line[1]= fill;
    }
}
2234
/**
 * 8x8 dc prediction from the top edge only.
 * The left 4 columns are filled with the rounded mean of the left 4 top
 * neighbours and the right 4 columns with the mean of the right 4.
 * @param src top left sample of the block, accessed as 32bit words
 * @param stride distance between 2 rows
 */
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int left_sum= 0, right_sum= 0;
    uint32_t left_fill, right_fill;

    for(i=0;i<4; i++){
        left_sum += src[i-stride];
        right_sum+= src[4+i-stride];
    }
    //unsigned multiplies, 0x01010101*mean overflows a signed int for a mean >= 128
    left_fill = 0x01010101U*((left_sum  + 2)>>2);
    right_fill= 0x01010101U*((right_sum + 2)>>2);

    for(i=0; i<8; i++){
        uint32_t * const line= (uint32_t*)(src+i*stride);

        line[0]= left_fill;
        line[1]= right_fill;
    }
}
2256
2257
/**
 * 8x8 dc prediction, each 4x4 quadrant gets its own dc value.
 * top left: mean of the 4 left and the 4 top neighbours next to it,
 * top right: mean of the right 4 top neighbours,
 * bottom left: mean of the lower 4 left neighbours,
 * bottom right: mean of the right 4 top and the lower 4 left neighbours.
 * @param src top left sample of the block, accessed as 32bit words
 * @param stride distance between 2 rows
 */
static void pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int corner_sum= 0, top_sum= 0, left_sum= 0;
    uint32_t fill0, fill1, fill2, fill3;

    for(i=0;i<4; i++){
        corner_sum+= src[-1+i*stride] + src[i-stride];
        top_sum   += src[4+i-stride];
        left_sum  += src[-1+(i+4)*stride];
    }
    //unsigned multiplies, 0x01010101*mean overflows a signed int for a mean >= 128
    fill0= 0x01010101U*((corner_sum + 4)>>3);
    fill1= 0x01010101U*((top_sum + 2)>>2);
    fill2= 0x01010101U*((left_sum + 2)>>2);
    fill3= 0x01010101U*((top_sum + left_sum + 4)>>3);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= fill0;
        ((uint32_t*)(src+i*stride))[1]= fill1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= fill2;
        ((uint32_t*)(src+i*stride))[1]= fill3;
    }
}
2282
2283 static void pred8x8_plane_c(uint8_t *src, int stride){
2284   int j, k;
2285   int a;
2286   uint8_t *cm = cropTbl + MAX_NEG_CROP;
2287   const uint8_t * const src0 = src+3-stride;
2288   const uint8_t *src1 = src+4*stride-1;
2289   const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
2290   int H = src0[1] - src0[-1];
2291   int V = src1[0] - src2[ 0];
2292   for(k=2; k<=4; ++k) {
2293     src1 += stride; src2 -= stride;
2294     H += k*(src0[k] - src0[-k]);
2295     V += k*(src1[0] - src2[ 0]);
2296   }
2297   H = ( 17*H+16 ) >> 5;
2298   V = ( 17*V+16 ) >> 5;
2299
2300   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2301   for(j=8; j>0; --j) {
2302     int b = a;
2303     a += V;
2304     src[0] = cm[ (b    ) >> 5 ];
2305     src[1] = cm[ (b+  H) >> 5 ];
2306     src[2] = cm[ (b+2*H) >> 5 ];
2307     src[3] = cm[ (b+3*H) >> 5 ];
2308     src[4] = cm[ (b+4*H) >> 5 ];
2309     src[5] = cm[ (b+5*H) >> 5 ];
2310     src[6] = cm[ (b+6*H) >> 5 ];
2311     src[7] = cm[ (b+7*H) >> 5 ];
2312     src += stride;
2313   }
2314 }
2315
/*
 * Helpers of the pred8x8l_*_c functions. They expand inside a function which
 * provides src and stride, and has_topleft / has_topright where they are
 * referenced. The PREDICT_8x8_LOAD_* macros declare low pass filtered
 * neighbours: l0..l7 left, t0..t7 top, t8..t15 top right and lt top left.
 */

//the sample at column col and row row relative to the top left of the block
#define SRC(col,row) src[(col)+(row)*stride]

//declares l<row>, the (1,2,1)/4 filtered left neighbour of an inner row
#define PL(row) \
    const int l##row = (SRC(-1,row-1) + 2*SRC(-1,row) + SRC(-1,row+1) + 2) >> 2;
//l0 falls back to the unfiltered neighbour if there is no top left sample
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) \
    PL(2) \
    PL(3) \
    PL(4) \
    PL(5) \
    PL(6) \
    const int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

//declares t<col>, the (1,2,1)/4 filtered top neighbour of an inner column
#define PT(col) \
    const int t##col = (SRC(col-1,-1) + 2*SRC(col,-1) + SRC(col+1,-1) + 2) >> 2;
//t0 and t7 repeat the edge sample if the top left / top right is missing
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) \
    PT(2) \
    PT(3) \
    PT(4) \
    PT(5) \
    PT(6) \
    const int t7 = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

//assigns the already declared t<col>, used for the top right neighbours
#define PTR(col) \
    t##col = (SRC(col-1,-1) + 2*SRC(col,-1) + SRC(col+1,-1) + 2) >> 2;
//without a top right block all of t8..t15 repeat the unfiltered SRC(7,-1)
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) \
        PTR(9) \
        PTR(10) \
        PTR(11) \
        PTR(12) \
        PTR(13) \
        PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

//fills the 8 rows starting at src with the 32bit pattern value, this
//declares y and advances src by 8 rows
#define PREDICT_8x8_DC(value) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((uint32_t*)src)[0] = \
        ((uint32_t*)src)[1] = value; \
        src += stride; \
    }
2353
/**
 * Fills the 8x8 luma block at src with the constant sample value 128.
 * has_topleft and has_topright are not used.
 */
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    int row;

    for (row = 0; row < 8; row++)
        memset(src + row*stride, 0x80, 8);
}
2358 static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2359 {
2360     PREDICT_8x8_LOAD_LEFT;
2361     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2362     PREDICT_8x8_DC(dc);
2363 }
2364 static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2365 {
2366     PREDICT_8x8_LOAD_TOP;
2367     const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2368     PREDICT_8x8_DC(dc);
2369 }
2370 static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2371 {
2372     PREDICT_8x8_LOAD_LEFT;
2373     PREDICT_8x8_LOAD_TOP;
2374     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2375                          +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2376     PREDICT_8x8_DC(dc);
2377 }
2378 static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2379 {
2380     PREDICT_8x8_LOAD_LEFT;
2381 #define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
2382                ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
2383     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2384 #undef ROW
2385 }
2386 static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2387 {
2388     int y;
2389     PREDICT_8x8_LOAD_TOP;
2390     src[0] = t0;
2391     src[1] = t1;
2392     src[2] = t2;
2393     src[3] = t3;
2394     src[4] = t4;
2395     src[5] = t5;
2396     src[6] = t6;
2397     src[7] = t7;
2398     for( y = 1; y < 8; y++ )
2399         *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
2400 }
2401 static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2402 {
2403     PREDICT_8x8_LOAD_TOP;
2404     PREDICT_8x8_LOAD_TOPRIGHT;
2405     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
2406     SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
2407     SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
2408     SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
2409     SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
2410     SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2411     SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
2412     SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
2413     SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
2414     SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
2415     SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
2416     SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
2417     SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
2418     SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
2419     SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
2420 }
2421 static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2422 {
2423     PREDICT_8x8_LOAD_TOP;
2424     PREDICT_8x8_LOAD_LEFT;
2425     PREDICT_8x8_LOAD_TOPLEFT;
2426     SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
2427     SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2428     SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
2429     SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2430     SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
2431     SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2432     SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
2433     SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
2434     SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
2435     SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
2436     SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
2437     SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
2438     SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
2439     SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2440     SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2441
2442 }
2443 static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2444 {
2445     PREDICT_8x8_LOAD_TOP;
2446     PREDICT_8x8_LOAD_LEFT;
2447     PREDICT_8x8_LOAD_TOPLEFT;
2448     SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
2449     SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2450     SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
2451     SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2452     SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
2453     SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2454     SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
2455     SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
2456     SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
2457     SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
2458     SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
2459     SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
2460     SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
2461     SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
2462     SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
2463     SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
2464     SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
2465     SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
2466     SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
2467     SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
2468     SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2469     SRC(7,0)= (t6 + t7 + 1) >> 1;
2470 }
2471 static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2472 {
2473     PREDICT_8x8_LOAD_TOP;
2474     PREDICT_8x8_LOAD_LEFT;
2475     PREDICT_8x8_LOAD_TOPLEFT;
2476     SRC(0,7)= (l6 + l7 + 1) >> 1;
2477     SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
2478     SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
2479     SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
2480     SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
2481     SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
2482     SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
2483     SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
2484     SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
2485     SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
2486     SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
2487     SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
2488     SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
2489     SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
2490     SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
2491     SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
2492     SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
2493     SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
2494     SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
2495     SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
2496     SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
2497     SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
2498 }
2499 static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2500 {
2501     PREDICT_8x8_LOAD_TOP;
2502     PREDICT_8x8_LOAD_TOPRIGHT;
2503     SRC(0,0)= (t0 + t1 + 1) >> 1;
2504     SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
2505     SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
2506     SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
2507     SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
2508     SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
2509     SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
2510     SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
2511     SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
2512     SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2513     SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
2514     SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2515     SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
2516     SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
2517     SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
2518     SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
2519     SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
2520     SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
2521     SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
2522     SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
2523     SRC(7,6)= (t10 + t11 + 1) >> 1;
2524     SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
2525 }
2526 static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2527 {
2528     PREDICT_8x8_LOAD_LEFT;
2529     SRC(0,0)= (l0 + l1 + 1) >> 1;
2530     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
2531     SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
2532     SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
2533     SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
2534     SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
2535     SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
2536     SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
2537     SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
2538     SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
2539     SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
2540     SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
2541     SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
2542     SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
2543     SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
2544     SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
2545     SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
2546     SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
2547 }
/* The 8x8 luma prediction helper macros are only meant for the
 * pred8x8l_*_c() functions above; drop them again (definition order). */
#undef SRC
#undef PL
#undef PREDICT_8x8_LOAD_LEFT
#undef PT
#undef PREDICT_8x8_LOAD_TOP
#undef PTR
#undef PREDICT_8x8_LOAD_TOPRIGHT
#undef PREDICT_8x8_LOAD_TOPLEFT
#undef PREDICT_8x8_DC
2557
2558 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2559                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2560                            int src_x_offset, int src_y_offset,
2561                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2562     MpegEncContext * const s = &h->s;
2563     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2564     const int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2565     const int luma_xy= (mx&3) + ((my&3)<<2);
2566     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*s->linesize;
2567     uint8_t * src_cb= pic->data[1] + (mx>>3) + (my>>3)*s->uvlinesize;
2568     uint8_t * src_cr= pic->data[2] + (mx>>3) + (my>>3)*s->uvlinesize;
2569     int extra_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16; //FIXME increase edge?, IMHO not worth it
2570     int extra_height= extra_width;
2571     int emu=0;
2572     const int full_mx= mx>>2;
2573     const int full_my= my>>2;
2574
2575     assert(pic->data[0]);
2576
2577     if(mx&7) extra_width -= 3;
2578     if(my&7) extra_height -= 3;
2579
2580     if(   full_mx < 0-extra_width
2581        || full_my < 0-extra_height
2582        || full_mx + 16/*FIXME*/ > s->width + extra_width
2583        || full_my + 16/*FIXME*/ > s->height + extra_height){
2584         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*s->linesize, s->linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, s->width, s->height);
2585             src_y= s->edge_emu_buffer + 2 + 2*s->linesize;
2586         emu=1;
2587     }
2588
2589     qpix_op[luma_xy](dest_y, src_y, s->linesize); //FIXME try variable height perhaps?
2590     if(!square){
2591         qpix_op[luma_xy](dest_y + delta, src_y + delta, s->linesize);
2592     }
2593
2594     if(s->flags&CODEC_FLAG_GRAY) return;
2595
2596     if(emu){
2597         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, s->uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), s->width>>1, s->height>>1);
2598             src_cb= s->edge_emu_buffer;
2599     }
2600     chroma_op(dest_cb, src_cb, s->uvlinesize, chroma_height, mx&7, my&7);
2601
2602     if(emu){
2603         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, s->uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), s->width>>1, s->height>>1);
2604             src_cr= s->edge_emu_buffer;
2605     }
2606     chroma_op(dest_cr, src_cr, s->uvlinesize, chroma_height, mx&7, my&7);
2607 }
2608
2609 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2610                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2611                            int x_offset, int y_offset,
2612                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2613                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2614                            int list0, int list1){
2615     MpegEncContext * const s = &h->s;
2616     qpel_mc_func *qpix_op=  qpix_put;
2617     h264_chroma_mc_func chroma_op= chroma_put;
2618
2619     dest_y  += 2*x_offset + 2*y_offset*s->  linesize;
2620     dest_cb +=   x_offset +   y_offset*s->uvlinesize;
2621     dest_cr +=   x_offset +   y_offset*s->uvlinesize;
2622     x_offset += 8*s->mb_x;
2623     y_offset += 8*s->mb_y;
2624
2625     if(list0){
2626         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2627         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2628                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2629                            qpix_op, chroma_op);
2630
2631         qpix_op=  qpix_avg;
2632         chroma_op= chroma_avg;
2633     }
2634
2635     if(list1){
2636         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2637         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2638                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2639                            qpix_op, chroma_op);
2640     }
2641 }
2642
2643 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2644                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2645                            int x_offset, int y_offset,
2646                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2647                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2648                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2649                            int list0, int list1){
2650     MpegEncContext * const s = &h->s;
2651
2652     dest_y  += 2*x_offset + 2*y_offset*s->  linesize;
2653     dest_cb +=   x_offset +   y_offset*s->uvlinesize;
2654     dest_cr +=   x_offset +   y_offset*s->uvlinesize;
2655     x_offset += 8*s->mb_x;
2656     y_offset += 8*s->mb_y;
2657
2658     if(list0 && list1){
2659         /* don't optimize for luma-only case, since B-frames usually
2660          * use implicit weights => chroma too. */
2661         uint8_t *tmp_cb = s->obmc_scratchpad;
2662         uint8_t *tmp_cr = tmp_cb + 8*s->uvlinesize;
2663         uint8_t *tmp_y  = tmp_cr + 8*s->uvlinesize;
2664         int refn0 = h->ref_cache[0][ scan8[n] ];
2665         int refn1 = h->ref_cache[1][ scan8[n] ];
2666
2667         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2668                     dest_y, dest_cb, dest_cr,
2669                     x_offset, y_offset, qpix_put, chroma_put);
2670         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2671                     tmp_y, tmp_cb, tmp_cr,
2672                     x_offset, y_offset, qpix_put, chroma_put);
2673
2674         if(h->use_weight == 2){
2675             int weight0 = h->implicit_weight[refn0][refn1];
2676             int weight1 = 64 - weight0;
2677             luma_weight_avg(  dest_y,  tmp_y,  s->  linesize, 5, weight0, weight1, 0, 0);
2678             chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, 5, weight0, weight1, 0, 0);
2679             chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, 5, weight0, weight1, 0, 0);
2680         }else{
2681             luma_weight_avg(dest_y, tmp_y, s->linesize, h->luma_log2_weight_denom,
2682                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
2683                             h->luma_offset[0][refn0], h->luma_offset[1][refn1]);
2684             chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, h->chroma_log2_weight_denom,
2685                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
2686                             h->chroma_offset[0][refn0][0], h->chroma_offset[1][refn1][0]);
2687             chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, h->chroma_log2_weight_denom,
2688                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
2689                             h->chroma_offset[0][refn0][1], h->chroma_offset[1][refn1][1]);
2690         }
2691     }else{
2692         int list = list1 ? 1 : 0;
2693         int refn = h->ref_cache[list][ scan8[n] ];
2694         Picture *ref= &h->ref_list[list][refn];
2695         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
2696                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
2697                     qpix_put, chroma_put);
2698
2699         luma_weight_op(dest_y, s->linesize, h->luma_log2_weight_denom,
2700                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
2701         if(h->use_weight_chroma){
2702             chroma_weight_op(dest_cb, s->uvlinesize, h->chroma_log2_weight_denom,
2703                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
2704             chroma_weight_op(dest_cr, s->uvlinesize, h->chroma_log2_weight_denom,
2705                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
2706         }
2707     }
2708 }
2709
2710 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
2711                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2712                            int x_offset, int y_offset,
2713                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2714                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2715                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
2716                            int list0, int list1){
2717     if((h->use_weight==2 && list0 && list1
2718         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
2719        || h->use_weight==1)
2720         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2721                          x_offset, y_offset, qpix_put, chroma_put,
2722                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
2723     else
2724         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2725                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
2726 }
2727
2728 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2729                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
2730                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
2731                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
2732     MpegEncContext * const s = &h->s;
2733     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
2734     const int mb_type= s->current_picture.mb_type[mb_xy];
2735
2736     assert(IS_INTER(mb_type));
2737
2738     if(IS_16X16(mb_type)){
2739         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
2740                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
2741                 &weight_op[0], &weight_avg[0],
2742                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2743     }else if(IS_16X8(mb_type)){
2744         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
2745                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2746                 &weight_op[1], &weight_avg[1],
2747                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2748         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
2749                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2750                 &weight_op[1], &weight_avg[1],
2751                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2752     }else if(IS_8X16(mb_type)){
2753         mc_part(h, 0, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 0, 0,
2754                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2755                 &weight_op[2], &weight_avg[2],
2756                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2757         mc_part(h, 4, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 4, 0,
2758                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2759                 &weight_op[2], &weight_avg[2],
2760                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2761     }else{
2762         int i;
2763
2764         assert(IS_8X8(mb_type));
2765
2766         for(i=0; i<4; i++){
2767             const int sub_mb_type= h->sub_mb_type[i];
2768             const int n= 4*i;
2769             int x_offset= (i&1)<<2;
2770             int y_offset= (i&2)<<1;
2771
2772             if(IS_SUB_8X8(sub_mb_type)){
2773                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2774                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2775                     &weight_op[3], &weight_avg[3],
2776                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2777             }else if(IS_SUB_8X4(sub_mb_type)){
2778                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2779                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2780                     &weight_op[4], &weight_avg[4],
2781                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2782                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
2783                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2784                     &weight_op[4], &weight_avg[4],
2785                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2786             }else if(IS_SUB_4X8(sub_mb_type)){
2787                 mc_part(h, n  , 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2788                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2789                     &weight_op[5], &weight_avg[5],
2790                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2791                 mc_part(h, n+1, 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
2792                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2793                     &weight_op[5], &weight_avg[5],
2794                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2795             }else{
2796                 int j;
2797                 assert(IS_SUB_4X4(sub_mb_type));
2798                 for(j=0; j<4; j++){
2799                     int sub_x_offset= x_offset + 2*(j&1);
2800                     int sub_y_offset= y_offset +   (j&2);
2801                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
2802                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2803                         &weight_op[6], &weight_avg[6],
2804                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2805                 }
2806             }
2807         }
2808     }
2809 }
2810
2811 static void decode_init_vlc(H264Context *h){
2812     static int done = 0;
2813
2814     if (!done) {
2815         int i;
2816         done = 1;
2817
2818         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
2819                  &chroma_dc_coeff_token_len [0], 1, 1,
2820                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
2821
2822         for(i=0; i<4; i++){
2823             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
2824                      &coeff_token_len [i][0], 1, 1,
2825                      &coeff_token_bits[i][0], 1, 1, 1);
2826         }
2827
2828         for(i=0; i<3; i++){
2829             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
2830                      &chroma_dc_total_zeros_len [i][0], 1, 1,
2831                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
2832         }
2833         for(i=0; i<15; i++){
2834             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
2835                      &total_zeros_len [i][0], 1, 1,
2836                      &total_zeros_bits[i][0], 1, 1, 1);
2837         }
2838
2839         for(i=0; i<6; i++){
2840             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
2841                      &run_len [i][0], 1, 1,
2842                      &run_bits[i][0], 1, 1, 1);
2843         }
2844         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2845                  &run_len [6][0], 1, 1,
2846                  &run_bits[6][0], 1, 1, 1);
2847     }
2848 }
2849
2850 /**
2851  * Sets the intra prediction function pointers.
2852  */
2853 static void init_pred_ptrs(H264Context *h){
2854 //    MpegEncContext * const s = &h->s;
2855
2856     h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
2857     h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
2858     h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
2859     h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
2860     h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
2861     h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
2862     h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
2863     h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
2864     h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
2865     h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
2866     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
2867     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
2868
2869     h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
2870     h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
2871     h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
2872     h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
2873     h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
2874     h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
2875     h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
2876     h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
2877     h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
2878     h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
2879     h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
2880     h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
2881
2882     h->pred8x8[DC_PRED8x8     ]= pred8x8_dc_c;
2883     h->pred8x8[VERT_PRED8x8   ]= pred8x8_vertical_c;
2884     h->pred8x8[HOR_PRED8x8    ]= pred8x8_horizontal_c;
2885     h->pred8x8[PLANE_PRED8x8  ]= pred8x8_plane_c;
2886     h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
2887     h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
2888     h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c;
2889
2890     h->pred16x16[DC_PRED8x8     ]= pred16x16_dc_c;
2891     h->pred16x16[VERT_PRED8x8   ]= pred16x16_vertical_c;
2892     h->pred16x16[HOR_PRED8x8    ]= pred16x16_horizontal_c;
2893     h->pred16x16[PLANE_PRED8x8  ]= pred16x16_plane_c;
2894     h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
2895     h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
2896     h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
2897 }
2898
2899 static void free_tables(H264Context *h){
2900     av_freep(&h->intra4x4_pred_mode);
2901     av_freep(&h->chroma_pred_mode_table);
2902     av_freep(&h->cbp_table);
2903     av_freep(&h->mvd_table[0]);
2904     av_freep(&h->mvd_table[1]);
2905     av_freep(&h->direct_table);
2906     av_freep(&h->non_zero_count);
2907     av_freep(&h->slice_table_base);
2908     av_freep(&h->top_borders[1]);
2909     av_freep(&h->top_borders[0]);
2910     h->slice_table= NULL;
2911
2912     av_freep(&h->mb2b_xy);
2913     av_freep(&h->mb2b8_xy);
2914
2915     av_freep(&h->dequant4_coeff);
2916     av_freep(&h->dequant8_coeff);
2917
2918     av_freep(&h->s.obmc_scratchpad);
2919 }
2920
2921 /**
2922  * allocates tables.
2923  * needs width/height
2924  */
2925 static int alloc_tables(H264Context *h){
2926     MpegEncContext * const s = &h->s;
2927     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2928     int x,y,q;
2929
2930     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2931
2932     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2933     CHECKED_ALLOCZ(h->slice_table_base  , big_mb_num * sizeof(uint8_t))
2934     CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
2935     CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
2936     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2937
2938     if( h->pps.cabac ) {
2939         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2940         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2941         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2942         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2943     }
2944
2945     memset(h->slice_table_base, -1, big_mb_num  * sizeof(uint8_t));
2946     h->slice_table= h->slice_table_base + s->mb_stride + 1;
2947
2948     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2949     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2950     for(y=0; y<s->mb_height; y++){
2951         for(x=0; x<s->mb_width; x++){
2952             const int mb_xy= x + y*s->mb_stride;
2953             const int b_xy = 4*x + 4*y*h->b_stride;
2954             const int b8_xy= 2*x + 2*y*h->b8_stride;
2955
2956             h->mb2b_xy [mb_xy]= b_xy;
2957             h->mb2b8_xy[mb_xy]= b8_xy;
2958         }
2959     }
2960
2961     CHECKED_ALLOCZ(h->dequant4_coeff, 52*16 * sizeof(uint16_t));
2962     CHECKED_ALLOCZ(h->dequant8_coeff, 52*64 * sizeof(uint16_t));
2963     memcpy(h->dequant4_coeff, dequant_coeff, 52*16 * sizeof(uint16_t));
2964     for(q=0; q<52; q++){
2965         int shift = div6[q];
2966         int idx = rem6[q];
2967         if(shift >= 2) // qp<12 are shifted during dequant
2968             shift -= 2;
2969         for(x=0; x<64; x++)
2970             h->dequant8_coeff[q][x] = dequant8_coeff_init[idx][
2971                 dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] << shift;
2972     }
2973     if(h->sps.transform_bypass){
2974         for(x=0; x<16; x++)
2975             h->dequant4_coeff[0][x] = 1;
2976         for(x=0; x<64; x++)
2977             h->dequant8_coeff[0][x] = 1<<2;
2978     }
2979
2980     s->obmc_scratchpad = NULL;
2981
2982     return 0;
2983 fail:
2984     free_tables(h);
2985     return -1;
2986 }
2987
2988 static void common_init(H264Context *h){
2989     MpegEncContext * const s = &h->s;
2990
2991     s->width = s->avctx->width;
2992     s->height = s->avctx->height;
2993     s->codec_id= s->avctx->codec->id;
2994
2995     init_pred_ptrs(h);
2996
2997     s->unrestricted_mv=1;
2998     s->decode=1; //FIXME
2999 }
3000
3001 static int decode_init(AVCodecContext *avctx){
3002     H264Context *h= avctx->priv_data;
3003     MpegEncContext * const s = &h->s;
3004
3005     MPV_decode_defaults(s);
3006
3007     s->avctx = avctx;
3008     common_init(h);
3009
3010     s->out_format = FMT_H264;
3011     s->workaround_bugs= avctx->workaround_bugs;
3012
3013     // set defaults
3014 //    s->decode_mb= ff_h263_decode_mb;
3015     s->low_delay= 1;
3016     avctx->pix_fmt= PIX_FMT_YUV420P;
3017
3018     decode_init_vlc(h);
3019
3020     if(avctx->extradata_size > 0 && avctx->extradata &&
3021        *(char *)avctx->extradata == 1){
3022         h->is_avc = 1;
3023         h->got_avcC = 0;
3024     } else {
3025         h->is_avc = 0;
3026     }
3027
3028     return 0;
3029 }
3030
3031 static void frame_start(H264Context *h){
3032     MpegEncContext * const s = &h->s;
3033     int i;
3034
3035     MPV_frame_start(s, s->avctx);
3036     ff_er_frame_start(s);
3037
3038     assert(s->linesize && s->uvlinesize);
3039
3040     for(i=0; i<16; i++){
3041         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
3042         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
3043     }
3044     for(i=0; i<4; i++){
3045         h->block_offset[16+i]=
3046         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3047         h->block_offset[24+16+i]=
3048         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3049     }
3050
3051     /* can't be in alloc_tables because linesize isn't known there.
3052      * FIXME: redo bipred weight to not require extra buffer? */
3053     if(!s->obmc_scratchpad)
3054         s->obmc_scratchpad = av_malloc(16*s->linesize + 2*8*s->uvlinesize);
3055
3056 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
3057 }
3058
3059 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3060     MpegEncContext * const s = &h->s;
3061     int i;
3062
3063     src_y  -=   linesize;
3064     src_cb -= uvlinesize;
3065     src_cr -= uvlinesize;
3066
3067     // There are two lines saved, the line above the the top macroblock of a pair,
3068     // and the line above the bottom macroblock
3069     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3070     for(i=1; i<17; i++){
3071         h->left_border[i]= src_y[15+i*  linesize];
3072     }
3073
3074     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
3075     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
3076
3077     if(!(s->flags&CODEC_FLAG_GRAY)){
3078         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
3079         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
3080         for(i=1; i<9; i++){
3081             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
3082             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
3083         }
3084         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
3085         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
3086     }
3087 }
3088
3089 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3090     MpegEncContext * const s = &h->s;
3091     int temp8, i;
3092     uint64_t temp64;
3093     int deblock_left = (s->mb_x > 0);
3094     int deblock_top  = (s->mb_y > 0);
3095
3096     src_y  -=   linesize + 1;
3097     src_cb -= uvlinesize + 1;
3098     src_cr -= uvlinesize + 1;
3099
3100 #define XCHG(a,b,t,xchg)\
3101 t= a;\
3102 if(xchg)\
3103     a= b;\
3104 b= t;
3105
3106     if(deblock_left){
3107         for(i = !deblock_top; i<17; i++){
3108             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3109         }
3110     }
3111
3112     if(deblock_top){
3113         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3114         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3115         if(s->mb_x+1 < s->mb_width){
3116             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3117         }
3118     }
3119
3120     if(!(s->flags&CODEC_FLAG_GRAY)){
3121         if(deblock_left){
3122             for(i = !deblock_top; i<9; i++){
3123                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
3124                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
3125             }
3126         }
3127         if(deblock_top){
3128             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3129             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3130         }
3131     }
3132 }
3133
3134 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3135     MpegEncContext * const s = &h->s;
3136     int i;
3137
3138     src_y  -= 2 *   linesize;
3139     src_cb -= 2 * uvlinesize;
3140     src_cr -= 2 * uvlinesize;
3141
3142     // There are two lines saved, the line above the the top macroblock of a pair,
3143     // and the line above the bottom macroblock
3144     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3145     h->left_border[1]= h->top_borders[1][s->mb_x][15];
3146     for(i=2; i<34; i++){
3147         h->left_border[i]= src_y[15+i*  linesize];
3148     }
3149
3150     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
3151     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
3152     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
3153     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
3154
3155     if(!(s->flags&CODEC_FLAG_GRAY)){
3156         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
3157         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
3158         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
3159         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
3160         for(i=2; i<18; i++){
3161             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
3162             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
3163         }
3164         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
3165         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
3166         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
3167         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
3168     }
3169 }
3170
3171 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3172     MpegEncContext * const s = &h->s;
3173     int temp8, i;
3174     uint64_t temp64;
3175     int deblock_left = (s->mb_x > 0);
3176     int deblock_top  = (s->mb_y > 0);
3177
3178     tprintf("xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
3179
3180     src_y  -= 2 *   linesize + 1;
3181     src_cb -= 2 * uvlinesize + 1;
3182     src_cr -= 2 * uvlinesize + 1;
3183
3184 #define XCHG(a,b,t,xchg)\
3185 t= a;\
3186 if(xchg)\
3187     a= b;\
3188 b= t;
3189
3190     if(deblock_left){
3191         for(i = (!deblock_top)<<1; i<34; i++){
3192             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3193         }
3194     }
3195
3196     if(deblock_top){
3197         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3198         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3199         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
3200         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
3201     }
3202
3203     if(!(s->flags&CODEC_FLAG_GRAY)){
3204         if(deblock_left){
3205             for(i = (!deblock_top) << 1; i<18; i++){
3206                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
3207                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
3208             }
3209         }
3210         if(deblock_top){
3211             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3212             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3213             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
3214             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
3215         }
3216     }
3217 }
3218
3219 static void hl_decode_mb(H264Context *h){
3220     MpegEncContext * const s = &h->s;
3221     const int mb_x= s->mb_x;
3222     const int mb_y= s->mb_y;
3223     const int mb_xy= mb_x + mb_y*s->mb_stride;
3224     const int mb_type= s->current_picture.mb_type[mb_xy];
3225     uint8_t  *dest_y, *dest_cb, *dest_cr;
3226     int linesize, uvlinesize /*dct_offset*/;
3227     int i;
3228     int *block_offset = &h->block_offset[0];
3229     const unsigned int bottom = mb_y & 1;
3230     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass);
3231     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
3232
3233     if(!s->decode)
3234         return;
3235
3236     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3237     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3238     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3239
3240     if (h->mb_field_decoding_flag) {
3241         linesize = s->linesize * 2;
3242         uvlinesize = s->uvlinesize * 2;
3243         block_offset = &h->block_offset[24];
3244         if(mb_y&1){ //FIXME move out of this func?
3245             dest_y -= s->linesize*15;
3246             dest_cb-= s->uvlinesize*7;
3247             dest_cr-= s->uvlinesize*7;
3248         }
3249     } else {
3250         linesize = s->linesize;
3251         uvlinesize = s->uvlinesize;
3252 //        dct_offset = s->linesize * 16;
3253     }
3254
3255     idct_add = transform_bypass
3256              ? IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4
3257              : IS_8x8DCT(mb_type) ? s->dsp.h264_idct8_add : s->dsp.h264_idct_add;
3258
3259     if (IS_INTRA_PCM(mb_type)) {
3260         unsigned int x, y;
3261
3262         // The pixels are stored in h->mb array in the same order as levels,
3263         // copy them in output in the correct order.
3264         for(i=0; i<16; i++) {
3265             for (y=0; y<4; y++) {
3266                 for (x=0; x<4; x++) {
3267                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
3268                 }
3269             }
3270         }
3271         for(i=16; i<16+4; i++) {
3272             for (y=0; y<4; y++) {
3273                 for (x=0; x<4; x++) {
3274                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3275                 }
3276             }
3277         }
3278         for(i=20; i<20+4; i++) {
3279             for (y=0; y<4; y++) {
3280                 for (x=0; x<4; x++) {
3281                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3282                 }
3283             }
3284         }
3285     } else {
3286         if(IS_INTRA(mb_type)){
3287             if(h->deblocking_filter) {
3288                 if (h->mb_aff_frame) {
3289                     if (!bottom)
3290                         xchg_pair_border(h, dest_y, dest_cb, dest_cr, s->linesize, s->uvlinesize, 1);
3291                 } else {
3292                     xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
3293                 }
3294             }
3295
3296             if(!(s->flags&CODEC_FLAG_GRAY)){
3297                 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
3298                 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
3299             }
3300
3301             if(IS_INTRA4x4(mb_type)){
3302                 if(!s->encoding){
3303                     if(IS_8x8DCT(mb_type)){
3304                         for(i=0; i<16; i+=4){
3305                             uint8_t * const ptr= dest_y + block_offset[i];
3306                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3307                             h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3308                                                    (h->topright_samples_available<<(i+1))&0x8000, linesize);
3309                             if(h->non_zero_count_cache[ scan8[i] ])
3310                                 idct_add(ptr, h->mb + i*16, linesize);
3311                         }
3312                     }else
3313                     for(i=0; i<16; i++){
3314                         uint8_t * const ptr= dest_y + block_offset[i];
3315                         uint8_t *topright;
3316                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3317                         int tr;
3318
3319                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3320                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3321                             assert(mb_y || linesize <= block_offset[i]);
3322                             if(!topright_avail){
3323                                 tr= ptr[3 - linesize]*0x01010101;
3324                                 topright= (uint8_t*) &tr;
3325                             }else
3326                                 topright= ptr + 4 - linesize;
3327                         }else
3328                             topright= NULL;
3329
3330                         h->pred4x4[ dir ](ptr, topright, linesize);
3331                         if(h->non_zero_count_cache[ scan8[i] ]){
3332                             if(s->codec_id == CODEC_ID_H264)
3333                                 idct_add(ptr, h->mb + i*16, linesize);
3334                             else
3335                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3336                         }
3337                     }
3338                 }
3339             }else{
3340                 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3341                 if(s->codec_id == CODEC_ID_H264){
3342                     if(!transform_bypass)
3343                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale);
3344                 }else
3345                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3346             }
3347             if(h->deblocking_filter) {
3348                 if (h->mb_aff_frame) {
3349                     if (bottom) {
3350                         uint8_t *pair_dest_y  = s->current_picture.data[0] + ((mb_y-1) * 16* s->linesize  ) + mb_x * 16;
3351                         uint8_t *pair_dest_cb = s->current_picture.data[1] + ((mb_y-1) * 8 * s->uvlinesize) + mb_x * 8;
3352                         uint8_t *pair_dest_cr = s->current_picture.data[2] + ((mb_y-1) * 8 * s->uvlinesize) + mb_x * 8;
3353                         s->mb_y--;
3354                         xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3355                         s->mb_y++;
3356                     }
3357                 } else {
3358                     xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
3359                 }
3360             }
3361         }else if(s->codec_id == CODEC_ID_H264){
3362             hl_motion(h, dest_y, dest_cb, dest_cr,
3363                       s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab,
3364                       s->dsp.avg_h264_qpel_pixels_tab, s->dsp.avg_h264_chroma_pixels_tab,
3365                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3366         }
3367
3368
3369         if(!IS_INTRA4x4(mb_type)){
3370             if(s->codec_id == CODEC_ID_H264){
3371                 const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3372                 for(i=0; i<16; i+=di){
3373                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3374                         uint8_t * const ptr= dest_y + block_offset[i];
3375                         idct_add(ptr, h->mb + i*16, linesize);
3376                     }
3377                 }
3378             }else{
3379                 for(i=0; i<16; i++){
3380                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3381                         uint8_t * const ptr= dest_y + block_offset[i];
3382                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3383                     }
3384                 }
3385             }
3386         }
3387
3388         if(!(s->flags&CODEC_FLAG_GRAY)){
3389             idct_add = transform_bypass ? s->dsp.add_pixels4 : s->dsp.h264_idct_add;
3390             if(!transform_bypass){
3391                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp);
3392                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp);
3393             }
3394             if(s->codec_id == CODEC_ID_H264){
3395                 for(i=16; i<16+4; i++){
3396                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3397                         uint8_t * const ptr= dest_cb + block_offset[i];
3398                         idct_add(ptr, h->mb + i*16, uvlinesize);
3399                     }
3400                 }
3401                 for(i=20; i<20+4; i++){
3402                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3403                         uint8_t * const ptr= dest_cr + block_offset[i];
3404                         idct_add(ptr, h->mb + i*16, uvlinesize);
3405                     }
3406                 }
3407             }else{
3408                 for(i=16; i<16+4; i++){
3409                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3410                         uint8_t * const ptr= dest_cb + block_offset[i];
3411                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3412                     }
3413                 }
3414                 for(i=20; i<20+4; i++){
3415                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3416                         uint8_t * const ptr= dest_cr + block_offset[i];
3417                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3418                     }
3419                 }
3420             }
3421         }
3422     }
3423     if(h->deblocking_filter) {
3424         if (h->mb_aff_frame) {
3425             const int mb_y = s->mb_y - 1;
3426             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3427             const int mb_xy= mb_x + mb_y*s->mb_stride;
3428             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
3429             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
3430             uint8_t tmp = s->current_picture.data[1][384];
3431             if (!bottom) return;
3432             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3433             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3434             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3435
3436             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3437             // TODO deblock a pair
3438             // top
3439             s->mb_y--;
3440             tprintf("call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3441             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3442             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3443             if (tmp != s->current_picture.data[1][384]) {
3444                 tprintf("modified pixel 8,1 (1)\n");
3445             }
3446             // bottom
3447             s->mb_y++;
3448             tprintf("call mbaff filter_mb\n");
3449             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3450             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3451             if (tmp != s->current_picture.data[1][384]) {
3452                 tprintf("modified pixel 8,1 (2)\n");
3453             }
3454         } else {
3455             tprintf("call filter_mb\n");
3456             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3457             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3458             filter_mb(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3459         }
3460     }
3461 }
3462
3463 /**
3464  * fills the default_ref_list.
3465  */
3466 static int fill_default_ref_list(H264Context *h){
3467     MpegEncContext * const s = &h->s;
3468     int i;
3469     int smallest_poc_greater_than_current = -1;
3470     Picture sorted_short_ref[32];
3471
3472     if(h->slice_type==B_TYPE){
3473         int out_i;
3474         int limit= INT_MIN;
3475
3476         /* sort frame according to poc in B slice */
3477         for(out_i=0; out_i<h->short_ref_count; out_i++){
3478             int best_i=INT_MIN;
3479             int best_poc=INT_MAX;
3480
3481             for(i=0; i<h->short_ref_count; i++){
3482                 const int poc= h->short_ref[i]->poc;
3483                 if(poc > limit && poc < best_poc){
3484                     best_poc= poc;
3485                     best_i= i;
3486                 }
3487             }
3488
3489             assert(best_i != INT_MIN);
3490
3491             limit= best_poc;
3492             sorted_short_ref[out_i]= *h->short_ref[best_i];
3493             tprintf("sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3494             if (-1 == smallest_poc_greater_than_current) {
3495                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3496                     smallest_poc_greater_than_current = out_i;
3497                 }
3498             }
3499         }
3500     }
3501
3502     if(s->picture_structure == PICT_FRAME){
3503         if(h->slice_type==B_TYPE){
3504             int list;
3505             tprintf("current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3506
3507             // find the largest poc
3508             for(list=0; list<2; list++){
3509                 int index = 0;
3510                 int j= -99;
3511                 int step= list ? -1 : 1;
3512
3513                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3514                     while(j<0 || j>= h->short_ref_count){
3515                         if(j != -99 && step == (list ? -1 : 1))
3516                             return -1;
3517                         step = -step;
3518                         j= smallest_poc_greater_than_current + (step>>1);
3519                     }
3520                     if(sorted_short_ref[j].reference != 3) continue;
3521                     h->default_ref_list[list][index  ]= sorted_short_ref[j];
3522                     h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3523                 }
3524
3525                 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3526                     if(h->long_ref[i] == NULL) continue;
3527                     if(h->long_ref[i]->reference != 3) continue;
3528
3529                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3530                     h->default_ref_list[ list ][index++].pic_id= i;;
3531                 }
3532
3533                 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3534                     // swap the two first elements of L1 when
3535                     // L0 and L1 are identical
3536                     Picture temp= h->default_ref_list[1][0];
3537                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
3538                     h->default_ref_list[1][1] = temp;
3539                 }
3540
3541                 if(index < h->ref_count[ list ])
3542                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3543             }
3544         }else{
3545             int index=0;
3546             for(i=0; i<h->short_ref_count; i++){
3547                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3548                 h->default_ref_list[0][index  ]= *h->short_ref[i];
3549                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3550             }
3551             for(i = 0; i < 16; i++){
3552                 if(h->long_ref[i] == NULL) continue;
3553                 if(h->long_ref[i]->reference != 3) continue;
3554                 h->default_ref_list[0][index  ]= *h->long_ref[i];
3555                 h->default_ref_list[0][index++].pic_id= i;;
3556             }
3557             if(index < h->ref_count[0])
3558                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3559         }
3560     }else{ //FIELD
3561         if(h->slice_type==B_TYPE){
3562         }else{
3563             //FIXME second field balh
3564         }
3565     }
3566 #ifdef TRACE
3567     for (i=0; i<h->ref_count[0]; i++) {
3568         tprintf("List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3569     }
3570     if(h->slice_type==B_TYPE){
3571         for (i=0; i<h->ref_count[1]; i++) {
3572             tprintf("List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3573         }
3574     }
3575 #endif
3576     return 0;
3577 }
3578
3579 static void print_short_term(H264Context *h);
3580 static void print_long_term(H264Context *h);
3581
3582 static int decode_ref_pic_list_reordering(H264Context *h){
3583     MpegEncContext * const s = &h->s;
3584     int list, index;
3585
3586     print_short_term(h);
3587     print_long_term(h);
3588     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
3589
3590     for(list=0; list<2; list++){
3591         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3592
3593         if(get_bits1(&s->gb)){
3594             int pred= h->curr_pic_num;
3595
3596             for(index=0; ; index++){
3597                 int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3598                 int pic_id;
3599                 int i;
3600                 Picture *ref = NULL;
3601
3602                 if(reordering_of_pic_nums_idc==3)
3603                     break;
3604
3605                 if(index >= h->ref_count[list]){
3606                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3607                     return -1;
3608                 }
3609
3610                 if(reordering_of_pic_nums_idc<3){
3611                     if(reordering_of_pic_nums_idc<2){
3612                         const int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3613
3614                         if(abs_diff_pic_num >= h->max_pic_num){
3615                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3616                             return -1;
3617                         }
3618
3619                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3620                         else                                pred+= abs_diff_pic_num;
3621                         pred &= h->max_pic_num - 1;
3622
3623                         for(i= h->short_ref_count-1; i>=0; i--){
3624                             ref = h->short_ref[i];
3625                             assert(ref->reference == 3);
3626                             assert(!ref->long_ref);
3627                             if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3628                                 break;
3629                         }
3630                         if(i>=0)
3631                             ref->pic_id= ref->frame_num;
3632                     }else{
3633                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3634                         ref = h->long_ref[pic_id];
3635                         ref->pic_id= pic_id;
3636                         assert(ref->reference == 3);
3637                         assert(ref->long_ref);
3638                         i=0;
3639                     }
3640
3641                     if (i < 0) {
3642                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3643                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3644                     } else {
3645                         for(i=index; i+1<h->ref_count[list]; i++){
3646                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3647                                 break;
3648                         }
3649                         for(; i > index; i--){
3650                             h->ref_list[list][i]= h->ref_list[list][i-1];
3651                         }
3652                         h->ref_list[list][index]= *ref;
3653                     }
3654                 }else{
3655                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3656                     return -1;
3657                 }
3658             }
3659         }
3660
3661         if(h->slice_type!=B_TYPE) break;
3662     }
3663     for(list=0; list<2; list++){
3664         for(index= 0; index < h->ref_count[list]; index++){
3665             if(!h->ref_list[list][index].data[0])
3666                 h->ref_list[list][index]= s->current_picture;
3667         }
3668         if(h->slice_type!=B_TYPE) break;
3669     }
3670
3671     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
3672         direct_dist_scale_factor(h);
3673     direct_ref_list_init(h);
3674     return 0;
3675 }
3676
3677 static int pred_weight_table(H264Context *h){
3678     MpegEncContext * const s = &h->s;
3679     int list, i;
3680     int luma_def, chroma_def;
3681
3682     h->use_weight= 0;
3683     h->use_weight_chroma= 0;
3684     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3685     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3686     luma_def = 1<<h->luma_log2_weight_denom;
3687     chroma_def = 1<<h->chroma_log2_weight_denom;
3688
3689     for(list=0; list<2; list++){
3690         for(i=0; i<h->ref_count[list]; i++){
3691             int luma_weight_flag, chroma_weight_flag;
3692
3693             luma_weight_flag= get_bits1(&s->gb);
3694             if(luma_weight_flag){
3695                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3696                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3697                 if(   h->luma_weight[list][i] != luma_def
3698                    || h->luma_offset[list][i] != 0)
3699                     h->use_weight= 1;
3700             }else{
3701                 h->luma_weight[list][i]= luma_def;
3702                 h->luma_offset[list][i]= 0;
3703             }
3704
3705             chroma_weight_flag= get_bits1(&s->gb);
3706             if(chroma_weight_flag){
3707                 int j;
3708                 for(j=0; j<2; j++){
3709                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3710                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3711                     if(   h->chroma_weight[list][i][j] != chroma_def
3712                        || h->chroma_offset[list][i][j] != 0)
3713                         h->use_weight_chroma= 1;
3714                 }
3715             }else{
3716                 int j;
3717                 for(j=0; j<2; j++){
3718                     h->chroma_weight[list][i][j]= chroma_def;
3719                     h->chroma_offset[list][i][j]= 0;
3720                 }
3721             }
3722         }
3723         if(h->slice_type != B_TYPE) break;
3724     }
3725     h->use_weight= h->use_weight || h->use_weight_chroma;
3726     return 0;
3727 }
3728
3729 static void implicit_weight_table(H264Context *h){
3730     MpegEncContext * const s = &h->s;
3731     int ref0, ref1;
3732     int cur_poc = s->current_picture_ptr->poc;
3733
3734     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3735        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3736         h->use_weight= 0;
3737         h->use_weight_chroma= 0;
3738         return;
3739     }
3740
3741     h->use_weight= 2;
3742     h->use_weight_chroma= 2;
3743     h->luma_log2_weight_denom= 5;
3744     h->chroma_log2_weight_denom= 5;
3745
3746     /* FIXME: MBAFF */
3747     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3748         int poc0 = h->ref_list[0][ref0].poc;
3749         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3750             int poc1 = h->ref_list[1][ref1].poc;
3751             int td = clip(poc1 - poc0, -128, 127);
3752             if(td){
3753                 int tb = clip(cur_poc - poc0, -128, 127);
3754                 int tx = (16384 + (ABS(td) >> 1)) / td;
3755                 int dist_scale_factor = clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3756                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3757                     h->implicit_weight[ref0][ref1] = 32;
3758                 else
3759                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3760             }else
3761                 h->implicit_weight[ref0][ref1] = 32;
3762         }
3763     }
3764 }
3765
3766 static inline void unreference_pic(H264Context *h, Picture *pic){
3767     int i;
3768     pic->reference=0;
3769     if(pic == h->delayed_output_pic)
3770         pic->reference=1;
3771     else{
3772         for(i = 0; h->delayed_pic[i]; i++)
3773             if(pic == h->delayed_pic[i]){
3774                 pic->reference=1;
3775                 break;
3776             }
3777     }
3778 }
3779
3780 /**
3781  * instantaneous decoder refresh.
3782  */
3783 static void idr(H264Context *h){
3784     int i;
3785
3786     for(i=0; i<16; i++){
3787         if (h->long_ref[i] != NULL) {
3788             unreference_pic(h, h->long_ref[i]);
3789             h->long_ref[i]= NULL;
3790         }
3791     }
3792     h->long_ref_count=0;
3793
3794     for(i=0; i<h->short_ref_count; i++){
3795         unreference_pic(h, h->short_ref[i]);
3796         h->short_ref[i]= NULL;
3797     }
3798     h->short_ref_count=0;
3799 }
3800
3801 /* forget old pics after a seek */
3802 static void flush_dpb(AVCodecContext *avctx){
3803     H264Context *h= avctx->priv_data;
3804     int i;
3805     for(i=0; i<16; i++)
3806         h->delayed_pic[i]= NULL;
3807     h->delayed_output_pic= NULL;
3808     idr(h);
3809     if(h->s.current_picture_ptr)
3810         h->s.current_picture_ptr->reference= 0;
3811 }
3812
3813 /**
3814  *
3815  * @return the removed picture or NULL if an error occurs
3816  */
3817 static Picture * remove_short(H264Context *h, int frame_num){
3818     MpegEncContext * const s = &h->s;
3819     int i;
3820
3821     if(s->avctx->debug&FF_DEBUG_MMCO)
3822         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3823
3824     for(i=0; i<h->short_ref_count; i++){
3825         Picture *pic= h->short_ref[i];
3826         if(s->avctx->debug&FF_DEBUG_MMCO)
3827             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3828         if(pic->frame_num == frame_num){
3829             h->short_ref[i]= NULL;
3830             memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
3831             h->short_ref_count--;
3832             return pic;
3833         }
3834     }
3835     return NULL;
3836 }
3837
3838 /**
3839  *
3840  * @return the removed picture or NULL if an error occurs
3841  */
3842 static Picture * remove_long(H264Context *h, int i){
3843     Picture *pic;
3844
3845     pic= h->long_ref[i];
3846     h->long_ref[i]= NULL;
3847     if(pic) h->long_ref_count--;
3848
3849     return pic;
3850 }
3851
3852 /**
3853  * print short term list
3854  */
3855 static void print_short_term(H264Context *h) {
3856     uint32_t i;
3857     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3858         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3859         for(i=0; i<h->short_ref_count; i++){
3860             Picture *pic= h->short_ref[i];
3861             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3862         }
3863     }
3864 }
3865
3866 /**
3867  * print long term list
3868  */
3869 static void print_long_term(H264Context *h) {
3870     uint32_t i;
3871     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3872         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3873         for(i = 0; i < 16; i++){
3874             Picture *pic= h->long_ref[i];
3875             if (pic) {
3876                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3877             }
3878         }
3879     }
3880 }
3881
3882 /**
3883  * Executes the reference picture marking (memory management control operations).
3884  */
3885 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3886     MpegEncContext * const s = &h->s;
3887     int i, j;
3888     int current_is_long=0;
3889     Picture *pic;
3890
3891     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3892         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3893
3894     for(i=0; i<mmco_count; i++){
3895         if(s->avctx->debug&FF_DEBUG_MMCO)
3896             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
3897
3898         switch(mmco[i].opcode){
3899         case MMCO_SHORT2UNUSED:
3900             pic= remove_short(h, mmco[i].short_frame_num);
3901             if(pic)
3902                 unreference_pic(h, pic);
3903             else if(s->avctx->debug&FF_DEBUG_MMCO)
3904                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
3905             break;
3906         case MMCO_SHORT2LONG:
3907             pic= remove_long(h, mmco[i].long_index);
3908             if(pic) unreference_pic(h, pic);
3909
3910             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
3911             h->long_ref[ mmco[i].long_index ]->long_ref=1;
3912             h->long_ref_count++;
3913             break;
3914         case MMCO_LONG2UNUSED:
3915             pic= remove_long(h, mmco[i].long_index);
3916             if(pic)
3917                 unreference_pic(h, pic);
3918             else if(s->avctx->debug&FF_DEBUG_MMCO)
3919                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
3920             break;
3921         case MMCO_LONG:
3922             pic= remove_long(h, mmco[i].long_index);
3923             if(pic) unreference_pic(h, pic);
3924
3925             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
3926             h->long_ref[ mmco[i].long_index ]->long_ref=1;
3927             h->long_ref_count++;
3928
3929             current_is_long=1;
3930             break;
3931         case MMCO_SET_MAX_LONG:
3932             assert(mmco[i].long_index <= 16);
3933             // just remove the long term which index is greater than new max
3934             for(j = mmco[i].long_index; j<16; j++){
3935                 pic = remove_long(h, j);
3936                 if (pic) unreference_pic(h, pic);
3937             }
3938             break;
3939         case MMCO_RESET:
3940             while(h->short_ref_count){
3941                 pic= remove_short(h, h->short_ref[0]->frame_num);
3942                 unreference_pic(h, pic);
3943             }
3944             for(j = 0; j < 16; j++) {
3945                 pic= remove_long(h, j);
3946                 if(pic) unreference_pic(h, pic);
3947             }
3948             break;
3949         default: assert(0);
3950         }
3951     }
3952
3953     if(!current_is_long){
3954         pic= remove_short(h, s->current_picture_ptr->frame_num);
3955         if(pic){
3956             unreference_pic(h, pic);
3957             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3958         }
3959
3960         if(h->short_ref_count)
3961             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3962
3963         h->short_ref[0]= s->current_picture_ptr;
3964         h->short_ref[0]->long_ref=0;
3965         h->short_ref_count++;
3966     }
3967
3968     print_short_term(h);
3969     print_long_term(h);
3970     return 0;
3971 }
3972
3973 static int decode_ref_pic_marking(H264Context *h){
3974     MpegEncContext * const s = &h->s;
3975     int i;
3976
3977     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3978         s->broken_link= get_bits1(&s->gb) -1;
3979         h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx
3980         if(h->mmco[0].long_index == -1)
3981             h->mmco_index= 0;
3982         else{
3983             h->mmco[0].opcode= MMCO_LONG;
3984             h->mmco_index= 1;
3985         }
3986     }else{
3987         if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
3988             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3989                 MMCOOpcode opcode= get_ue_golomb(&s->gb);;
3990
3991                 h->mmco[i].opcode= opcode;
3992                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3993                     h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
3994 /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
3995                         fprintf(stderr, "illegal short ref in memory management control operation %d\n", mmco);
3996                         return -1;
3997                     }*/
3998                 }
3999                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
4000                     h->mmco[i].long_index= get_ue_golomb(&s->gb);
4001                     if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ h->mmco[i].long_index >= 16){
4002                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
4003                         return -1;
4004                     }
4005                 }
4006
4007                 if(opcode > MMCO_LONG){
4008                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
4009                     return -1;
4010                 }
4011                 if(opcode == MMCO_END)
4012                     break;
4013             }
4014             h->mmco_index= i;
4015         }else{
4016             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
4017
4018             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
4019                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
4020                 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
4021                 h->mmco_index= 1;
4022             }else
4023                 h->mmco_index= 0;
4024         }
4025     }
4026
4027     return 0;
4028 }
4029
4030 static int init_poc(H264Context *h){
4031     MpegEncContext * const s = &h->s;
4032     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
4033     int field_poc[2];
4034
4035     if(h->nal_unit_type == NAL_IDR_SLICE){
4036         h->frame_num_offset= 0;
4037     }else{
4038         if(h->frame_num < h->prev_frame_num)
4039             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
4040         else
4041             h->frame_num_offset= h->prev_frame_num_offset;
4042     }
4043
4044     if(h->sps.poc_type==0){
4045         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
4046
4047         if(h->nal_unit_type == NAL_IDR_SLICE){
4048              h->prev_poc_msb=
4049              h->prev_poc_lsb= 0;
4050         }
4051
4052         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
4053             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
4054         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
4055             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
4056         else
4057             h->poc_msb = h->prev_poc_msb;
4058 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
4059         field_poc[0] =
4060         field_poc[1] = h->poc_msb + h->poc_lsb;
4061         if(s->picture_structure == PICT_FRAME)
4062             field_poc[1] += h->delta_poc_bottom;
4063     }else if(h->sps.poc_type==1){
4064         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
4065         int i;
4066
4067         if(h->sps.poc_cycle_length != 0)
4068             abs_frame_num = h->frame_num_offset + h->frame_num;
4069         else
4070             abs_frame_num = 0;
4071
4072         if(h->nal_ref_idc==0 && abs_frame_num > 0)
4073             abs_frame_num--;
4074
4075         expected_delta_per_poc_cycle = 0;
4076         for(i=0; i < h->sps.poc_cycle_length; i++)
4077             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
4078
4079         if(abs_frame_num > 0){
4080             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
4081             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
4082
4083             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
4084             for(i = 0; i <= frame_num_in_poc_cycle; i++)
4085                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
4086         } else
4087             expectedpoc = 0;
4088
4089         if(h->nal_ref_idc == 0)
4090             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
4091
4092         field_poc[0] = expectedpoc + h->delta_poc[0];
4093         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
4094
4095         if(s->picture_structure == PICT_FRAME)
4096             field_poc[1] += h->delta_poc[1];
4097     }else{
4098         int poc;
4099         if(h->nal_unit_type == NAL_IDR_SLICE){
4100             poc= 0;
4101         }else{
4102             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
4103             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
4104         }
4105         field_poc[0]= poc;
4106         field_poc[1]= poc;
4107     }
4108
4109     if(s->picture_structure != PICT_BOTTOM_FIELD)
4110         s->current_picture_ptr->field_poc[0]= field_poc[0];
4111     if(s->picture_structure != PICT_TOP_FIELD)
4112         s->current_picture_ptr->field_poc[1]= field_poc[1];
4113     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
4114         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
4115
4116     return 0;
4117 }
4118
4119 /**
4120  * decodes a slice header.
4121  * this will allso call MPV_common_init() and frame_start() as needed
4122  */
4123 static int decode_slice_header(H264Context *h){
4124     MpegEncContext * const s = &h->s;
4125     int first_mb_in_slice, pps_id;
4126     int num_ref_idx_active_override_flag;
4127     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
4128     int slice_type;
4129     int default_ref_list_done = 0;
4130
4131     s->current_picture.reference= h->nal_ref_idc != 0;
4132     s->dropable= h->nal_ref_idc == 0;
4133
4134     first_mb_in_slice= get_ue_golomb(&s->gb);
4135
4136     slice_type= get_ue_golomb(&s->gb);
4137     if(slice_type > 9){
4138         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
4139         return -1;
4140     }
4141     if(slice_type > 4){
4142         slice_type -= 5;
4143         h->slice_type_fixed=1;
4144     }else
4145         h->slice_type_fixed=0;
4146
4147     slice_type= slice_type_map[ slice_type ];
4148     if (slice_type == I_TYPE
4149         || (h->slice_num != 0 && slice_type == h->slice_type) ) {
4150         default_ref_list_done = 1;
4151     }
4152     h->slice_type= slice_type;
4153
4154     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
4155
4156     pps_id= get_ue_golomb(&s->gb);
4157     if(pps_id>255){
4158         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
4159         return -1;
4160     }
4161     h->pps= h->pps_buffer[pps_id];
4162     if(h->pps.slice_group_count == 0){
4163         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
4164         return -1;
4165     }
4166
4167     h->sps= h->sps_buffer[ h->pps.sps_id ];
4168     if(h->sps.log2_max_frame_num == 0){
4169         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
4170         return -1;
4171     }
4172
4173     s->mb_width= h->sps.mb_width;
4174     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
4175
4176     h->b_stride=  s->mb_width*4 + 1;
4177     h->b8_stride= s->mb_width*2 + 1;
4178
4179     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
4180     if(h->sps.frame_mbs_only_flag)
4181         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
4182     else
4183         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
4184
4185     if (s->context_initialized
4186         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
4187         free_tables(h);
4188         MPV_common_end(s);
4189     }
4190     if (!s->context_initialized) {
4191         if (MPV_common_init(s) < 0)
4192             return -1;
4193
4194         if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
4195             memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
4196             memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
4197         }else{
4198             int i;
4199             for(i=0; i<16; i++){
4200 #define T(x) (x>>2) | ((x<<2) & 0xF)
4201                 h->zigzag_scan[i] = T(zigzag_scan[i]);
4202                 h-> field_scan[i] = T( field_scan[i]);
4203             }
4204         }
4205         if(h->sps.transform_bypass){ //FIXME same ugly
4206             h->zigzag_scan_q0 = zigzag_scan;
4207             h->field_scan_q0 = field_scan;
4208         }else{
4209             h->zigzag_scan_q0 = h->zigzag_scan;
4210             h->field_scan_q0 = h->field_scan;
4211         }
4212
4213         alloc_tables(h);
4214
4215         s->avctx->width = s->width;
4216         s->avctx->height = s->height;
4217         s->avctx->sample_aspect_ratio= h->sps.sar;
4218         if(!s->avctx->sample_aspect_ratio.den)
4219             s->avctx->sample_aspect_ratio.den = 1;
4220
4221         if(h->sps.timing_info_present_flag){
4222             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick, h->sps.time_scale};
4223         }
4224     }
4225
4226     if(h->slice_num == 0){
4227         frame_start(h);
4228     }
4229
4230     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
4231     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4232
4233     h->mb_aff_frame = 0;
4234     if(h->sps.frame_mbs_only_flag){
4235         s->picture_structure= PICT_FRAME;
4236     }else{
4237         if(get_bits1(&s->gb)) { //field_pic_flag
4238             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4239         } else {
4240             s->picture_structure= PICT_FRAME;
4241             first_mb_in_slice <<= h->sps.mb_aff;
4242             h->mb_aff_frame = h->sps.mb_aff;
4243         }
4244     }
4245
4246     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4247     s->resync_mb_y = s->mb_y = first_mb_in_slice / s->mb_width;
4248     if(s->mb_y >= s->mb_height){
4249         return -1;
4250     }
4251
4252     if(s->picture_structure==PICT_FRAME){
4253         h->curr_pic_num=   h->frame_num;
4254         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4255     }else{
4256         h->curr_pic_num= 2*h->frame_num;
4257         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4258     }
4259
4260     if(h->nal_unit_type == NAL_IDR_SLICE){
4261         get_ue_golomb(&s->gb); /* idr_pic_id */
4262     }
4263
4264     if(h->sps.poc_type==0){
4265         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4266
4267         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4268             h->delta_poc_bottom= get_se_golomb(&s->gb);
4269         }
4270     }
4271
4272     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4273         h->delta_poc[0]= get_se_golomb(&s->gb);
4274
4275         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4276             h->delta_poc[1]= get_se_golomb(&s->gb);
4277     }
4278
4279     init_poc(h);
4280
4281     if(h->pps.redundant_pic_cnt_present){
4282         h->redundant_pic_count= get_ue_golomb(&s->gb);
4283     }
4284
4285     //set defaults, might be overriden a few line later
4286     h->ref_count[0]= h->pps.ref_count[0];
4287     h->ref_count[1]= h->pps.ref_count[1];
4288
4289     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4290         if(h->slice_type == B_TYPE){
4291             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4292         }
4293         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4294
4295         if(num_ref_idx_active_override_flag){
4296             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4297             if(h->slice_type==B_TYPE)
4298                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4299
4300             if(h->ref_count[0] > 32 || h->ref_count[1] > 32){
4301                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4302                 return -1;
4303             }
4304         }
4305     }
4306
4307     if(!default_ref_list_done){
4308         fill_default_ref_list(h);
4309     }
4310
4311     if(decode_ref_pic_list_reordering(h) < 0)
4312         return -1;
4313
4314     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4315        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4316         pred_weight_table(h);
4317     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4318         implicit_weight_table(h);
4319     else
4320         h->use_weight = 0;
4321
4322     if(s->current_picture.reference)
4323         decode_ref_pic_marking(h);
4324
4325     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac )
4326         h->cabac_init_idc = get_ue_golomb(&s->gb);
4327
4328     h->last_qscale_diff = 0;
4329     s->qscale = h->pps.init_qp + get_se_golomb(&s->gb);
4330     if(s->qscale<0 || s->qscale>51){
4331         av_log(s->avctx, AV_LOG_ERROR, "QP %d out of range\n", s->qscale);
4332         return -1;
4333     }
4334     h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
4335     //FIXME qscale / qp ... stuff
4336     if(h->slice_type == SP_TYPE){
4337         get_bits1(&s->gb); /* sp_for_switch_flag */
4338     }
4339     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4340         get_se_golomb(&s->gb); /* slice_qs_delta */
4341     }
4342
4343     h->deblocking_filter = 1;
4344     h->slice_alpha_c0_offset = 0;
4345     h->slice_beta_offset = 0;
4346     if( h->pps.deblocking_filter_parameters_present ) {
4347         h->deblocking_filter= get_ue_golomb(&s->gb);
4348         if(h->deblocking_filter < 2)
4349             h->deblocking_filter^= 1; // 1<->0
4350
4351         if( h->deblocking_filter ) {
4352             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4353             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4354         }
4355     }
4356     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4357        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4358        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4359        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4360         h->deblocking_filter= 0;
4361
4362 #if 0 //FMO
4363     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4364         slice_group_change_cycle= get_bits(&s->gb, ?);
4365 #endif
4366
4367     h->slice_num++;
4368
4369     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4370         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%d frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4371                h->slice_num,
4372                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4373                first_mb_in_slice,
4374                av_get_pict_type_char(h->slice_type),
4375                pps_id, h->frame_num,
4376                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4377                h->ref_count[0], h->ref_count[1],
4378                s->qscale,
4379                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4380                h->use_weight,
4381                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4382                );
4383     }
4384
4385     return 0;
4386 }
4387
4388 /**
4389  *
4390  */
4391 static inline int get_level_prefix(GetBitContext *gb){
4392     unsigned int buf;
4393     int log;
4394
4395     OPEN_READER(re, gb);
4396     UPDATE_CACHE(re, gb);
4397     buf=GET_CACHE(re, gb);
4398
4399     log= 32 - av_log2(buf);
4400 #ifdef TRACE
4401     print_bin(buf>>(32-log), log);
4402     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4403 #endif
4404
4405     LAST_SKIP_BITS(re, gb, log);
4406     CLOSE_READER(re, gb);
4407
4408     return log-1;
4409 }
4410
4411 static inline int get_dct8x8_allowed(H264Context *h){
4412     int i;
4413     for(i=0; i<4; i++){
4414         if(!IS_SUB_8X8(h->sub_mb_type[i])
4415            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4416             return 0;
4417     }
4418     return 1;
4419 }
4420
4421 /**
4422  * decodes a residual block.
4423  * @param n block index
4424  * @param scantable scantable
4425  * @param max_coeff number of coefficients in the block
4426  * @return <0 if an error occured
4427  */
4428 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint16_t *qmul, int max_coeff){
4429     MpegEncContext * const s = &h->s;
4430     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4431     int level[16], run[16];
4432     int suffix_length, zeros_left, coeff_num, coeff_token, total_coeff, i, trailing_ones;
4433
4434     //FIXME put trailing_onex into the context
4435
4436     if(n == CHROMA_DC_BLOCK_INDEX){
4437         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4438         total_coeff= coeff_token>>2;
4439     }else{
4440         if(n == LUMA_DC_BLOCK_INDEX){
4441             total_coeff= pred_non_zero_count(h, 0);
4442             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4443             total_coeff= coeff_token>>2;
4444         }else{
4445             total_coeff= pred_non_zero_count(h, n);
4446             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4447             total_coeff= coeff_token>>2;
4448             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4449         }
4450     }
4451
4452     //FIXME set last_non_zero?
4453
4454     if(total_coeff==0)
4455         return 0;
4456
4457     trailing_ones= coeff_token&3;
4458     tprintf("trailing:%d, total:%d\n", trailing_ones, total_coeff);
4459     assert(total_coeff<=16);
4460
4461     for(i=0; i<trailing_ones; i++){
4462         level[i]= 1 - 2*get_bits1(gb);
4463     }
4464
4465     suffix_length= total_coeff > 10 && trailing_ones < 3;
4466
4467     for(; i<total_coeff; i++){
4468         const int prefix= get_level_prefix(gb);
4469         int level_code, mask;
4470
4471         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4472             if(suffix_length)
4473                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4474             else
4475                 level_code= (prefix<<suffix_length); //part
4476         }else if(prefix==14){
4477             if(suffix_length)
4478                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4479             else
4480                 level_code= prefix + get_bits(gb, 4); //part
4481         }else if(prefix==15){
4482             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4483             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4484         }else{
4485             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4486             return -1;
4487         }
4488
4489         if(i==trailing_ones && i<3) level_code+= 2; //FIXME split first iteration
4490
4491         mask= -(level_code&1);
4492         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4493
4494         if(suffix_length==0) suffix_length=1; //FIXME split first iteration
4495
4496 #if 1
4497         if(ABS(level[i]) > (3<<(suffix_length-1)) && suffix_length<6) suffix_length++;
4498 #else
4499         if((2+level_code)>>1) > (3<<(suffix_length-1)) && suffix_length<6) suffix_length++;
4500         /* ? == prefix > 2 or sth */
4501 #endif
4502         tprintf("level: %d suffix_length:%d\n", level[i], suffix_length);
4503     }
4504
4505     if(total_coeff == max_coeff)
4506         zeros_left=0;
4507     else{
4508         if(n == CHROMA_DC_BLOCK_INDEX)
4509             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4510         else
4511             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4512     }
4513
4514     for(i=0; i<total_coeff-1; i++){
4515         if(zeros_left <=0)
4516             break;
4517         else if(zeros_left < 7){
4518             run[i]= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4519         }else{
4520             run[i]= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4521         }
4522         zeros_left -= run[i];
4523     }
4524
4525     if(zeros_left<0){
4526         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4527         return -1;
4528     }
4529
4530     for(; i<total_coeff-1; i++){
4531         run[i]= 0;
4532     }
4533
4534     run[i]= zeros_left;
4535
4536     coeff_num=-1;
4537     if(n > 24){
4538         for(i=total_coeff-1; i>=0; i--){ //FIXME merge into rundecode?
4539             int j;
4540
4541             coeff_num += run[i] + 1; //FIXME add 1 earlier ?
4542             j= scantable[ coeff_num ];
4543
4544             block[j]= level[i];
4545         }
4546     }else{
4547         for(i=total_coeff-1; i>=0; i--){ //FIXME merge into  rundecode?
4548             int j;
4549
4550             coeff_num += run[i] + 1; //FIXME add 1 earlier ?
4551             j= scantable[ coeff_num ];
4552
4553             block[j]= level[i] * qmul[j];
4554 //            printf("%d %d  ", block[j], qmul[j]);
4555         }
4556     }
4557     return 0;
4558 }
4559
4560 /**
4561  * decodes a P_SKIP or B_SKIP macroblock
4562  */
4563 static void decode_mb_skip(H264Context *h){
4564     MpegEncContext * const s = &h->s;
4565     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4566     int mb_type=0;
4567
4568     memset(h->non_zero_count[mb_xy], 0, 16);
4569     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4570
4571     if(h->mb_aff_frame && s->mb_skip_run==0 && (s->mb_y&1)==0){
4572         h->mb_field_decoding_flag= get_bits1(&s->gb);
4573     }
4574     if(h->mb_field_decoding_flag)
4575         mb_type|= MB_TYPE_INTERLACED;
4576
4577     if( h->slice_type == B_TYPE )
4578     {
4579         // just for fill_caches. pred_direct_motion will set the real mb_type
4580         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4581
4582         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4583         pred_direct_motion(h, &mb_type);
4584         if(h->pps.cabac){
4585             fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
4586             fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
4587         }
4588     }
4589     else
4590     {
4591         int mx, my;
4592         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4593
4594         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4595         pred_pskip_motion(h, &mx, &my);
4596         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4597         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4598         if(h->pps.cabac)
4599             fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
4600     }
4601
4602     write_back_motion(h, mb_type);
4603     s->current_picture.mb_type[mb_xy]= mb_type|MB_TYPE_SKIP;
4604     s->current_picture.qscale_table[mb_xy]= s->qscale;
4605     h->slice_table[ mb_xy ]= h->slice_num;
4606     h->prev_mb_skipped= 1;
4607 }
4608
4609 /**
4610  * decodes a macroblock
4611  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4612  */
4613 static int decode_mb_cavlc(H264Context *h){
4614     MpegEncContext * const s = &h->s;
4615     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4616     int mb_type, partition_count, cbp;
4617     int dct8x8_allowed= h->pps.transform_8x8_mode;
4618
4619     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4620
4621     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4622     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4623                 down the code */
4624     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
4625         if(s->mb_skip_run==-1)
4626             s->mb_skip_run= get_ue_golomb(&s->gb);
4627
4628         if (s->mb_skip_run--) {
4629             decode_mb_skip(h);
4630             return 0;
4631         }
4632     }
4633     if(h->mb_aff_frame){
4634         if ( ((s->mb_y&1) == 0) || h->prev_mb_skipped)
4635             h->mb_field_decoding_flag = get_bits1(&s->gb);
4636     }else
4637         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4638
4639     h->prev_mb_skipped= 0;
4640
4641     mb_type= get_ue_golomb(&s->gb);
4642     if(h->slice_type == B_TYPE){
4643         if(mb_type < 23){
4644             partition_count= b_mb_type_info[mb_type].partition_count;
4645             mb_type=         b_mb_type_info[mb_type].type;
4646         }else{
4647             mb_type -= 23;
4648             goto decode_intra_mb;
4649         }
4650     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4651         if(mb_type < 5){
4652             partition_count= p_mb_type_info[mb_type].partition_count;
4653             mb_type=         p_mb_type_info[mb_type].type;
4654         }else{
4655             mb_type -= 5;
4656             goto decode_intra_mb;
4657         }
4658     }else{
4659        assert(h->slice_type == I_TYPE);
4660 decode_intra_mb:
4661         if(mb_type > 25){
4662             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice to large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4663             return -1;
4664         }
4665         partition_count=0;
4666         cbp= i_mb_type_info[mb_type].cbp;
4667         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4668         mb_type= i_mb_type_info[mb_type].type;
4669     }
4670
4671     if(h->mb_field_decoding_flag)
4672         mb_type |= MB_TYPE_INTERLACED;
4673
4674     h->slice_table[ mb_xy ]= h->slice_num;
4675
4676     if(IS_INTRA_PCM(mb_type)){
4677         unsigned int x, y;
4678
4679         // we assume these blocks are very rare so we dont optimize it
4680         align_get_bits(&s->gb);
4681
4682         // The pixels are stored in the same order as levels in h->mb array.
4683         for(y=0; y<16; y++){
4684             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4685             for(x=0; x<16; x++){
4686                 tprintf("LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4687                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4688             }
4689         }
4690         for(y=0; y<8; y++){
4691             const int index= 256 + 4*(y&3) + 32*(y>>2);
4692             for(x=0; x<8; x++){
4693                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4694                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4695             }
4696         }
4697         for(y=0; y<8; y++){
4698             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4699             for(x=0; x<8; x++){
4700                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4701                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4702             }
4703         }
4704
4705         // In deblocking, the quantizer is 0
4706         s->current_picture.qscale_table[mb_xy]= 0;
4707         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
4708         // All coeffs are present
4709         memset(h->non_zero_count[mb_xy], 16, 16);
4710
4711         s->current_picture.mb_type[mb_xy]= mb_type;
4712         return 0;
4713     }
4714
4715     fill_caches(h, mb_type, 0);
4716
4717     //mb_pred
4718     if(IS_INTRA(mb_type)){
4719 //            init_top_left_availability(h);
4720             if(IS_INTRA4x4(mb_type)){
4721                 int i;
4722                 int di = 1;
4723                 if(dct8x8_allowed && get_bits1(&s->gb)){
4724                     mb_type |= MB_TYPE_8x8DCT;
4725                     di = 4;
4726                 }
4727
4728 //                fill_intra4x4_pred_table(h);
4729                 for(i=0; i<16; i+=di){
4730                     const int mode_coded= !get_bits1(&s->gb);
4731                     const int predicted_mode=  pred_intra_mode(h, i);
4732                     int mode;
4733
4734                     if(mode_coded){
4735                         const int rem_mode= get_bits(&s->gb, 3);
4736                         if(rem_mode<predicted_mode)
4737                             mode= rem_mode;
4738                         else
4739                             mode= rem_mode + 1;
4740                     }else{
4741                         mode= predicted_mode;
4742                     }
4743
4744                     if(di==4)
4745                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4746                     else
4747                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4748                 }
4749                 write_back_intra_pred_mode(h);
4750                 if( check_intra4x4_pred_mode(h) < 0)
4751                     return -1;
4752             }else{
4753                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4754                 if(h->intra16x16_pred_mode < 0)
4755                     return -1;
4756             }
4757             h->chroma_pred_mode= get_ue_golomb(&s->gb);
4758
4759             h->chroma_pred_mode= check_intra_pred_mode(h, h->chroma_pred_mode);
4760             if(h->chroma_pred_mode < 0)
4761                 return -1;
4762     }else if(partition_count==4){
4763         int i, j, sub_partition_count[4], list, ref[2][4];
4764
4765         if(h->slice_type == B_TYPE){
4766             for(i=0; i<4; i++){
4767                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4768                 if(h->sub_mb_type[i] >=13){
4769                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4770                     return -1;
4771                 }
4772                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4773                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4774             }
4775             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4776                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3]))
4777                 pred_direct_motion(h, &mb_type);
4778         }else{
4779             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4780             for(i=0; i<4; i++){
4781                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4782                 if(h->sub_mb_type[i] >=4){
4783                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4784                     return -1;
4785                 }
4786                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4787                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4788             }
4789         }
4790
4791         for(list=0; list<2; list++){
4792             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4793             if(ref_count == 0) continue;
4794             if (h->mb_aff_frame && h->mb_field_decoding_flag) {
4795                 ref_count <<= 1;
4796             }
4797             for(i=0; i<4; i++){
4798                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4799                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4800                     ref[list][i] = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4801                 }else{
4802                  //FIXME
4803                     ref[list][i] = -1;
4804                 }
4805             }
4806         }
4807
4808         if(dct8x8_allowed)
4809             dct8x8_allowed = get_dct8x8_allowed(h);
4810
4811         for(list=0; list<2; list++){
4812             const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4813             if(ref_count == 0) continue;
4814
4815             for(i=0; i<4; i++){
4816                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4817                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4818                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4819
4820                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4821                     const int sub_mb_type= h->sub_mb_type[i];
4822                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4823                     for(j=0; j<sub_partition_count[i]; j++){
4824                         int mx, my;
4825                         const int index= 4*i + block_width*j;
4826                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4827                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4828                         mx += get_se_golomb(&s->gb);
4829                         my += get_se_golomb(&s->gb);
4830                         tprintf("final mv:%d %d\n", mx, my);
4831
4832                         if(IS_SUB_8X8(sub_mb_type)){
4833                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
4834                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4835                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
4836                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4837                         }else if(IS_SUB_8X4(sub_mb_type)){
4838                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
4839                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
4840                         }else if(IS_SUB_4X8(sub_mb_type)){
4841                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
4842                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
4843                         }else{
4844                             assert(IS_SUB_4X4(sub_mb_type));
4845                             mv_cache[ 0 ][0]= mx;
4846                             mv_cache[ 0 ][1]= my;
4847                         }
4848                     }
4849                 }else{
4850                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4851                     p[0] = p[1]=
4852                     p[8] = p[9]= 0;
4853                 }
4854             }
4855         }
4856     }else if(IS_DIRECT(mb_type)){
4857         pred_direct_motion(h, &mb_type);
4858         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4859     }else{
4860         int list, mx, my, i;
4861          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4862         if(IS_16X16(mb_type)){
4863             for(list=0; list<2; list++){
4864                 if(h->ref_count[list]>0){
4865                     if(IS_DIR(mb_type, 0, list)){
4866                         const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
4867                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4868                     }else
4869                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (LIST_NOT_USED&0xFF), 1);
4870                 }
4871             }
4872             for(list=0; list<2; list++){
4873                 if(IS_DIR(mb_type, 0, list)){
4874                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4875                     mx += get_se_golomb(&s->gb);
4876                     my += get_se_golomb(&s->gb);
4877                     tprintf("final mv:%d %d\n", mx, my);
4878
4879                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
4880                 }else
4881                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
4882             }
4883         }
4884         else if(IS_16X8(mb_type)){
4885             for(list=0; list<2; list++){
4886                 if(h->ref_count[list]>0){
4887                     for(i=0; i<2; i++){
4888                         if(IS_DIR(mb_type, i, list)){
4889                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
4890                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4891                         }else
4892                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
4893                     }
4894                 }
4895             }
4896             for(list=0; list<2; list++){
4897                 for(i=0; i<2; i++){
4898                     if(IS_DIR(mb_type, i, list)){
4899                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4900                         mx += get_se_golomb(&s->gb);
4901                         my += get_se_golomb(&s->gb);
4902                         tprintf("final mv:%d %d\n", mx, my);
4903
4904                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
4905                     }else
4906                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
4907                 }
4908             }
4909         }else{
4910             assert(IS_8X16(mb_type));
4911             for(list=0; list<2; list++){
4912                 if(h->ref_count[list]>0){
4913                     for(i=0; i<2; i++){
4914                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4915                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
4916                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4917                         }else
4918                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
4919                     }
4920                 }
4921             }
4922             for(list=0; list<2; list++){
4923                 for(i=0; i<2; i++){
4924                     if(IS_DIR(mb_type, i, list)){
4925                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4926                         mx += get_se_golomb(&s->gb);
4927                         my += get_se_golomb(&s->gb);
4928                         tprintf("final mv:%d %d\n", mx, my);
4929
4930                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
4931                     }else
4932                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
4933                 }
4934             }
4935         }
4936     }
4937
4938     if(IS_INTER(mb_type))
4939         write_back_motion(h, mb_type);
4940
4941     if(!IS_INTRA16x16(mb_type)){
4942         cbp= get_ue_golomb(&s->gb);
4943         if(cbp > 47){
4944             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%d) at %d %d\n", cbp, s->mb_x, s->mb_y);
4945             return -1;
4946         }
4947
4948         if(IS_INTRA4x4(mb_type))
4949             cbp= golomb_to_intra4x4_cbp[cbp];
4950         else
4951             cbp= golomb_to_inter_cbp[cbp];
4952     }
4953
4954     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4955         if(get_bits1(&s->gb))
4956             mb_type |= MB_TYPE_8x8DCT;
4957     }
4958     s->current_picture.mb_type[mb_xy]= mb_type;
4959
4960     if(cbp || IS_INTRA16x16(mb_type)){
4961         int i8x8, i4x4, chroma_idx;
4962         int chroma_qp, dquant;
4963         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4964         const uint8_t *scan, *dc_scan;
4965
4966 //        fill_non_zero_count_cache(h);
4967
4968         if(IS_INTERLACED(mb_type)){
4969             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4970             dc_scan= luma_dc_field_scan;
4971         }else{
4972             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4973             dc_scan= luma_dc_zigzag_scan;
4974         }
4975
4976         dquant= get_se_golomb(&s->gb);
4977
4978         if( dquant > 25 || dquant < -26 ){
4979             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4980             return -1;
4981         }
4982
4983         s->qscale += dquant;
4984         if(((unsigned)s->qscale) > 51){
4985             if(s->qscale<0) s->qscale+= 52;
4986             else            s->qscale-= 52;
4987         }
4988
4989         h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
4990         if(IS_INTRA16x16(mb_type)){
4991             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[s->qscale], 16) < 0){
4992                 return -1; //FIXME continue if partitioned and other return -1 too
4993             }
4994
4995             assert((cbp&15) == 0 || (cbp&15) == 15);
4996
4997             if(cbp&15){
4998                 for(i8x8=0; i8x8<4; i8x8++){
4999                     for(i4x4=0; i4x4<4; i4x4++){
5000                         const int index= i4x4 + 4*i8x8;
5001                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[s->qscale], 15) < 0 ){
5002                             return -1;
5003                         }
5004                     }
5005                 }
5006             }else{
5007                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5008             }
5009         }else{
5010             for(i8x8=0; i8x8<4; i8x8++){
5011                 if(cbp & (1<<i8x8)){
5012                     if(IS_8x8DCT(mb_type)){
5013                         DCTELEM *buf = &h->mb[64*i8x8];
5014                         uint8_t *nnz;
5015                         for(i4x4=0; i4x4<4; i4x4++){
5016                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, zigzag_scan8x8_cavlc+16*i4x4,
5017                                                 h->dequant8_coeff[s->qscale], 16) <0 )
5018                                 return -1;
5019                         }
5020                         if(s->qscale < 12){
5021                             int i;
5022                             for(i=0; i<64; i++)
5023                                 buf[i] = (buf[i] + 2) >> 2;
5024                         }
5025                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5026                         nnz[0] |= nnz[1] | nnz[8] | nnz[9];
5027                     }else{
5028                         for(i4x4=0; i4x4<4; i4x4++){
5029                             const int index= i4x4 + 4*i8x8;
5030
5031                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[s->qscale], 16) <0 ){
5032                                 return -1;
5033                             }
5034                         }
5035                     }
5036                 }else{
5037                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5038                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5039                 }
5040             }
5041         }
5042
5043         if(cbp&0x30){
5044             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
5045                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, h->dequant4_coeff[chroma_qp], 4) < 0){
5046                     return -1;
5047                 }
5048         }
5049
5050         if(cbp&0x20){
5051             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
5052                 for(i4x4=0; i4x4<4; i4x4++){
5053                     const int index= 16 + 4*chroma_idx + i4x4;
5054                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[chroma_qp], 15) < 0){
5055                         return -1;
5056                     }
5057                 }
5058             }
5059         }else{
5060             uint8_t * const nnz= &h->non_zero_count_cache[0];
5061             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5062             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5063         }
5064     }else{
5065         uint8_t * const nnz= &h->non_zero_count_cache[0];
5066         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5067         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5068         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5069     }
5070     s->current_picture.qscale_table[mb_xy]= s->qscale;
5071     write_back_non_zero_count(h);
5072
5073     return 0;
5074 }
5075
5076 static int decode_cabac_field_decoding_flag(H264Context *h) {
5077     MpegEncContext * const s = &h->s;
5078     const int mb_x = s->mb_x;
5079     const int mb_y = s->mb_y & ~1;
5080     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5081     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5082
5083     unsigned int ctx = 0;
5084
5085     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5086         ctx += 1;
5087     }
5088     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5089         ctx += 1;
5090     }
5091
5092     return get_cabac( &h->cabac, &h->cabac_state[70 + ctx] );
5093 }
5094
5095 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5096     uint8_t *state= &h->cabac_state[ctx_base];
5097     int mb_type;
5098
5099     if(intra_slice){
5100         MpegEncContext * const s = &h->s;
5101         const int mba_xy = h->left_mb_xy[0];
5102         const int mbb_xy = h->top_mb_xy;
5103         int ctx=0;
5104         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5105             ctx++;
5106         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5107             ctx++;
5108         if( get_cabac( &h->cabac, &state[ctx] ) == 0 )
5109             return 0;   /* I4x4 */
5110         state += 2;
5111     }else{
5112         if( get_cabac( &h->cabac, &state[0] ) == 0 )
5113             return 0;   /* I4x4 */
5114     }
5115
5116     if( get_cabac_terminate( &h->cabac ) )
5117         return 25;  /* PCM */
5118
5119     mb_type = 1; /* I16x16 */
5120     if( get_cabac( &h->cabac, &state[1] ) )
5121         mb_type += 12;  /* cbp_luma != 0 */
5122
5123     if( get_cabac( &h->cabac, &state[2] ) ) {
5124         if( get_cabac( &h->cabac, &state[2+intra_slice] ) )
5125             mb_type += 4 * 2;   /* cbp_chroma == 2 */
5126         else
5127             mb_type += 4 * 1;   /* cbp_chroma == 1 */
5128     }
5129     if( get_cabac( &h->cabac, &state[3+intra_slice] ) )
5130         mb_type += 2;
5131     if( get_cabac( &h->cabac, &state[3+2*intra_slice] ) )
5132         mb_type += 1;
5133     return mb_type;
5134 }
5135
5136 static int decode_cabac_mb_type( H264Context *h ) {
5137     MpegEncContext * const s = &h->s;
5138
5139     if( h->slice_type == I_TYPE ) {
5140         return decode_cabac_intra_mb_type(h, 3, 1);
5141     } else if( h->slice_type == P_TYPE ) {
5142         if( get_cabac( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5143             /* P-type */
5144             if( get_cabac( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5145                 if( get_cabac( &h->cabac, &h->cabac_state[16] ) == 0 )
5146                     return 0; /* P_L0_D16x16; */
5147                 else
5148                     return 3; /* P_8x8; */
5149             } else {
5150                 if( get_cabac( &h->cabac, &h->cabac_state[17] ) == 0 )
5151                     return 2; /* P_L0_D8x16; */
5152                 else
5153                     return 1; /* P_L0_D16x8; */
5154             }
5155         } else {
5156             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5157         }
5158     } else if( h->slice_type == B_TYPE ) {
5159         const int mba_xy = h->left_mb_xy[0];
5160         const int mbb_xy = h->top_mb_xy;
5161         int ctx = 0;
5162         int bits;
5163
5164         if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] )
5165                       && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5166             ctx++;
5167         if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] )
5168                       && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5169             ctx++;
5170
5171         if( !get_cabac( &h->cabac, &h->cabac_state[27+ctx] ) )
5172             return 0; /* B_Direct_16x16 */
5173
5174         if( !get_cabac( &h->cabac, &h->cabac_state[27+3] ) ) {
5175             return 1 + get_cabac( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5176         }
5177
5178         bits = get_cabac( &h->cabac, &h->cabac_state[27+4] ) << 3;
5179         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 2;
5180         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 1;
5181         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] );
5182         if( bits < 8 )
5183             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5184         else if( bits == 13 ) {
5185             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5186         } else if( bits == 14 )
5187             return 11; /* B_L1_L0_8x16 */
5188         else if( bits == 15 )
5189             return 22; /* B_8x8 */
5190
5191         bits= ( bits<<1 ) | get_cabac( &h->cabac, &h->cabac_state[27+5] );
5192         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5193     } else {
5194         /* TODO SI/SP frames? */
5195         return -1;
5196     }
5197 }
5198
5199 static int decode_cabac_mb_skip( H264Context *h) {
5200     MpegEncContext * const s = &h->s;
5201     const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5202     const int mba_xy = mb_xy - 1;
5203     const int mbb_xy = mb_xy - s->mb_stride;
5204     int ctx = 0;
5205
5206     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5207         ctx++;
5208     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5209         ctx++;
5210
5211     if( h->slice_type == P_TYPE || h->slice_type == SP_TYPE)
5212         return get_cabac( &h->cabac, &h->cabac_state[11+ctx] );
5213     else /* B-frame */
5214         return get_cabac( &h->cabac, &h->cabac_state[24+ctx] );
5215 }
5216
5217 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5218     int mode = 0;
5219
5220     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5221         return pred_mode;
5222
5223     if( get_cabac( &h->cabac, &h->cabac_state[69] ) )
5224         mode += 1;
5225     if( get_cabac( &h->cabac, &h->cabac_state[69] ) )
5226         mode += 2;
5227     if( get_cabac( &h->cabac, &h->cabac_state[69] ) )
5228         mode += 4;
5229     if( mode >= pred_mode )
5230         return mode + 1;
5231     else
5232         return mode;
5233 }
5234
5235 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5236     const int mba_xy = h->left_mb_xy[0];
5237     const int mbb_xy = h->top_mb_xy;
5238
5239     int ctx = 0;
5240
5241     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5242     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5243         ctx++;
5244
5245     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5246         ctx++;
5247
5248     if( get_cabac( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5249         return 0;
5250
5251     if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5252         return 1;
5253     if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5254         return 2;
5255     else
5256         return 3;
5257 }
5258
/* column (0..3) of each 4x4 block inside the 4x4 grid, indexed by block number */
static const uint8_t block_idx_x[16] = {
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3
};
/* row (0..3) of each 4x4 block inside the 4x4 grid, indexed by block number */
static const uint8_t block_idx_y[16] = {
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3
};
/* inverse of the two tables above: block number for a given [column][row] */
static const uint8_t block_idx_xy[4][4] = {
    {  0,  2,  8, 10 },
    {  1,  3,  9, 11 },
    {  4,  6, 12, 14 },
    {  5,  7, 13, 15 }
};
5271
5272 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5273     MpegEncContext * const s = &h->s;
5274
5275     int cbp = 0;
5276     int i8x8;
5277
5278     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5279         int cbp_a = -1;
5280         int cbp_b = -1;
5281         int x, y;
5282         int ctx = 0;
5283
5284         x = block_idx_x[4*i8x8];
5285         y = block_idx_y[4*i8x8];
5286
5287         if( x > 0 )
5288             cbp_a = cbp;
5289         else if( s->mb_x > 0 && (h->slice_table[h->left_mb_xy[0]] == h->slice_num)) {
5290             cbp_a = h->left_cbp;
5291             tprintf("cbp_a = left_cbp = %x\n", cbp_a);
5292         }
5293
5294         if( y > 0 )
5295             cbp_b = cbp;
5296         else if( s->mb_y > 0 && (h->slice_table[h->top_mb_xy] == h->slice_num)) {
5297             cbp_b = h->top_cbp;
5298             tprintf("cbp_b = top_cbp = %x\n", cbp_b);
5299         }
5300
5301         /* No need to test for skip as we put 0 for skip block */
5302         /* No need to test for IPCM as we put 1 for IPCM block */
5303         if( cbp_a >= 0 ) {
5304             int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
5305             if( ((cbp_a >> i8x8a)&0x01) == 0 )
5306                 ctx++;
5307         }
5308
5309         if( cbp_b >= 0 ) {
5310             int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
5311             if( ((cbp_b >> i8x8b)&0x01) == 0 )
5312                 ctx += 2;
5313         }
5314
5315         if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
5316             cbp |= 1 << i8x8;
5317         }
5318     }
5319     return cbp;
5320 }
5321 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5322     int ctx;
5323     int cbp_a, cbp_b;
5324
5325     cbp_a = (h->left_cbp>>4)&0x03;
5326     cbp_b = (h-> top_cbp>>4)&0x03;
5327
5328     ctx = 0;
5329     if( cbp_a > 0 ) ctx++;
5330     if( cbp_b > 0 ) ctx += 2;
5331     if( get_cabac( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5332         return 0;
5333
5334     ctx = 4;
5335     if( cbp_a == 2 ) ctx++;
5336     if( cbp_b == 2 ) ctx += 2;
5337     return 1 + get_cabac( &h->cabac, &h->cabac_state[77 + ctx] );
5338 }
5339 static int decode_cabac_mb_dqp( H264Context *h) {
5340     MpegEncContext * const s = &h->s;
5341     int mbn_xy;
5342     int   ctx = 0;
5343     int   val = 0;
5344
5345     if( s->mb_x > 0 )
5346         mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
5347     else
5348         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
5349
5350     if( h->last_qscale_diff != 0 && ( IS_INTRA16x16(s->current_picture.mb_type[mbn_xy] ) || (h->cbp_table[mbn_xy]&0x3f) ) )
5351         ctx++;
5352
5353     while( get_cabac( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5354         if( ctx < 2 )
5355             ctx = 2;
5356         else
5357             ctx = 3;
5358         val++;
5359     }
5360
5361     if( val&0x01 )
5362         return (val + 1)/2;
5363     else
5364         return -(val + 1)/2;
5365 }
5366 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5367     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5368         return 0;   /* 8x8 */
5369     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5370         return 1;   /* 8x4 */
5371     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5372         return 2;   /* 4x8 */
5373     return 3;       /* 4x4 */
5374 }
5375 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5376     int type;
5377     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5378         return 0;   /* B_Direct_8x8 */
5379     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5380         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5381     type = 3;
5382     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5383         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5384             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5385         type += 4;
5386     }
5387     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5388     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5389     return type;
5390 }
5391
5392 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5393     return get_cabac( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5394 }
5395
5396 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5397     int refa = h->ref_cache[list][scan8[n] - 1];
5398     int refb = h->ref_cache[list][scan8[n] - 8];
5399     int ref  = 0;
5400     int ctx  = 0;
5401
5402     if( h->slice_type == B_TYPE) {
5403         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5404             ctx++;
5405         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5406             ctx += 2;
5407     } else {
5408         if( refa > 0 )
5409             ctx++;
5410         if( refb > 0 )
5411             ctx += 2;
5412     }
5413
5414     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5415         ref++;
5416         if( ctx < 4 )
5417             ctx = 4;
5418         else
5419             ctx = 5;
5420     }
5421     return ref;
5422 }
5423
5424 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5425     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5426                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5427     int ctxbase = (l == 0) ? 40 : 47;
5428     int ctx, mvd;
5429
5430     if( amvd < 3 )
5431         ctx = 0;
5432     else if( amvd > 32 )
5433         ctx = 2;
5434     else
5435         ctx = 1;
5436
5437     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5438         return 0;
5439
5440     mvd= 1;
5441     ctx= 3;
5442     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5443         mvd++;
5444         if( ctx < 6 )
5445             ctx++;
5446     }
5447
5448     if( mvd >= 9 ) {
5449         int k = 3;
5450         while( get_cabac_bypass( &h->cabac ) ) {
5451             mvd += 1 << k;
5452             k++;
5453         }
5454         while( k-- ) {
5455             if( get_cabac_bypass( &h->cabac ) )
5456                 mvd += 1 << k;
5457         }
5458     }
5459     if( get_cabac_bypass( &h->cabac ) )  return -mvd;
5460     else                                 return  mvd;
5461 }
5462
5463 static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5464     int nza, nzb;
5465     int ctx = 0;
5466
5467     if( cat == 0 ) {
5468         nza = h->left_cbp&0x100;
5469         nzb = h-> top_cbp&0x100;
5470     } else if( cat == 1 || cat == 2 ) {
5471         nza = h->non_zero_count_cache[scan8[idx] - 1];
5472         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5473     } else if( cat == 3 ) {
5474         nza = (h->left_cbp>>(6+idx))&0x01;
5475         nzb = (h-> top_cbp>>(6+idx))&0x01;
5476     } else {
5477         assert(cat == 4);
5478         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5479         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5480     }
5481
5482     if( nza > 0 )
5483         ctx++;
5484
5485     if( nzb > 0 )
5486         ctx += 2;
5487
5488     return ctx + 4 * cat;
5489 }
5490
5491 static int inline decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint16_t *qmul, int max_coeff) {
5492     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5493     static const int significant_coeff_flag_field_offset[2] = { 105, 277 };
5494     static const int last_significant_coeff_flag_field_offset[2] = { 166, 338 };
5495     static const int significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 297 };
5496     static const int last_significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 251 };
5497     static const int coeff_abs_level_m1_offset[6] = { 227+0, 227+10, 227+20, 227+30, 227+39, 426 };
5498     static const int identity[15] = {
5499         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
5500     };
5501     static const int significant_coeff_flag_offset_8x8[63] = {
5502         0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5503         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5504         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5505        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12
5506     };
5507     static const int last_coeff_flag_offset_8x8[63] = {
5508         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5509         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5510         3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5511         5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5512     };
5513
5514     int index[64];
5515
5516     int i, last;
5517     int coeff_count = 0;
5518
5519     int abslevel1 = 1;
5520     int abslevelgt1 = 0;
5521
5522     const int* significant_coeff_ctx_offset;
5523     const int* last_coeff_ctx_offset;
5524     const int significant_coeff_ctx_base = significant_coeff_flag_offset[cat]
5525         + significant_coeff_flag_field_offset[h->mb_field_decoding_flag];
5526     const int last_coeff_ctx_base = last_significant_coeff_flag_offset[cat]
5527         + last_significant_coeff_flag_field_offset[h->mb_field_decoding_flag];
5528
5529     /* cat: 0-> DC 16x16  n = 0
5530      *      1-> AC 16x16  n = luma4x4idx
5531      *      2-> Luma4x4   n = luma4x4idx
5532      *      3-> DC Chroma n = iCbCr
5533      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5534      *      5-> Luma8x8   n = 4 * luma8x8idx
5535      */
5536
5537     /* read coded block flag */
5538     if( cat == 5 ) {
5539         significant_coeff_ctx_offset = significant_coeff_flag_offset_8x8;
5540         last_coeff_ctx_offset = last_coeff_flag_offset_8x8;
5541     } else {
5542         if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5543             if( cat == 1 || cat == 2 )
5544                 h->non_zero_count_cache[scan8[n]] = 0;
5545             else if( cat == 4 )
5546                 h->non_zero_count_cache[scan8[16+n]] = 0;
5547
5548             return 0;
5549         }
5550
5551         significant_coeff_ctx_offset =
5552         last_coeff_ctx_offset = identity;
5553     }
5554
5555     for(last= 0; last < max_coeff - 1; last++) {
5556         int sig_ctx = significant_coeff_ctx_base + significant_coeff_ctx_offset[last];
5557         if( get_cabac( &h->cabac, &h->cabac_state[sig_ctx] )) {
5558             int last_ctx = last_coeff_ctx_base + last_coeff_ctx_offset[last];
5559             index[coeff_count++] = last;
5560             if( get_cabac( &h->cabac, &h->cabac_state[last_ctx] ) ) {
5561                 last= max_coeff;
5562                 break;
5563             }
5564         }
5565     }
5566     if( last == max_coeff -1 ) {
5567         index[coeff_count++] = last;
5568     }
5569     assert(coeff_count > 0);
5570
5571     if( cat == 0 )
5572         h->cbp_table[mb_xy] |= 0x100;
5573     else if( cat == 1 || cat == 2 )
5574         h->non_zero_count_cache[scan8[n]] = coeff_count;
5575     else if( cat == 3 )
5576         h->cbp_table[mb_xy] |= 0x40 << n;
5577     else if( cat == 4 )
5578         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5579     else {
5580         assert( cat == 5 );
5581         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, 1, 1);
5582     }
5583
5584     for( i = coeff_count - 1; i >= 0; i-- ) {
5585         int ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + coeff_abs_level_m1_offset[cat];
5586         int j= scantable[index[i]];
5587
5588         if( get_cabac( &h->cabac, &h->cabac_state[ctx] ) == 0 ) {
5589             if( cat == 0 || cat == 3 ) {
5590                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -1;
5591                 else                                block[j] =  1;
5592             }else{
5593                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -qmul[j];
5594                 else                                block[j] =  qmul[j];
5595             }
5596
5597             abslevel1++;
5598         } else {
5599             int coeff_abs = 2;
5600             ctx = 5 + FFMIN( 4, abslevelgt1 ) + coeff_abs_level_m1_offset[cat];
5601             while( coeff_abs < 15 && get_cabac( &h->cabac, &h->cabac_state[ctx] ) ) {
5602                 coeff_abs++;
5603             }
5604
5605             if( coeff_abs >= 15 ) {
5606                 int j = 0;
5607                 while( get_cabac_bypass( &h->cabac ) ) {
5608                     coeff_abs += 1 << j;
5609                     j++;
5610                 }
5611
5612                 while( j-- ) {
5613                     if( get_cabac_bypass( &h->cabac ) )
5614                         coeff_abs += 1 << j ;
5615                 }
5616             }
5617
5618             if( cat == 0 || cat == 3 ) {
5619                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -coeff_abs;
5620                 else                                block[j] =  coeff_abs;
5621             }else{
5622                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -coeff_abs * qmul[j];
5623                 else                                block[j] =  coeff_abs * qmul[j];
5624             }
5625
5626             abslevelgt1++;
5627         }
5628     }
5629     return 0;
5630 }
5631
5632 void inline compute_mb_neighboors(H264Context *h)
5633 {
5634     MpegEncContext * const s = &h->s;
5635     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
5636     h->top_mb_xy     = mb_xy - s->mb_stride;
5637     h->left_mb_xy[0] = mb_xy - 1;
5638     if(h->mb_aff_frame){
5639         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5640         const int top_pair_xy      = pair_xy     - s->mb_stride;
5641         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5642         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5643         const int curr_mb_frame_flag = !h->mb_field_decoding_flag;
5644         const int bottom = (s->mb_y & 1);
5645         if (bottom
5646                 ? !curr_mb_frame_flag // bottom macroblock
5647                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5648                 ) {
5649             h->top_mb_xy -= s->mb_stride;
5650         }
5651         if (left_mb_frame_flag != curr_mb_frame_flag) {
5652             h->left_mb_xy[0] = pair_xy - 1;
5653         }
5654     }
5655     return;
5656 }
5657
5658 /**
5659  * decodes a macroblock
5660  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5661  */
5662 static int decode_mb_cabac(H264Context *h) {
5663     MpegEncContext * const s = &h->s;
5664     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5665     int mb_type, partition_count, cbp = 0;
5666     int dct8x8_allowed= h->pps.transform_8x8_mode;
5667
5668     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5669
5670     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5671     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
5672         /* read skip flags */
5673         if( decode_cabac_mb_skip( h ) ) {
5674             decode_mb_skip(h);
5675
5676             h->cbp_table[mb_xy] = 0;
5677             h->chroma_pred_mode_table[mb_xy] = 0;
5678             h->last_qscale_diff = 0;
5679
5680             return 0;
5681
5682         }
5683     }
5684     if(h->mb_aff_frame){
5685         if ( ((s->mb_y&1) == 0) || h->prev_mb_skipped)
5686             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5687     }else
5688         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5689
5690     h->prev_mb_skipped = 0;
5691
5692     compute_mb_neighboors(h);
5693     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5694         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5695         return -1;
5696     }
5697
5698     if( h->slice_type == B_TYPE ) {
5699         if( mb_type < 23 ){
5700             partition_count= b_mb_type_info[mb_type].partition_count;
5701             mb_type=         b_mb_type_info[mb_type].type;
5702         }else{
5703             mb_type -= 23;
5704             goto decode_intra_mb;
5705         }
5706     } else if( h->slice_type == P_TYPE ) {
5707         if( mb_type < 5) {
5708             partition_count= p_mb_type_info[mb_type].partition_count;
5709             mb_type=         p_mb_type_info[mb_type].type;
5710         } else {
5711             mb_type -= 5;
5712             goto decode_intra_mb;
5713         }
5714     } else {
5715        assert(h->slice_type == I_TYPE);
5716 decode_intra_mb:
5717         partition_count = 0;
5718         cbp= i_mb_type_info[mb_type].cbp;
5719         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5720         mb_type= i_mb_type_info[mb_type].type;
5721     }
5722     if(h->mb_field_decoding_flag)
5723         mb_type |= MB_TYPE_INTERLACED;
5724
5725     h->slice_table[ mb_xy ]= h->slice_num;
5726
5727     if(IS_INTRA_PCM(mb_type)) {
5728         const uint8_t *ptr;
5729         unsigned int x, y;
5730
5731         // We assume these blocks are very rare so we dont optimize it.
5732         // FIXME The two following lines get the bitstream position in the cabac
5733         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5734         ptr= h->cabac.bytestream;
5735         if (h->cabac.low&0x1) ptr-=CABAC_BITS/8;
5736
5737         // The pixels are stored in the same order as levels in h->mb array.
5738         for(y=0; y<16; y++){
5739             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5740             for(x=0; x<16; x++){
5741                 tprintf("LUMA ICPM LEVEL (%3d)\n", *ptr);
5742                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5743             }
5744         }
5745         for(y=0; y<8; y++){
5746             const int index= 256 + 4*(y&3) + 32*(y>>2);
5747             for(x=0; x<8; x++){
5748                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5749                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5750             }
5751         }
5752         for(y=0; y<8; y++){
5753             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5754             for(x=0; x<8; x++){
5755                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5756                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5757             }
5758         }
5759
5760         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5761
5762         // All blocks are present
5763         h->cbp_table[mb_xy] = 0x1ef;
5764         h->chroma_pred_mode_table[mb_xy] = 0;
5765         // In deblocking, the quantizer is 0
5766         s->current_picture.qscale_table[mb_xy]= 0;
5767         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
5768         // All coeffs are present
5769         memset(h->non_zero_count[mb_xy], 16, 16);
5770         s->current_picture.mb_type[mb_xy]= mb_type;
5771         return 0;
5772     }
5773
5774     fill_caches(h, mb_type, 0);
5775
5776     if( IS_INTRA( mb_type ) ) {
5777         int i;
5778         if( IS_INTRA4x4( mb_type ) ) {
5779             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5780                 mb_type |= MB_TYPE_8x8DCT;
5781                 for( i = 0; i < 16; i+=4 ) {
5782                     int pred = pred_intra_mode( h, i );
5783                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5784                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5785                 }
5786             } else {
5787                 for( i = 0; i < 16; i++ ) {
5788                     int pred = pred_intra_mode( h, i );
5789                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5790
5791                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5792                 }
5793             }
5794             write_back_intra_pred_mode(h);
5795             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5796         } else {
5797             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5798             if( h->intra16x16_pred_mode < 0 ) return -1;
5799         }
5800         h->chroma_pred_mode_table[mb_xy] =
5801             h->chroma_pred_mode          = decode_cabac_mb_chroma_pre_mode( h );
5802
5803         h->chroma_pred_mode= check_intra_pred_mode( h, h->chroma_pred_mode );
5804         if( h->chroma_pred_mode < 0 ) return -1;
5805     } else if( partition_count == 4 ) {
5806         int i, j, sub_partition_count[4], list, ref[2][4];
5807
5808         if( h->slice_type == B_TYPE ) {
5809             for( i = 0; i < 4; i++ ) {
5810                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5811                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5812                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5813             }
5814             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
5815                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
5816                 pred_direct_motion(h, &mb_type);
5817                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5818                     for( i = 0; i < 4; i++ )
5819                         if( IS_DIRECT(h->sub_mb_type[i]) )
5820                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5821                 }
5822             }
5823         } else {
5824             for( i = 0; i < 4; i++ ) {
5825                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5826                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5827                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5828             }
5829         }
5830
5831         for( list = 0; list < 2; list++ ) {
5832             if( h->ref_count[list] > 0 ) {
5833                 for( i = 0; i < 4; i++ ) {
5834                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5835                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5836                         if( h->ref_count[list] > 1 )
5837                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5838                         else
5839                             ref[list][i] = 0;
5840                     } else {
5841                         ref[list][i] = -1;
5842                     }
5843                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5844                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5845                 }
5846             }
5847         }
5848
5849         if(dct8x8_allowed)
5850             dct8x8_allowed = get_dct8x8_allowed(h);
5851
5852         for(list=0; list<2; list++){
5853             for(i=0; i<4; i++){
5854                 if(IS_DIRECT(h->sub_mb_type[i])){
5855                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5856                     continue;
5857                 }
5858                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5859
5860                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5861                     const int sub_mb_type= h->sub_mb_type[i];
5862                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5863                     for(j=0; j<sub_partition_count[i]; j++){
5864                         int mpx, mpy;
5865                         int mx, my;
5866                         const int index= 4*i + block_width*j;
5867                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5868                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5869                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5870
5871                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5872                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5873                         tprintf("final mv:%d %d\n", mx, my);
5874
5875                         if(IS_SUB_8X8(sub_mb_type)){
5876                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
5877                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5878                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
5879                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5880
5881                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]=
5882                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5883                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]=
5884                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5885                         }else if(IS_SUB_8X4(sub_mb_type)){
5886                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
5887                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
5888
5889                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]= mx- mpx;
5890                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]= my - mpy;
5891                         }else if(IS_SUB_4X8(sub_mb_type)){
5892                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
5893                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
5894
5895                             mvd_cache[ 0 ][0]= mvd_cache[ 8 ][0]= mx - mpx;
5896                             mvd_cache[ 0 ][1]= mvd_cache[ 8 ][1]= my - mpy;
5897                         }else{
5898                             assert(IS_SUB_4X4(sub_mb_type));
5899                             mv_cache[ 0 ][0]= mx;
5900                             mv_cache[ 0 ][1]= my;
5901
5902                             mvd_cache[ 0 ][0]= mx - mpx;
5903                             mvd_cache[ 0 ][1]= my - mpy;
5904                         }
5905                     }
5906                 }else{
5907                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5908                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5909                     p[0] = p[1] = p[8] = p[9] = 0;
5910                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5911                 }
5912             }
5913         }
5914     } else if( IS_DIRECT(mb_type) ) {
5915         pred_direct_motion(h, &mb_type);
5916         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5917         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5918         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5919     } else {
5920         int list, mx, my, i, mpx, mpy;
5921         if(IS_16X16(mb_type)){
5922             for(list=0; list<2; list++){
5923                 if(IS_DIR(mb_type, 0, list)){
5924                     if(h->ref_count[list] > 0 ){
5925                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5926                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5927                     }
5928                 }else
5929                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
5930             }
5931             for(list=0; list<2; list++){
5932                 if(IS_DIR(mb_type, 0, list)){
5933                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5934
5935                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5936                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5937                     tprintf("final mv:%d %d\n", mx, my);
5938
5939                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5940                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5941                 }else
5942                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5943             }
5944         }
5945         else if(IS_16X8(mb_type)){
5946             for(list=0; list<2; list++){
5947                 if(h->ref_count[list]>0){
5948                     for(i=0; i<2; i++){
5949                         if(IS_DIR(mb_type, i, list)){
5950                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5951                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5952                         }else
5953                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5954                     }
5955                 }
5956             }
5957             for(list=0; list<2; list++){
5958                 for(i=0; i<2; i++){
5959                     if(IS_DIR(mb_type, i, list)){
5960                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5961                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5962                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5963                         tprintf("final mv:%d %d\n", mx, my);
5964
5965                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5966                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5967                     }else{
5968                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5969                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5970                     }
5971                 }
5972             }
5973         }else{
5974             assert(IS_8X16(mb_type));
5975             for(list=0; list<2; list++){
5976                 if(h->ref_count[list]>0){
5977                     for(i=0; i<2; i++){
5978                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5979                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5980                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5981                         }else
5982                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5983                     }
5984                 }
5985             }
5986             for(list=0; list<2; list++){
5987                 for(i=0; i<2; i++){
5988                     if(IS_DIR(mb_type, i, list)){
5989                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5990                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5991                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5992
5993                         tprintf("final mv:%d %d\n", mx, my);
5994                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5995                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5996                     }else{
5997                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5998                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5999                     }
6000                 }
6001             }
6002         }
6003     }
6004
6005    if( IS_INTER( mb_type ) ) {
6006         h->chroma_pred_mode_table[mb_xy] = 0;
6007         write_back_motion( h, mb_type );
6008    }
6009
6010     if( !IS_INTRA16x16( mb_type ) ) {
6011         cbp  = decode_cabac_mb_cbp_luma( h );
6012         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6013     }
6014
6015     h->cbp_table[mb_xy] = cbp;
6016
6017     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6018         if( decode_cabac_mb_transform_size( h ) )
6019             mb_type |= MB_TYPE_8x8DCT;
6020     }
6021     s->current_picture.mb_type[mb_xy]= mb_type;
6022
6023     if( cbp || IS_INTRA16x16( mb_type ) ) {
6024         const uint8_t *scan, *dc_scan;
6025         int dqp;
6026
6027         if(IS_INTERLACED(mb_type)){
6028             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6029             dc_scan= luma_dc_field_scan;
6030         }else{
6031             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6032             dc_scan= luma_dc_zigzag_scan;
6033         }
6034
6035         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6036         s->qscale += dqp;
6037         if(((unsigned)s->qscale) > 51){
6038             if(s->qscale<0) s->qscale+= 52;
6039             else            s->qscale-= 52;
6040         }
6041         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
6042
6043         if( IS_INTRA16x16( mb_type ) ) {
6044             int i;
6045             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6046             if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, h->dequant4_coeff[s->qscale], 16) < 0)
6047                 return -1;
6048             if( cbp&15 ) {
6049                 for( i = 0; i < 16; i++ ) {
6050                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6051                     if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[s->qscale], 15) < 0 )
6052                         return -1;
6053                 }
6054             } else {
6055                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6056             }
6057         } else {
6058             int i8x8, i4x4;
6059             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6060                 if( cbp & (1<<i8x8) ) {
6061                     if( IS_8x8DCT(mb_type) ) {
6062                         if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6063                                 zigzag_scan8x8, h->dequant8_coeff[s->qscale], 64) < 0 )
6064                             return -1;
6065                         if(s->qscale < 12){
6066                             int i;
6067                             for(i=0; i<64; i++)
6068                                 h->mb[64*i8x8+i] = (h->mb[64*i8x8+i] + 2) >> 2;
6069                         }
6070                     } else
6071                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6072                         const int index = 4*i8x8 + i4x4;
6073                         //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6074                         if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[s->qscale], 16) < 0 )
6075                             return -1;
6076                     }
6077                 } else {
6078                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6079                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6080                 }
6081             }
6082         }
6083
6084         if( cbp&0x30 ){
6085             int c;
6086             for( c = 0; c < 2; c++ ) {
6087                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6088                 if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, h->dequant4_coeff[h->chroma_qp], 4) < 0)
6089                     return -1;
6090             }
6091         }
6092
6093         if( cbp&0x20 ) {
6094             int c, i;
6095             for( c = 0; c < 2; c++ ) {
6096                 for( i = 0; i < 4; i++ ) {
6097                     const int index = 16 + 4 * c + i;
6098                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6099                     if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->dequant4_coeff[h->chroma_qp], 15) < 0)
6100                         return -1;
6101                 }
6102             }
6103         } else {
6104             uint8_t * const nnz= &h->non_zero_count_cache[0];
6105             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6106             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6107         }
6108     } else {
6109         uint8_t * const nnz= &h->non_zero_count_cache[0];
6110         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6111         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6112         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6113     }
6114
6115     s->current_picture.qscale_table[mb_xy]= s->qscale;
6116     write_back_non_zero_count(h);
6117
6118     return 0;
6119 }
6120
6121
6122 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6123     int i, d;
6124     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6125     const int alpha = alpha_table[index_a];
6126     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6127
6128     if( bS[0] < 4 ) {
6129         int8_t tc[4];
6130         for(i=0; i<4; i++)
6131             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
6132         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6133     } else {
6134         /* 16px edge length, because bS=4 is triggered by being at
6135          * the edge of an intra MB, so all 4 bS are the same */
6136             for( d = 0; d < 16; d++ ) {
6137                 const int p0 = pix[-1];
6138                 const int p1 = pix[-2];
6139                 const int p2 = pix[-3];
6140
6141                 const int q0 = pix[0];
6142                 const int q1 = pix[1];
6143                 const int q2 = pix[2];
6144
6145                 if( ABS( p0 - q0 ) < alpha &&
6146                     ABS( p1 - p0 ) < beta &&
6147                     ABS( q1 - q0 ) < beta ) {
6148
6149                     if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6150                         if( ABS( p2 - p0 ) < beta)
6151                         {
6152                             const int p3 = pix[-4];
6153                             /* p0', p1', p2' */
6154                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6155                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6156                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6157                         } else {
6158                             /* p0' */
6159                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6160                         }
6161                         if( ABS( q2 - q0 ) < beta)
6162                         {
6163                             const int q3 = pix[3];
6164                             /* q0', q1', q2' */
6165                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6166                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6167                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6168                         } else {
6169                             /* q0' */
6170                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6171                         }
6172                     }else{
6173                         /* p0', q0' */
6174                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6175                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6176                     }
6177                     tprintf("filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6178                 }
6179                 pix += stride;
6180             }
6181     }
6182 }
6183 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6184     int i, d;
6185     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6186     const int alpha = alpha_table[index_a];
6187     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6188
6189     if( bS[0] < 4 ) {
6190         int8_t tc[4];
6191         for(i=0; i<4; i++)
6192             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
6193         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6194     } else {
6195         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6196     }
6197 }
6198
6199 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int bS[8], int qp[2] ) {
6200     int i;
6201     for( i = 0; i < 16; i++, pix += stride) {
6202         int index_a;
6203         int alpha;
6204         int beta;
6205
6206         int qp_index;
6207         int bS_index = (i >> 1);
6208         if (h->mb_field_decoding_flag) {
6209             bS_index &= ~1;
6210             bS_index |= (i & 1);
6211         }
6212
6213         if( bS[bS_index] == 0 ) {
6214             continue;
6215         }
6216
6217         qp_index = h->mb_field_decoding_flag ? (i & 1) : (i >> 3);
6218         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
6219         alpha = alpha_table[index_a];
6220         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
6221
6222
6223         if( bS[bS_index] < 4 ) {
6224             const int tc0 = tc0_table[index_a][bS[bS_index] - 1];
6225             /* 4px edge length */
6226             const int p0 = pix[-1];
6227             const int p1 = pix[-2];
6228             const int p2 = pix[-3];
6229             const int q0 = pix[0];
6230             const int q1 = pix[1];
6231             const int q2 = pix[2];
6232
6233             if( ABS( p0 - q0 ) < alpha &&
6234                 ABS( p1 - p0 ) < beta &&
6235                 ABS( q1 - q0 ) < beta ) {
6236                 int tc = tc0;
6237                 int i_delta;
6238
6239                 if( ABS( p2 - p0 ) < beta ) {
6240                     pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6241                     tc++;
6242                 }
6243                 if( ABS( q2 - q0 ) < beta ) {
6244                     pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6245                     tc++;
6246                 }
6247
6248                 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6249                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
6250                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
6251                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6252             }
6253         }else{
6254             /* 4px edge length */
6255             const int p0 = pix[-1];
6256             const int p1 = pix[-2];
6257             const int p2 = pix[-3];
6258
6259             const int q0 = pix[0];
6260             const int q1 = pix[1];
6261             const int q2 = pix[2];
6262
6263             if( ABS( p0 - q0 ) < alpha &&
6264                 ABS( p1 - p0 ) < beta &&
6265                 ABS( q1 - q0 ) < beta ) {
6266
6267                 if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6268                     if( ABS( p2 - p0 ) < beta)
6269                     {
6270                         const int p3 = pix[-4];
6271                         /* p0', p1', p2' */
6272                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6273                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6274                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6275                     } else {
6276                         /* p0' */
6277                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6278                     }
6279                     if( ABS( q2 - q0 ) < beta)
6280                     {
6281                         const int q3 = pix[3];
6282                         /* q0', q1', q2' */
6283                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6284                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6285                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6286                     } else {
6287                         /* q0' */
6288                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6289                     }
6290                 }else{
6291                     /* p0', q0' */
6292                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6293                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6294                 }
6295                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6296             }
6297         }
6298     }
6299 }
6300 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp[2] ) {
6301     int i;
6302     for( i = 0; i < 8; i++, pix += stride) {
6303         int index_a;
6304         int alpha;
6305         int beta;
6306
6307         int qp_index;
6308         int bS_index = i;
6309
6310         if( bS[bS_index] == 0 ) {
6311             continue;
6312         }
6313
6314         qp_index = h->mb_field_decoding_flag ? (i & 1) : (i >> 3);
6315         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
6316         alpha = alpha_table[index_a];
6317         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
6318         if( bS[bS_index] < 4 ) {
6319             const int tc = tc0_table[index_a][bS[bS_index] - 1] + 1;
6320             /* 2px edge length (because we use same bS than the one for luma) */
6321             const int p0 = pix[-1];
6322             const int p1 = pix[-2];
6323             const int q0 = pix[0];
6324             const int q1 = pix[1];
6325
6326             if( ABS( p0 - q0 ) < alpha &&
6327                 ABS( p1 - p0 ) < beta &&
6328                 ABS( q1 - q0 ) < beta ) {
6329                 const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6330
6331                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
6332                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
6333                 tprintf("filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6334             }
6335         }else{
6336             const int p0 = pix[-1];
6337             const int p1 = pix[-2];
6338             const int q0 = pix[0];
6339             const int q1 = pix[1];
6340
6341             if( ABS( p0 - q0 ) < alpha &&
6342                 ABS( p1 - p0 ) < beta &&
6343                 ABS( q1 - q0 ) < beta ) {
6344
6345                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6346                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6347                 tprintf("filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6348             }
6349         }
6350     }
6351 }
6352
6353 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6354     int i, d;
6355     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6356     const int alpha = alpha_table[index_a];
6357     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6358     const int pix_next  = stride;
6359
6360     if( bS[0] < 4 ) {
6361         int8_t tc[4];
6362         for(i=0; i<4; i++)
6363             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
6364         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6365     } else {
6366         /* 16px edge length, see filter_mb_edgev */
6367             for( d = 0; d < 16; d++ ) {
6368                 const int p0 = pix[-1*pix_next];
6369                 const int p1 = pix[-2*pix_next];
6370                 const int p2 = pix[-3*pix_next];
6371                 const int q0 = pix[0];
6372                 const int q1 = pix[1*pix_next];
6373                 const int q2 = pix[2*pix_next];
6374
6375                 if( ABS( p0 - q0 ) < alpha &&
6376                     ABS( p1 - p0 ) < beta &&
6377                     ABS( q1 - q0 ) < beta ) {
6378
6379                     const int p3 = pix[-4*pix_next];
6380                     const int q3 = pix[ 3*pix_next];
6381
6382                     if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6383                         if( ABS( p2 - p0 ) < beta) {
6384                             /* p0', p1', p2' */
6385                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6386                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6387                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6388                         } else {
6389                             /* p0' */
6390                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6391                         }
6392                         if( ABS( q2 - q0 ) < beta) {
6393                             /* q0', q1', q2' */
6394                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6395                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6396                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6397                         } else {
6398                             /* q0' */
6399                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6400                         }
6401                     }else{
6402                         /* p0', q0' */
6403                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6404                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6405                     }
6406                     tprintf("filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6407                 }
6408                 pix++;
6409             }
6410     }
6411 }
6412
6413 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6414     int i, d;
6415     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6416     const int alpha = alpha_table[index_a];
6417     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6418
6419     if( bS[0] < 4 ) {
6420         int8_t tc[4];
6421         for(i=0; i<4; i++)
6422             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
6423         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6424     } else {
6425         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6426     }
6427 }
6428
6429 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6430     MpegEncContext * const s = &h->s;
6431     const int mb_xy= mb_x + mb_y*s->mb_stride;
6432     int first_vertical_edge_done = 0;
6433     int dir;
6434     /* FIXME: A given frame may occupy more than one position in
6435      * the reference list. So ref2frm should be populated with
6436      * frame numbers, not indices. */
6437     static const int ref2frm[18] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
6438
6439     if (h->mb_aff_frame
6440             // left mb is in picture
6441             && h->slice_table[mb_xy-1] != 255
6442             // and current and left pair do not have the same interlaced type
6443             && (IS_INTERLACED(s->current_picture.mb_type[mb_xy]) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6444             // and left mb is in the same slice if deblocking_filter == 2
6445             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6446         /* First vertical edge is different in MBAFF frames
6447          * There are 8 different bS to compute and 2 different Qp
6448          */
6449         int bS[8];
6450         int qp[2];
6451         int chroma_qp[2];
6452
6453         int i;
6454         first_vertical_edge_done = 1;
6455         for( i = 0; i < 8; i++ ) {
6456             int y = i>>1;
6457             int b_idx= 8 + 4 + 8*y;
6458             int bn_idx= b_idx - 1;
6459
6460             int mbn_xy = h->mb_field_decoding_flag ? h->left_mb_xy[i>>2] : h->left_mb_xy[i&1];
6461
6462             if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
6463                 IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
6464                 bS[i] = 4;
6465             } else if( h->non_zero_count_cache[b_idx] != 0 ||
6466                 /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6467                 h->non_zero_count_cache[bn_idx] != 0 ) {
6468                 bS[i] = 2;
6469             } else {
6470                 int l;
6471                 bS[i] = 0;
6472                 for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6473                     if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6474                         ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6475                         ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4 ) {
6476                         bS[i] = 1;
6477                         break;
6478                     }
6479                 }
6480             }
6481         }
6482         if(bS[0]+bS[1]+bS[2]+bS[3] != 0) {
6483             // Do not use s->qscale as luma quantizer because it has not the same
6484             // value in IPCM macroblocks.
6485             qp[0] = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[h->left_mb_xy[0]] + 1 ) >> 1;
6486             chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy] ) +
6487                              get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[h->left_mb_xy[0]] ) + 1 ) >> 1;
6488             qp[1] = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[h->left_mb_xy[1]] + 1 ) >> 1;
6489             chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy] ) +
6490                              get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[h->left_mb_xy[1]] ) + 1 ) >> 1;
6491
6492             /* Filter edge */
6493             tprintf("filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
6494             { int i; for (i = 0; i < 8; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6495             filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6496             filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
6497             filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
6498         }
6499     }
6500     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6501     for( dir = 0; dir < 2; dir++ )
6502     {
6503         int edge;
6504         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6505         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6506
6507         if (first_vertical_edge_done) {
6508             start = 1;
6509             first_vertical_edge_done = 0;
6510         }
6511
6512         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6513             start = 1;
6514
6515         /* Calculate bS */
6516         for( edge = start; edge < 4; edge++ ) {
6517             /* mbn_xy: neighbor macroblock */
6518             int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6519             int bS[4];
6520             int qp;
6521
6522             if( (edge&1) && IS_8x8DCT(s->current_picture.mb_type[mb_xy]) )
6523                 continue;
6524
6525             if (h->mb_aff_frame && (dir == 1) && (edge == 0) && ((mb_y & 1) == 0)
6526                 && !IS_INTERLACED(s->current_picture.mb_type[mb_xy])
6527                 && IS_INTERLACED(s->current_picture.mb_type[mbn_xy])
6528                 ) {
6529                 // This is a special case in the norm where the filtering must
6530                 // be done twice (one each of the field) even if we are in a
6531                 // frame macroblock.
6532                 //
6533                 unsigned int tmp_linesize   = 2 *   linesize;
6534                 unsigned int tmp_uvlinesize = 2 * uvlinesize;
6535                 int mbn_xy = mb_xy - 2 * s->mb_stride;
6536                 int qp, chroma_qp;
6537
6538                 // first filtering
6539                 if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
6540                     IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
6541                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6542                 } else {
6543                     // TODO
6544                     assert(0);
6545                 }
6546                 /* Filter edge */
6547                 // Do not use s->qscale as luma quantizer because it has not the same
6548                 // value in IPCM macroblocks.
6549                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6550                 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6551                 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6552                 filter_mb_edgeh( h, &img_y[0], tmp_linesize, bS, qp );
6553                 chroma_qp = ( h->chroma_qp +
6554                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6555                 filter_mb_edgech( h, &img_cb[0], tmp_uvlinesize, bS, chroma_qp );
6556                 filter_mb_edgech( h, &img_cr[0], tmp_uvlinesize, bS, chroma_qp );
6557
6558                 // second filtering
6559                 mbn_xy += s->mb_stride;
6560                 if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
6561                     IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
6562                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6563                 } else {
6564                     // TODO
6565                     assert(0);
6566                 }
6567                 /* Filter edge */
6568                 // Do not use s->qscale as luma quantizer because it has not the same
6569                 // value in IPCM macroblocks.
6570                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6571                 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6572                 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6573                 filter_mb_edgeh( h, &img_y[linesize], tmp_linesize, bS, qp );
6574                 chroma_qp = ( h->chroma_qp +
6575                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6576                 filter_mb_edgech( h, &img_cb[uvlinesize], tmp_uvlinesize, bS, chroma_qp );
6577                 filter_mb_edgech( h, &img_cr[uvlinesize], tmp_uvlinesize, bS, chroma_qp );
6578                 continue;
6579             }
6580             if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
6581                 IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
6582                 int value;
6583                 if (edge == 0) {
6584                     if (   (!IS_INTERLACED(s->current_picture.mb_type[mb_xy]) && !IS_INTERLACED(s->current_picture.mb_type[mbm_xy]))
6585                         || ((h->mb_aff_frame || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6586                     ) {
6587                         value = 4;
6588                     } else {
6589                         value = 3;
6590                     }
6591                 } else {
6592                     value = 3;
6593                 }
6594                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6595             } else {
6596                 int i;
6597                 for( i = 0; i < 4; i++ ) {
6598                     int x = dir == 0 ? edge : i;
6599                     int y = dir == 0 ? i    : edge;
6600                     int b_idx= 8 + 4 + x + 8*y;
6601                     int bn_idx= b_idx - (dir ? 8:1);
6602
6603                     if( h->non_zero_count_cache[b_idx] != 0 ||
6604                         h->non_zero_count_cache[bn_idx] != 0 ) {
6605                         bS[i] = 2;
6606                     }
6607                     else
6608                     {
6609                         int l;
6610                         bS[i] = 0;
6611                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6612                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6613                                 ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6614                                 ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4 ) {
6615                                 bS[i] = 1;
6616                                 break;
6617                             }
6618                         }
6619                     }
6620                 }
6621
6622                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6623                     continue;
6624             }
6625
6626             /* Filter edge */
6627             // Do not use s->qscale as luma quantizer because it has not the same
6628             // value in IPCM macroblocks.
6629             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6630             //tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6631             tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6632             { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6633             if( dir == 0 ) {
6634                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6635                 if( (edge&1) == 0 ) {
6636                     int chroma_qp = ( h->chroma_qp +
6637                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6638                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
6639                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
6640                 }
6641             } else {
6642                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6643                 if( (edge&1) == 0 ) {
6644                     int chroma_qp = ( h->chroma_qp +
6645                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6646                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
6647                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
6648                 }
6649             }
6650         }
6651     }
6652 }
6653
6654 static int decode_slice(H264Context *h){
6655     MpegEncContext * const s = &h->s;
6656     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6657
6658     s->mb_skip_run= -1;
6659
6660     if( h->pps.cabac ) {
6661         int i;
6662
6663         /* realign */
6664         align_get_bits( &s->gb );
6665
6666         /* init cabac */
6667         ff_init_cabac_states( &h->cabac, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64 );
6668         ff_init_cabac_decoder( &h->cabac,
6669                                s->gb.buffer + get_bits_count(&s->gb)/8,
6670                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6671         /* calculate pre-state */
6672         for( i= 0; i < 460; i++ ) {
6673             int pre;
6674             if( h->slice_type == I_TYPE )
6675                 pre = clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6676             else
6677                 pre = clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6678
6679             if( pre <= 63 )
6680                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6681             else
6682                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6683         }
6684
6685         for(;;){
6686             int ret = decode_mb_cabac(h);
6687             int eos;
6688
6689             if(ret>=0) hl_decode_mb(h);
6690
6691             /* XXX: useless as decode_mb_cabac it doesn't support that ... */
6692             if( ret >= 0 && h->mb_aff_frame ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6693                 s->mb_y++;
6694
6695                 if(ret>=0) ret = decode_mb_cabac(h);
6696
6697                 hl_decode_mb(h);
6698                 s->mb_y--;
6699             }
6700             eos = get_cabac_terminate( &h->cabac );
6701
6702             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 1) {
6703                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6704                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6705                 return -1;
6706             }
6707
6708             if( ++s->mb_x >= s->mb_width ) {
6709                 s->mb_x = 0;
6710                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6711                 ++s->mb_y;
6712                 if(h->mb_aff_frame) {
6713                     ++s->mb_y;
6714                 }
6715             }
6716
6717             if( eos || s->mb_y >= s->mb_height ) {
6718                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6719                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6720                 return 0;
6721             }
6722         }
6723
6724     } else {
6725         for(;;){
6726             int ret = decode_mb_cavlc(h);
6727
6728             if(ret>=0) hl_decode_mb(h);
6729
6730             if(ret>=0 && h->mb_aff_frame){ //FIXME optimal? or let mb_decode decode 16x32 ?
6731                 s->mb_y++;
6732                 ret = decode_mb_cavlc(h);
6733
6734                 if(ret>=0) hl_decode_mb(h);
6735                 s->mb_y--;
6736             }
6737
6738             if(ret<0){
6739                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6740                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6741
6742                 return -1;
6743             }
6744
6745             if(++s->mb_x >= s->mb_width){
6746                 s->mb_x=0;
6747                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6748                 ++s->mb_y;
6749                 if(h->mb_aff_frame) {
6750                     ++s->mb_y;
6751                 }
6752                 if(s->mb_y >= s->mb_height){
6753                     tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6754
6755                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6756                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6757
6758                         return 0;
6759                     }else{
6760                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6761
6762                         return -1;
6763                     }
6764                 }
6765             }
6766
6767             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6768                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6769                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6770                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6771
6772                     return 0;
6773                 }else{
6774                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6775
6776                     return -1;
6777                 }
6778             }
6779         }
6780     }
6781
6782 #if 0
6783     for(;s->mb_y < s->mb_height; s->mb_y++){
6784         for(;s->mb_x < s->mb_width; s->mb_x++){
6785             int ret= decode_mb(h);
6786
6787             hl_decode_mb(h);
6788
6789             if(ret<0){
6790                 fprintf(stderr, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6791                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6792
6793                 return -1;
6794             }
6795
6796             if(++s->mb_x >= s->mb_width){
6797                 s->mb_x=0;
6798                 if(++s->mb_y >= s->mb_height){
6799                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6800                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6801
6802                         return 0;
6803                     }else{
6804                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6805
6806                         return -1;
6807                     }
6808                 }
6809             }
6810
6811             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6812                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6813                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6814
6815                     return 0;
6816                 }else{
6817                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6818
6819                     return -1;
6820                 }
6821             }
6822         }
6823         s->mb_x=0;
6824         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6825     }
6826 #endif
6827     return -1; //not reached
6828 }
6829
6830 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6831     MpegEncContext * const s = &h->s;
6832     int cpb_count, i;
6833     cpb_count = get_ue_golomb(&s->gb) + 1;
6834     get_bits(&s->gb, 4); /* bit_rate_scale */
6835     get_bits(&s->gb, 4); /* cpb_size_scale */
6836     for(i=0; i<cpb_count; i++){
6837         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6838         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6839         get_bits1(&s->gb);     /* cbr_flag */
6840     }
6841     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6842     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6843     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6844     get_bits(&s->gb, 5); /* time_offset_length */
6845 }
6846
6847 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6848     MpegEncContext * const s = &h->s;
6849     int aspect_ratio_info_present_flag, aspect_ratio_idc;
6850     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
6851
6852     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6853
6854     if( aspect_ratio_info_present_flag ) {
6855         aspect_ratio_idc= get_bits(&s->gb, 8);
6856         if( aspect_ratio_idc == EXTENDED_SAR ) {
6857             sps->sar.num= get_bits(&s->gb, 16);
6858             sps->sar.den= get_bits(&s->gb, 16);
6859         }else if(aspect_ratio_idc < 16){
6860             sps->sar=  pixel_aspect[aspect_ratio_idc];
6861         }else{
6862             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6863             return -1;
6864         }
6865     }else{
6866         sps->sar.num=
6867         sps->sar.den= 0;
6868     }
6869 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6870
6871     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6872         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6873     }
6874
6875     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6876         get_bits(&s->gb, 3);    /* video_format */
6877         get_bits1(&s->gb);      /* video_full_range_flag */
6878         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6879             get_bits(&s->gb, 8); /* colour_primaries */
6880             get_bits(&s->gb, 8); /* transfer_characteristics */
6881             get_bits(&s->gb, 8); /* matrix_coefficients */
6882         }
6883     }
6884
6885     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6886         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6887         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6888     }
6889
6890     sps->timing_info_present_flag = get_bits1(&s->gb);
6891     if(sps->timing_info_present_flag){
6892         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6893         sps->time_scale = get_bits_long(&s->gb, 32);
6894         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6895     }
6896
6897     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6898     if(nal_hrd_parameters_present_flag)
6899         decode_hrd_parameters(h, sps);
6900     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6901     if(vcl_hrd_parameters_present_flag)
6902         decode_hrd_parameters(h, sps);
6903     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
6904         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6905     get_bits1(&s->gb);         /* pic_struct_present_flag */
6906
6907     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6908     if(sps->bitstream_restriction_flag){
6909         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6910         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6911         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6912         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6913         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6914         sps->num_reorder_frames = get_ue_golomb(&s->gb);
6915         get_ue_golomb(&s->gb); /* max_dec_frame_buffering */
6916     }
6917
6918     return 0;
6919 }
6920
6921 static inline int decode_seq_parameter_set(H264Context *h){
6922     MpegEncContext * const s = &h->s;
6923     int profile_idc, level_idc;
6924     int sps_id, i;
6925     SPS *sps;
6926
6927     profile_idc= get_bits(&s->gb, 8);
6928     get_bits1(&s->gb);   //constraint_set0_flag
6929     get_bits1(&s->gb);   //constraint_set1_flag
6930     get_bits1(&s->gb);   //constraint_set2_flag
6931     get_bits1(&s->gb);   //constraint_set3_flag
6932     get_bits(&s->gb, 4); // reserved
6933     level_idc= get_bits(&s->gb, 8);
6934     sps_id= get_ue_golomb(&s->gb);
6935
6936     sps= &h->sps_buffer[ sps_id ];
6937     sps->profile_idc= profile_idc;
6938     sps->level_idc= level_idc;
6939
6940     if(sps->profile_idc >= 100){ //high profile
6941         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
6942             get_bits1(&s->gb);  //residual_color_transform_flag
6943         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
6944         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
6945         sps->transform_bypass = get_bits1(&s->gb);
6946         if(get_bits1(&s->gb)){  //seq_scaling_matrix_present_flag
6947             av_log(h->s.avctx, AV_LOG_ERROR, "custom scaling matrix not implemented\n");
6948             return -1;
6949         }
6950     }
6951
6952     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
6953     sps->poc_type= get_ue_golomb(&s->gb);
6954
6955     if(sps->poc_type == 0){ //FIXME #define
6956         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
6957     } else if(sps->poc_type == 1){//FIXME #define
6958         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
6959         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
6960         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
6961         sps->poc_cycle_length= get_ue_golomb(&s->gb);
6962
6963         for(i=0; i<sps->poc_cycle_length; i++)
6964             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
6965     }
6966     if(sps->poc_type > 2){
6967         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
6968         return -1;
6969     }
6970
6971     sps->ref_frame_count= get_ue_golomb(&s->gb);
6972     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2){
6973         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
6974     }
6975     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
6976     sps->mb_width= get_ue_golomb(&s->gb) + 1;
6977     sps->mb_height= get_ue_golomb(&s->gb) + 1;
6978     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
6979        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height))
6980         return -1;
6981
6982     sps->frame_mbs_only_flag= get_bits1(&s->gb);
6983     if(!sps->frame_mbs_only_flag)
6984         sps->mb_aff= get_bits1(&s->gb);
6985     else
6986         sps->mb_aff= 0;
6987
6988     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
6989
6990     sps->crop= get_bits1(&s->gb);
6991     if(sps->crop){
6992         sps->crop_left  = get_ue_golomb(&s->gb);
6993         sps->crop_right = get_ue_golomb(&s->gb);
6994         sps->crop_top   = get_ue_golomb(&s->gb);
6995         sps->crop_bottom= get_ue_golomb(&s->gb);
6996         if(sps->crop_left || sps->crop_top){
6997             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
6998         }
6999     }else{
7000         sps->crop_left  =
7001         sps->crop_right =
7002         sps->crop_top   =
7003         sps->crop_bottom= 0;
7004     }
7005
7006     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7007     if( sps->vui_parameters_present_flag )
7008         decode_vui_parameters(h, sps);
7009
7010     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7011         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%d profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7012                sps_id, sps->profile_idc, sps->level_idc,
7013                sps->poc_type,
7014                sps->ref_frame_count,
7015                sps->mb_width, sps->mb_height,
7016                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7017                sps->direct_8x8_inference_flag ? "8B8" : "",
7018                sps->crop_left, sps->crop_right,
7019                sps->crop_top, sps->crop_bottom,
7020                sps->vui_parameters_present_flag ? "VUI" : ""
7021                );
7022     }
7023     return 0;
7024 }
7025
7026 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7027     MpegEncContext * const s = &h->s;
7028     int pps_id= get_ue_golomb(&s->gb);
7029     PPS *pps= &h->pps_buffer[pps_id];
7030
7031     pps->sps_id= get_ue_golomb(&s->gb);
7032     pps->cabac= get_bits1(&s->gb);
7033     pps->pic_order_present= get_bits1(&s->gb);
7034     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7035     if(pps->slice_group_count > 1 ){
7036         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7037         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7038         switch(pps->mb_slice_group_map_type){
7039         case 0:
7040 #if 0
7041 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7042 |    run_length[ i ]                                |1  |ue(v)   |
7043 #endif
7044             break;
7045         case 2:
7046 #if 0
7047 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7048 |{                                                  |   |        |
7049 |    top_left_mb[ i ]                               |1  |ue(v)   |
7050 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7051 |   }                                               |   |        |
7052 #endif
7053             break;
7054         case 3:
7055         case 4:
7056         case 5:
7057 #if 0
7058 |   slice_group_change_direction_flag               |1  |u(1)    |
7059 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7060 #endif
7061             break;
7062         case 6:
7063 #if 0
7064 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7065 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7066 |)                                                  |   |        |
7067 |    slice_group_id[ i ]                            |1  |u(v)    |
7068 #endif
7069             break;
7070         }
7071     }
7072     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7073     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7074     if(pps->ref_count[0] > 32 || pps->ref_count[1] > 32){
7075         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7076         return -1;
7077     }
7078
7079     pps->weighted_pred= get_bits1(&s->gb);
7080     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7081     pps->init_qp= get_se_golomb(&s->gb) + 26;
7082     pps->init_qs= get_se_golomb(&s->gb) + 26;
7083     pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
7084     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7085     pps->constrained_intra_pred= get_bits1(&s->gb);
7086     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7087
7088     if(get_bits_count(&s->gb) < bit_length){
7089         pps->transform_8x8_mode= get_bits1(&s->gb);
7090         if(get_bits1(&s->gb)){  //pic_scaling_matrix_present_flag
7091             av_log(h->s.avctx, AV_LOG_ERROR, "custom scaling matrix not implemented\n");
7092             return -1;
7093         }
7094         get_se_golomb(&s->gb);  //second_chroma_qp_index_offset
7095     }
7096
7097     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7098         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%d sps:%d %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s %s\n",
7099                pps_id, pps->sps_id,
7100                pps->cabac ? "CABAC" : "CAVLC",
7101                pps->slice_group_count,
7102                pps->ref_count[0], pps->ref_count[1],
7103                pps->weighted_pred ? "weighted" : "",
7104                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
7105                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7106                pps->constrained_intra_pred ? "CONSTR" : "",
7107                pps->redundant_pic_cnt_present ? "REDU" : "",
7108                pps->transform_8x8_mode ? "8x8DCT" : ""
7109                );
7110     }
7111
7112     return 0;
7113 }
7114
7115 /**
7116  * finds the end of the current frame in the bitstream.
7117  * @return the position of the first byte of the next frame, or -1
7118  */
7119 static int find_frame_end(H264Context *h, const uint8_t *buf, int buf_size){
7120     int i;
7121     uint32_t state;
7122     ParseContext *pc = &(h->s.parse_context);
7123 //printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
7124 //    mb_addr= pc->mb_addr - 1;
7125     state= pc->state;
7126     for(i=0; i<=buf_size; i++){
7127         if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
7128             tprintf("find_frame_end new startcode = %08x, frame_start_found = %d, pos = %d\n", state, pc->frame_start_found, i);
7129             if(pc->frame_start_found){
7130                 // If there isn't one more byte in the buffer
7131                 // the test on first_mb_in_slice cannot be done yet
7132                 // do it at next call.
7133                 if (i >= buf_size) break;
7134                 if (buf[i] & 0x80) {
7135                     // first_mb_in_slice is 0, probably the first nal of a new
7136                     // slice
7137                     tprintf("find_frame_end frame_end_found, state = %08x, pos = %d\n", state, i);
7138                     pc->state=-1;
7139                     pc->frame_start_found= 0;
7140                     return i-4;
7141                 }
7142             }
7143             pc->frame_start_found = 1;
7144         }
7145         if((state&0xFFFFFF1F) == 0x107 || (state&0xFFFFFF1F) == 0x108 || (state&0xFFFFFF1F) == 0x109){
7146            if(pc->frame_start_found){
7147                 pc->state=-1;
7148                 pc->frame_start_found= 0;
7149                 return i-4;
7150            }
7151         }
7152         if (i<buf_size)
7153             state= (state<<8) | buf[i];
7154     }
7155
7156     pc->state= state;
7157     return END_NOT_FOUND;
7158 }
7159
7160 static int h264_parse(AVCodecParserContext *s,
7161                       AVCodecContext *avctx,
7162                       uint8_t **poutbuf, int *poutbuf_size,
7163                       const uint8_t *buf, int buf_size)
7164 {
7165     H264Context *h = s->priv_data;
7166     ParseContext *pc = &h->s.parse_context;
7167     int next;
7168
7169     next= find_frame_end(h, buf, buf_size);
7170
7171     if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
7172         *poutbuf = NULL;
7173         *poutbuf_size = 0;
7174         return buf_size;
7175     }
7176
7177     *poutbuf = (uint8_t *)buf;
7178     *poutbuf_size = buf_size;
7179     return next;
7180 }
7181
7182 static int h264_split(AVCodecContext *avctx,
7183                       const uint8_t *buf, int buf_size)
7184 {
7185     int i;
7186     uint32_t state = -1;
7187     int has_sps= 0;
7188
7189     for(i=0; i<=buf_size; i++){
7190         if((state&0xFFFFFF1F) == 0x107)
7191             has_sps=1;
7192 /*        if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
7193         }*/
7194         if((state&0xFFFFFF00) == 0x100 && (state&0xFFFFFF1F) != 0x107 && (state&0xFFFFFF1F) != 0x108 && (state&0xFFFFFF1F) != 0x109){
7195             if(has_sps){
7196                 while(i>4 && buf[i-5]==0) i--;
7197                 return i-4;
7198             }
7199         }
7200         if (i<buf_size)
7201             state= (state<<8) | buf[i];
7202     }
7203     return 0;
7204 }
7205
7206
7207 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
7208     MpegEncContext * const s = &h->s;
7209     AVCodecContext * const avctx= s->avctx;
7210     int buf_index=0;
7211 #if 0
7212     int i;
7213     for(i=0; i<32; i++){
7214         printf("%X ", buf[i]);
7215     }
7216 #endif
7217     h->slice_num = 0;
7218     s->current_picture_ptr= NULL;
7219     for(;;){
7220         int consumed;
7221         int dst_length;
7222         int bit_length;
7223         uint8_t *ptr;
7224         int i, nalsize = 0;
7225
7226       if(h->is_avc) {
7227         if(buf_index >= buf_size) break;
7228         nalsize = 0;
7229         for(i = 0; i < h->nal_length_size; i++)
7230             nalsize = (nalsize << 8) | buf[buf_index++];
7231       } else {
7232         // start code prefix search
7233         for(; buf_index + 3 < buf_size; buf_index++){
7234             // this should allways succeed in the first iteration
7235             if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7236                 break;
7237         }
7238
7239         if(buf_index+3 >= buf_size) break;
7240
7241         buf_index+=3;
7242       }
7243
7244         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7245         if(ptr[dst_length - 1] == 0) dst_length--;
7246         bit_length= 8*dst_length - decode_rbsp_trailing(ptr + dst_length - 1);
7247
7248         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7249             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
7250         }
7251
7252         if (h->is_avc && (nalsize != consumed))
7253             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7254
7255         buf_index += consumed;
7256
7257         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0)
7258            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7259             continue;
7260
7261         switch(h->nal_unit_type){
7262         case NAL_IDR_SLICE:
7263             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7264         case NAL_SLICE:
7265             init_get_bits(&s->gb, ptr, bit_length);
7266             h->intra_gb_ptr=
7267             h->inter_gb_ptr= &s->gb;
7268             s->data_partitioning = 0;
7269
7270             if(decode_slice_header(h) < 0){
7271                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7272                 break;
7273             }
7274             if(h->redundant_pic_count==0 && s->hurry_up < 5
7275                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
7276                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
7277                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
7278                && avctx->skip_frame < AVDISCARD_ALL)
7279                 decode_slice(h);
7280             break;
7281         case NAL_DPA:
7282             init_get_bits(&s->gb, ptr, bit_length);
7283             h->intra_gb_ptr=
7284             h->inter_gb_ptr= NULL;
7285             s->data_partitioning = 1;
7286
7287             if(decode_slice_header(h) < 0){
7288                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7289             }
7290             break;
7291         case NAL_DPB:
7292             init_get_bits(&h->intra_gb, ptr, bit_length);
7293             h->intra_gb_ptr= &h->intra_gb;
7294             break;
7295         case NAL_DPC:
7296             init_get_bits(&h->inter_gb, ptr, bit_length);
7297             h->inter_gb_ptr= &h->inter_gb;
7298
7299             if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning
7300                && s->hurry_up < 5
7301                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
7302                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
7303                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
7304                && avctx->skip_frame < AVDISCARD_ALL)
7305                 decode_slice(h);
7306             break;
7307         case NAL_SEI:
7308             break;
7309         case NAL_SPS:
7310             init_get_bits(&s->gb, ptr, bit_length);
7311             decode_seq_parameter_set(h);
7312
7313             if(s->flags& CODEC_FLAG_LOW_DELAY)
7314                 s->low_delay=1;
7315
7316             if(avctx->has_b_frames < 2)
7317                 avctx->has_b_frames= !s->low_delay;
7318             break;
7319         case NAL_PPS:
7320             init_get_bits(&s->gb, ptr, bit_length);
7321
7322             decode_picture_parameter_set(h, bit_length);
7323
7324             break;
7325         case NAL_PICTURE_DELIMITER:
7326             break;
7327         case NAL_FILTER_DATA:
7328             break;
7329         default:
7330             av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
7331         }
7332     }
7333
7334     if(!s->current_picture_ptr) return buf_index; //no frame
7335
7336     s->current_picture_ptr->pict_type= s->pict_type;
7337     s->current_picture_ptr->key_frame= s->pict_type == I_TYPE && h->nal_unit_type == NAL_IDR_SLICE;
7338
7339     h->prev_frame_num_offset= h->frame_num_offset;
7340     h->prev_frame_num= h->frame_num;
7341     if(s->current_picture_ptr->reference){
7342         h->prev_poc_msb= h->poc_msb;
7343         h->prev_poc_lsb= h->poc_lsb;
7344     }
7345     if(s->current_picture_ptr->reference)
7346         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7347
7348     ff_er_frame_end(s);
7349
7350     MPV_frame_end(s);
7351
7352     return buf_index;
7353 }
7354
7355 /**
7356  * returns the number of bytes consumed for building the current frame
7357  */
7358 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7359     if(s->flags&CODEC_FLAG_TRUNCATED){
7360         pos -= s->parse_context.last_index;
7361         if(pos<0) pos=0; // FIXME remove (unneeded?)
7362
7363         return pos;
7364     }else{
7365         if(pos==0) pos=1; //avoid infinite loops (i doubt thats needed but ...)
7366         if(pos+10>buf_size) pos=buf_size; // oops ;)
7367
7368         return pos;
7369     }
7370 }
7371
7372 static int decode_frame(AVCodecContext *avctx,
7373                              void *data, int *data_size,
7374                              uint8_t *buf, int buf_size)
7375 {
7376     H264Context *h = avctx->priv_data;
7377     MpegEncContext *s = &h->s;
7378     AVFrame *pict = data;
7379     int buf_index;
7380
7381     s->flags= avctx->flags;
7382     s->flags2= avctx->flags2;
7383
7384    /* no supplementary picture */
7385     if (buf_size == 0) {
7386         return 0;
7387     }
7388
7389     if(s->flags&CODEC_FLAG_TRUNCATED){
7390         int next= find_frame_end(h, buf, buf_size);
7391
7392         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
7393             return buf_size;
7394 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7395     }
7396
7397     if(h->is_avc && !h->got_avcC) {
7398         int i, cnt, nalsize;
7399         unsigned char *p = avctx->extradata;
7400         if(avctx->extradata_size < 7) {
7401             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7402             return -1;
7403         }
7404         if(*p != 1) {
7405             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7406             return -1;
7407         }
7408         /* sps and pps in the avcC always have length coded with 2 bytes,
7409            so put a fake nal_length_size = 2 while parsing them */
7410         h->nal_length_size = 2;
7411         // Decode sps from avcC
7412         cnt = *(p+5) & 0x1f; // Number of sps
7413         p += 6;
7414         for (i = 0; i < cnt; i++) {
7415             nalsize = BE_16(p) + 2;
7416             if(decode_nal_units(h, p, nalsize) != nalsize) {
7417                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7418                 return -1;
7419             }
7420             p += nalsize;
7421         }
7422         // Decode pps from avcC
7423         cnt = *(p++); // Number of pps
7424         for (i = 0; i < cnt; i++) {
7425             nalsize = BE_16(p) + 2;
7426             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7427                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7428                 return -1;
7429             }
7430             p += nalsize;
7431         }
7432         // Now store right nal length size, that will be use to parse all other nals
7433         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7434         // Do not reparse avcC
7435         h->got_avcC = 1;
7436     }
7437
7438     if(!h->is_avc && s->avctx->extradata_size && s->picture_number==0){
7439         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7440             return -1;
7441     }
7442
7443     buf_index=decode_nal_units(h, buf, buf_size);
7444     if(buf_index < 0)
7445         return -1;
7446
7447     //FIXME do something with unavailable reference frames
7448
7449 //    if(ret==FRAME_SKIPPED) return get_consumed_bytes(s, buf_index, buf_size);
7450     if(!s->current_picture_ptr){
7451         av_log(h->s.avctx, AV_LOG_DEBUG, "error, NO frame\n");
7452         return -1;
7453     }
7454
7455     {
7456         Picture *out = s->current_picture_ptr;
7457 #if 0 //decode order
7458         *data_size = sizeof(AVFrame);
7459 #else
7460         /* Sort B-frames into display order */
7461         Picture *cur = s->current_picture_ptr;
7462         Picture *prev = h->delayed_output_pic;
7463         int out_idx = 0;
7464         int pics = 0;
7465         int out_of_order;
7466         int cross_idr = 0;
7467         int dropped_frame = 0;
7468         int i;
7469
7470         if(h->sps.bitstream_restriction_flag
7471            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7472             s->avctx->has_b_frames = h->sps.num_reorder_frames;
7473             s->low_delay = 0;
7474         }
7475
7476         while(h->delayed_pic[pics]) pics++;
7477         h->delayed_pic[pics++] = cur;
7478         if(cur->reference == 0)
7479             cur->reference = 1;
7480
7481         for(i=0; h->delayed_pic[i]; i++)
7482             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
7483                 cross_idr = 1;
7484
7485         out = h->delayed_pic[0];
7486         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7487             if(h->delayed_pic[i]->poc < out->poc){
7488                 out = h->delayed_pic[i];
7489                 out_idx = i;
7490             }
7491
7492         out_of_order = !cross_idr && prev && out->poc < prev->poc;
7493         if(prev && pics <= s->avctx->has_b_frames)
7494             out = prev;
7495         else if((out_of_order && pics-1 == s->avctx->has_b_frames)
7496            || (s->low_delay &&
7497             ((!cross_idr && prev && out->poc > prev->poc + 2)
7498              || cur->pict_type == B_TYPE)))
7499         {
7500             s->low_delay = 0;
7501             s->avctx->has_b_frames++;
7502             out = prev;
7503         }
7504         else if(out_of_order)
7505             out = prev;
7506
7507         if(out_of_order || pics > s->avctx->has_b_frames){
7508             dropped_frame = (out != h->delayed_pic[out_idx]);
7509             for(i=out_idx; h->delayed_pic[i]; i++)
7510                 h->delayed_pic[i] = h->delayed_pic[i+1];
7511         }
7512
7513         if(prev == out && !dropped_frame)
7514             *data_size = 0;
7515         else
7516             *data_size = sizeof(AVFrame);
7517         if(prev && prev != out && prev->reference == 1)
7518             prev->reference = 0;
7519         h->delayed_output_pic = out;
7520 #endif
7521
7522         *pict= *(AVFrame*)out;
7523     }
7524
7525     assert(pict->data[0]);
7526     ff_print_debug_info(s, pict);
7527 //printf("out %d\n", (int)pict->data[0]);
7528 #if 0 //?
7529
7530     /* Return the Picture timestamp as the frame number */
7531     /* we substract 1 because it is added on utils.c    */
7532     avctx->frame_number = s->picture_number - 1;
7533 #endif
7534     return get_consumed_bytes(s, buf_index, buf_size);
7535 }
#if 0
/**
 * Fills h->mb_avail[] for the current macroblock.
 * Slots 0..2 are the top-left, top and top-right neighbours, slot 3 the
 * left one; a neighbour counts as available when it lies inside the
 * picture and its slice_table entry equals the current slice_num.
 * Slots 4 and 5 are constants (1 and 0).
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;

    if(s->mb_y == 0){
        /* first row: there is nothing above */
        h->mb_avail[0]= 0;
        h->mb_avail[1]= 0;
        h->mb_avail[2]= 0;
    }else{
        const int top_xy= mb_xy - s->mb_stride;

        h->mb_avail[0]= s->mb_x != 0            && h->slice_table[top_xy - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[top_xy    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    }

    h->mb_avail[3]= s->mb_x != 0 && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7555
#if 0 //selftest
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Disabled self test: round-trips unsigned and signed exp-golomb codes,
 * measures the 4x4 DCT/IDCT reconstruction error and round-trips random
 * payloads through encode_nal()/decode_nal().
 *
 * @return 0 on success, -1 as soon as a NAL round trip fails
 *         (golomb mismatches are only printed)
 */
int main(){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
    DSPContext dsp;
    AVCodecContext avctx;

    dsputil_init(&dsp, &avctx);

    /* ---- unsigned exp golomb: write 0..COUNT-1, read them back ---- */
    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24); // raw bits, only used for the diagnostic

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    /* ---- signed exp golomb: write -COUNT/2..COUNT/2-1, read them back ---- */
    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* report the value that was really written, not the loop index */
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

    /* ---- 4x4 transform: diff+DCT, rescale, IDCT+add, compare with src ---- */
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }

        /* use the DSPContext initialized above; no MpegEncContext exists
           in this test */
        dsp.h264_idct_add(ref, block, 4);

        for(j=0; j<16; j++){
            int diff= ABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
#if 0
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
#endif
    /* ---- NAL layer: payload with i zero bytes must survive a round trip ---- */
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        /* start from a payload without any zero byte ... */
        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        /* ... then zero exactly `zeros` distinct positions */
        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL");

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("missmatch\n");
            return -1;
        }
    }

    printf("Testing RBSP\n");


    return 0;
}
#endif
7728
7729
7730 static int decode_end(AVCodecContext *avctx)
7731 {
7732     H264Context *h = avctx->priv_data;
7733     MpegEncContext *s = &h->s;
7734
7735     free_tables(h); //FIXME cleanup init stuff perhaps
7736     MPV_common_end(s);
7737
7738 //    memset(h, 0, sizeof(H264Context));
7739
7740     return 0;
7741 }
7742
7743
7744 AVCodec h264_decoder = {
7745     "h264",
7746     CODEC_TYPE_VIDEO,
7747     CODEC_ID_H264,
7748     sizeof(H264Context),
7749     decode_init,
7750     NULL,
7751     decode_end,
7752     decode_frame,
7753     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
7754     .flush= flush_dpb,
7755 };
7756
7757 AVCodecParser h264_parser = {
7758     { CODEC_ID_H264 },
7759     sizeof(H264Context),
7760     NULL,
7761     h264_parse,
7762     ff_parse_close,
7763     h264_split,
7764 };
7765
7766 #include "svq3.c"