/*
 * VP8 compatible video decoder
 *
 * Copyright (C) 2010 David Conrad
 * Copyright (C) 2010 Ronald S. Bultje
 * Copyright (C) 2010 Jason Garrett-Glaser
 * Copyright (C) 2012 Daniel Kang
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/imgutils.h"
#include "avcodec.h"
#include "internal.h"
#include "vp8.h"
#include "vp8data.h"
#include "rectangle.h"
#include "thread.h"

#if ARCH_ARM
#   include "arm/vp8.h"
#endif

static void free_buffers(VP8Context *s)
{
    int i;
    if (s->thread_data)
        for (i = 0; i < MAX_THREADS; i++) {
            av_freep(&s->thread_data[i].filter_strength);
            av_freep(&s->thread_data[i].edge_emu_buffer);
        }
    av_freep(&s->thread_data);
    av_freep(&s->macroblocks_base);
    av_freep(&s->intra4x4_pred_mode_top);
    av_freep(&s->top_nnz);
    av_freep(&s->top_border);

    s->macroblocks = NULL;
}

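/* Allocate a (possibly referenced) frame buffer through the thread-aware
 * getter and attach a refcounted segmentation map to it; the map is
 * released together with the frame. */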
static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
{
    int ret;
    if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
                                    ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
        return ret;
    if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
        ff_thread_release_buffer(s->avctx, &f->tf);
        return AVERROR(ENOMEM);
    }
    return 0;
}

static void vp8_release_frame(VP8Context *s, VP8Frame *f)
{
    av_buffer_unref(&f->seg_map);
    ff_thread_release_buffer(s->avctx, &f->tf);
}

static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
{
    int ret;

    vp8_release_frame(s, dst);

    if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
        return ret;
    if (src->seg_map &&
        !(dst->seg_map = av_buffer_ref(src->seg_map))) {
        vp8_release_frame(s, dst);
        return AVERROR(ENOMEM);
    }

    return 0;
}

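/* Release every frame and clear the frame pointers; with free_mem set, the
 * per-context buffers are freed as well (used when the dimensions change). */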
static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
{
    VP8Context *s = avctx->priv_data;
    int i;

    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
        vp8_release_frame(s, &s->frames[i]);
    memset(s->framep, 0, sizeof(s->framep));

    if (free_mem)
        free_buffers(s);
}

static void vp8_decode_flush(AVCodecContext *avctx)
{
    vp8_decode_flush_impl(avctx, 0);
}

static int update_dimensions(VP8Context *s, int width, int height)
{
    AVCodecContext *avctx = s->avctx;
    int i;

    if (width  != s->avctx->width ||
        height != s->avctx->height) {
        if (av_image_check_size(width, height, 0, s->avctx))
            return AVERROR_INVALIDDATA;

        vp8_decode_flush_impl(s->avctx, 1);

        avcodec_set_dimensions(s->avctx, width, height);
    }

    s->mb_width  = (s->avctx->coded_width +15) / 16;
    s->mb_height = (s->avctx->coded_height+15) / 16;

    s->mb_layout = (avctx->active_thread_type == FF_THREAD_SLICE) && (FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1);
    if (!s->mb_layout) { // Frame threading and one thread
        s->macroblocks_base       = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
        s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4);
    }
    else // Sliced threading
        s->macroblocks_base       = av_mallocz((s->mb_width+2)*(s->mb_height+2)*sizeof(*s->macroblocks));
    s->top_nnz     = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
    s->top_border  = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
    s->thread_data = av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));

    for (i = 0; i < MAX_THREADS; i++) {
        s->thread_data[i].filter_strength = av_mallocz(s->mb_width*sizeof(*s->thread_data[0].filter_strength));
#if HAVE_THREADS
        pthread_mutex_init(&s->thread_data[i].lock, NULL);
        pthread_cond_init(&s->thread_data[i].cond, NULL);
#endif
    }

    if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
        (!s->intra4x4_pred_mode_top && !s->mb_layout))
        return AVERROR(ENOMEM);

    s->macroblocks = s->macroblocks_base + 1;

    return 0;
}

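/* Parse the segmentation header: whether the segment map is updated this
 * frame, per-segment quantizer and loop-filter adjustments (absolute or
 * delta), and the tree probabilities used to code the segment map. */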
static void parse_segment_info(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i;

    s->segmentation.update_map = vp8_rac_get(c);

    if (vp8_rac_get(c)) { // update segment feature data
        s->segmentation.absolute_vals = vp8_rac_get(c);

        for (i = 0; i < 4; i++)
            s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);

        for (i = 0; i < 4; i++)
            s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
    }
    if (s->segmentation.update_map)
        for (i = 0; i < 3; i++)
            s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
}

static void update_lf_deltas(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i;

    for (i = 0; i < 4; i++) {
        if (vp8_rac_get(c)) {
            s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);

            if (vp8_rac_get(c))
                s->lf_delta.ref[i] = -s->lf_delta.ref[i];
        }
    }

    for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
        if (vp8_rac_get(c)) {
            s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);

            if (vp8_rac_get(c))
                s->lf_delta.mode[i] = -s->lf_delta.mode[i];
        }
    }
}

static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
{
    const uint8_t *sizes = buf;
    int i;

    s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);

    buf      += 3*(s->num_coeff_partitions-1);
    buf_size -= 3*(s->num_coeff_partitions-1);
    if (buf_size < 0)
        return -1;

    for (i = 0; i < s->num_coeff_partitions-1; i++) {
        int size = AV_RL24(sizes + 3*i);
        if (buf_size - size < 0)
            return -1;

        ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
        buf      += size;
        buf_size -= size;
    }
    ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);

    return 0;
}

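/* Read the quantizer indices: one base index for luma AC plus signed deltas
 * for the other plane/coefficient types, then expand them into per-segment
 * dequantization factors via the DC/AC lookup tables. */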
static void get_quants(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i, base_qi;

    int yac_qi     = vp8_rac_get_uint(c, 7);
    int ydc_delta  = vp8_rac_get_sint(c, 4);
    int y2dc_delta = vp8_rac_get_sint(c, 4);
    int y2ac_delta = vp8_rac_get_sint(c, 4);
    int uvdc_delta = vp8_rac_get_sint(c, 4);
    int uvac_delta = vp8_rac_get_sint(c, 4);

    for (i = 0; i < 4; i++) {
        if (s->segmentation.enabled) {
            base_qi = s->segmentation.base_quant[i];
            if (!s->segmentation.absolute_vals)
                base_qi += yac_qi;
        } else
            base_qi = yac_qi;

        s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
        s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
        s->qmat[i].luma_dc_qmul[0] = 2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
        /* 101581>>16 is equivalent to 155/100 */
        s->qmat[i].luma_dc_qmul[1] = (101581 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)]) >> 16;
        s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
        s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];

        s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
        s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
    }
}

/**
 * Determine which buffers golden and altref should be updated with after this frame.
 * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 *
 * Intra frames update all 3 references
 * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 * If the update (golden|altref) flag is set, it's updated with the current frame
 *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 * If the flag is not set, the number read means:
 *      0: no update
 *      1: VP56_FRAME_PREVIOUS
 *      2: update golden with altref, or update altref with golden
 */
static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
{
    VP56RangeCoder *c = &s->c;

    if (update)
        return VP56_FRAME_CURRENT;

    switch (vp8_rac_get_uint(c, 2)) {
    case 1:
        return VP56_FRAME_PREVIOUS;
    case 2:
        return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
    }
    return VP56_FRAME_NONE;
}

static void update_refs(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;

    int update_golden = vp8_rac_get(c);
    int update_altref = vp8_rac_get(c);

    s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
    s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
}

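/* Parse the frame tag, the keyframe-only start code and dimensions, and the
 * first ("header") partition: segmentation, loop filter, quantizers,
 * reference updates and probability updates. */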
static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int header_size, hscale, vscale, i, j, k, l, m, ret;
    int width  = s->avctx->width;
    int height = s->avctx->height;

    s->keyframe  = !(buf[0] & 1);
    s->profile   =  (buf[0]>>1) & 7;
    s->invisible = !(buf[0] & 0x10);
    header_size  = AV_RL24(buf) >> 5;
    buf      += 3;
    buf_size -= 3;

    if (s->profile > 3)
        av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);

    if (!s->profile)
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
    else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));

    if (header_size > buf_size - 7*s->keyframe) {
        av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
        return AVERROR_INVALIDDATA;
    }

    if (s->keyframe) {
        if (AV_RL24(buf) != 0x2a019d) {
            av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
            return AVERROR_INVALIDDATA;
        }
        width  = AV_RL16(buf+3) & 0x3fff;
        height = AV_RL16(buf+5) & 0x3fff;
        hscale = buf[4] >> 6;
        vscale = buf[6] >> 6;
        buf      += 7;
        buf_size -= 7;

        if (hscale || vscale)
            av_log_missing_feature(s->avctx, "Upscaling", 1);

        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        for (i = 0; i < 4; i++)
            for (j = 0; j < 16; j++)
                memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
                       sizeof(s->prob->token[i][j]));
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
        memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
    }

    ff_vp56_init_range_decoder(c, buf, header_size);
    buf      += header_size;
    buf_size -= header_size;

    if (s->keyframe) {
        if (vp8_rac_get(c))
            av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
        vp8_rac_get(c); // whether we can skip clamping in dsp functions
    }

    if ((s->segmentation.enabled = vp8_rac_get(c)))
        parse_segment_info(s);
    else
        s->segmentation.update_map = 0; // FIXME: move this to some init function?

    s->filter.simple    = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    if ((s->lf_delta.enabled = vp8_rac_get(c)))
        if (vp8_rac_get(c))
            update_lf_deltas(s);

    if (setup_partitions(s, buf, buf_size)) {
        av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
        return AVERROR_INVALIDDATA;
    }

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height) {
        if ((ret = update_dimensions(s, width, height)) < 0)
            return ret;
    }

    get_quants(s);

    if (!s->keyframe) {
        update_refs(s);
        s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
        s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
    }

    // if we aren't saving this frame's probabilities for future frames,
    // make a copy of the current probabilities
    if (!(s->update_probabilities = vp8_rac_get(c)))
        s->prob[1] = s->prob[0];

    s->update_last = s->keyframe || vp8_rac_get(c);

    for (i = 0; i < 4; i++)
        for (j = 0; j < 8; j++)
            for (k = 0; k < 3; k++)
                for (l = 0; l < NUM_DCT_TOKENS-1; l++)
                    if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
                        int prob = vp8_rac_get_uint(c, 8);
                        for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
                            s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
                    }

    if ((s->mbskip_enabled = vp8_rac_get(c)))
        s->prob->mbskip = vp8_rac_get_uint(c, 8);

    if (!s->keyframe) {
        s->prob->intra  = vp8_rac_get_uint(c, 8);
        s->prob->last   = vp8_rac_get_uint(c, 8);
        s->prob->golden = vp8_rac_get_uint(c, 8);

        if (vp8_rac_get(c))
            for (i = 0; i < 4; i++)
                s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
        if (vp8_rac_get(c))
            for (i = 0; i < 3; i++)
                s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);

        // 17.2 MV probability update
        for (i = 0; i < 2; i++)
            for (j = 0; j < 19; j++)
                if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
                    s->prob->mvc[i][j] = vp8_rac_get_nn(c);
    }

    return 0;
}

static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
{
    dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
    dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
}

/**
 * Motion vector coding, 17.1.
 */
static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
{
    int bit, x = 0;

    if (vp56_rac_get_prob_branchy(c, p[0])) {
        int i;

        for (i = 0; i < 3; i++)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        for (i = 9; i > 3; i--)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
            x += 8;
    } else {
        // small_mvtree
        const uint8_t *ps = p+2;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + 3*bit;
        x  += 4*bit;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + bit;
        x  += 2*bit;
        x  += vp56_rac_get_prob(c, *ps);
    }

    return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
}

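/* Choose the sub-MV mode probabilities from the left and above sub-MVs:
 * identical neighbours, a zero above-MV and the general case each select a
 * different entry in vp8_submv_prob. */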
static av_always_inline
const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
{
    if (left == top)
        return vp8_submv_prob[4-!!left];
    if (!top)
        return vp8_submv_prob[2];
    return vp8_submv_prob[1-!!left];
}

/**
 * Split motion vector prediction, 16.4.
 * @returns the number of motion vectors parsed (2, 4 or 16)
 */
static av_always_inline
int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int layout)
{
    int part_idx;
    int n, num;
    VP8Macroblock *top_mb;
    VP8Macroblock *left_mb = &mb[-1];
    const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
                  *mbsplits_top,
                  *mbsplits_cur, *firstidx;
    VP56mv *top_mv;
    VP56mv *left_mv = left_mb->bmv;
    VP56mv *cur_mv  = mb->bmv;

    if (!layout) // layout is inlined, s->mb_layout is not
        top_mb = &mb[2];
    else
        top_mb = &mb[-s->mb_width-1];
    mbsplits_top = vp8_mbsplits[top_mb->partitioning];
    top_mv = top_mb->bmv;

    if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
        if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
            part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
        } else {
            part_idx = VP8_SPLITMVMODE_8x8;
        }
    } else {
        part_idx = VP8_SPLITMVMODE_4x4;
    }

    num          = vp8_mbsplit_count[part_idx];
    mbsplits_cur = vp8_mbsplits[part_idx];
    firstidx     = vp8_mbfirstidx[part_idx];
    mb->partitioning = part_idx;

    for (n = 0; n < num; n++) {
        int k = firstidx[n];
        uint32_t left, above;
        const uint8_t *submv_prob;

        if (!(k & 3))
            left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
        else
            left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
        if (k <= 3)
            above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
        else
            above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);

        submv_prob = get_submv_prob(left, above);

        if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
            if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
                if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
                    mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
                    mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
                } else {
                    AV_ZERO32(&mb->bmv[n]);
                }
            } else {
                AV_WN32A(&mb->bmv[n], above);
            }
        } else {
            AV_WN32A(&mb->bmv[n], left);
        }
    }

    return num;
}

static av_always_inline
void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[3] = { 0      /* top */,
                                  mb - 1 /* left */,
                                  0      /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    if (!layout) { // layout is inlined (s->mb_layout is not)
        mb_edge[0] = mb + 2;
        mb_edge[2] = mb + 1;
    }
    else {
        mb_edge[0] = mb - s->mb_width-1;
        mb_edge[2] = mb - s->mb_width-2;
    }

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
#define MV_EDGE_CHECK(n)\
    {\
        VP8Macroblock *edge = mb_edge[n];\
        int edge_ref = edge->ref_frame;\
        if (edge_ref != VP56_FRAME_CURRENT) {\
            uint32_t mv = AV_RN32A(&edge->mv);\
            if (mv) {\
                if (cur_sign_bias != sign_bias[edge_ref]) {\
                    /* SWAR negate of the values in mv. */\
                    mv = ~mv;\
                    mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
                }\
                if (!n || mv != AV_RN32A(&near_mv[idx]))\
                    AV_WN32A(&near_mv[++idx], mv);\
                cnt[idx]      += 1 + (n != 2);\
            } else\
                cnt[CNT_ZERO] += 1 + (n != 2);\
        }\
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {

                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout) - 1];
                } else {
                    mb->mv.y  += read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x  += read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}

static av_always_inline
void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
                           int mb_x, int keyframe, int layout)
{
    uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;

    if (layout == 1) {
        VP8Macroblock *mb_top = mb - s->mb_width - 1;
        memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
    }
    if (keyframe) {
        int x, y;
        uint8_t* top;
        uint8_t* const left = s->intra4x4_pred_mode_left;
        if (layout == 1)
            top = mb->intra4x4_pred_mode_top;
        else
            top = s->intra4x4_pred_mode_top + 4 * mb_x;
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                const uint8_t *ctx;
                ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
                *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
                left[y] = top[x] = *intra4x4;
                intra4x4++;
            }
        }
    } else {
        int i;
        for (i = 0; i < 16; i++)
            intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
    }
}

static av_always_inline
void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
                    uint8_t *segment, uint8_t *ref, int layout)
{
    VP56RangeCoder *c = &s->c;

    if (s->segmentation.update_map)
        *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
    else if (s->segmentation.enabled)
        *segment = ref ? *ref : *segment;
    mb->segment = *segment;

    mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;

    if (s->keyframe) {
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);

        if (mb->mode == MODE_I4x4) {
            decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
        } else {
            const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
            if (s->mb_layout == 1)
                AV_WN32A(mb->intra4x4_pred_mode_top, modes);
            else
                AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
            AV_WN32A( s->intra4x4_pred_mode_left, modes);
        }

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
        mb->ref_frame = VP56_FRAME_CURRENT;
    } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
        // inter MB, 16.2
        if (vp56_rac_get_prob_branchy(c, s->prob->last))
            mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
                VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
        else
            mb->ref_frame = VP56_FRAME_PREVIOUS;
        s->ref_count[mb->ref_frame-1]++;

        // motion vectors, 16.3
        decode_mvs(s, mb, mb_x, mb_y, layout);
    } else {
        // intra MB, 16.1
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);

        if (mb->mode == MODE_I4x4)
            decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
        mb->ref_frame        = VP56_FRAME_CURRENT;
        mb->partitioning     = VP8_SPLITMVMODE_NONE;
        AV_ZERO32(&mb->bmv[0]);
    }
}

#ifndef decode_block_coeffs_internal
/**
 * @param r arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
                                        uint8_t probs[16][3][NUM_DCT_TOKENS-1],
                                        int i, uint8_t *token_prob, int16_t qmul[2])
{
    VP56RangeCoder c = *r;
    goto skip_eob;
    do {
        int coeff;
        if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
            break;

skip_eob:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
            if (++i == 16)
                break; // invalid input; blocks should end with EOB
            token_prob = probs[i][0];
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
            coeff = 1;
            token_prob = probs[i+1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(&c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
                        coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
                    } else {                                             // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    int a = vp56_rac_get_prob(&c, token_prob[8]);
                    int b = vp56_rac_get_prob(&c, token_prob[9+a]);
                    int cat = (a<<1) + b;
                    coeff  = 3 + (8<<cat);
                    coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            token_prob = probs[i+1][2];
        }
        block[zigzag_scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    *r = c;
    return i;
}
#endif

/**
 * @param c arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param zero_nhood the initial prediction context for number of surrounding
 *                   all-zero blocks (only left/top, so 0-2)
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static av_always_inline
int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
                        uint8_t probs[16][3][NUM_DCT_TOKENS-1],
                        int i, int zero_nhood, int16_t qmul[2])
{
    uint8_t *token_prob = probs[i][zero_nhood];
    if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
        return 0;
    return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
}

static av_always_inline
void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c, VP8Macroblock *mb,
                      uint8_t t_nnz[9], uint8_t l_nnz[9])
{
    int i, x, y, luma_start = 0, luma_ctx = 3;
    int nnz_pred, nnz, nnz_total = 0;
    int segment = mb->segment;
    int block_dc = 0;

    if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
        nnz_pred = t_nnz[8] + l_nnz[8];

        // decode DC values and do hadamard
        nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0, nnz_pred,
                                  s->qmat[segment].luma_dc_qmul);
        l_nnz[8] = t_nnz[8] = !!nnz;
        if (nnz) {
            nnz_total += nnz;
            block_dc = 1;
            if (nnz == 1)
                s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
            else
                s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
        }
        luma_start = 1;
        luma_ctx = 0;
    }

    // luma blocks
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            nnz_pred = l_nnz[y] + t_nnz[x];
            nnz = decode_block_coeffs(c, td->block[y][x], s->prob->token[luma_ctx], luma_start,
                                      nnz_pred, s->qmat[segment].luma_qmul);
            // nnz+block_dc may be one more than the actual last index, but we don't care
            td->non_zero_count_cache[y][x] = nnz + block_dc;
            t_nnz[x] = l_nnz[y] = !!nnz;
            nnz_total += nnz;
        }

    // chroma blocks
    // TODO: what to do about dimensions? 2nd dim for luma is x,
    // but for chroma it's (y<<1)|x
    for (i = 4; i < 6; i++)
        for (y = 0; y < 2; y++)
            for (x = 0; x < 2; x++) {
                nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
                nnz = decode_block_coeffs(c, td->block[i][(y<<1)+x], s->prob->token[2], 0,
                                          nnz_pred, s->qmat[segment].chroma_qmul);
                td->non_zero_count_cache[i][(y<<1)+x] = nnz;
                t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
                nnz_total += nnz;
            }

    // if there were no coded coeffs despite the macroblock not being marked skip,
    // we MUST not do the inner loop filter and should not do IDCT
    // Since skip isn't used for bitstream prediction, just manually set it.
    if (!nnz_total)
        mb->skip = 1;
}

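/* Save the bottom line of the macroblock (and of the chroma blocks for the
 * normal filter) into top_border, so the row below can still use it for
 * intra prediction after deblocking has modified these pixels. */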
static av_always_inline
void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
                      int linesize, int uvlinesize, int simple)
{
    AV_COPY128(top_border, src_y + 15*linesize);
    if (!simple) {
        AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
        AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
    }
}

static av_always_inline
void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
                    int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
                    int simple, int xchg)
{
    uint8_t *top_border_m1 = top_border-32;     // for TL prediction
    src_y  -=   linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

#define XCHG(a,b,xchg) do {          \
        if (xchg) AV_SWAP64(b,a);    \
        else      AV_COPY64(b,a);    \
    } while (0)

    XCHG(top_border_m1+8, src_y-8, xchg);
    XCHG(top_border,      src_y,   xchg);
    XCHG(top_border+8,    src_y+8, 1);
    if (mb_x < mb_width-1)
        XCHG(top_border+32, src_y+16, 1);

    // only copy chroma for normal loop filter
    // or to initialize the top row to 127
    if (!simple || !mb_y) {
        XCHG(top_border_m1+16, src_cb-8, xchg);
        XCHG(top_border_m1+24, src_cr-8, xchg);
        XCHG(top_border+16, src_cb, 1);
        XCHG(top_border+24, src_cr, 1);
    }
}

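/* The check_*_pred*_mode helpers below substitute intra prediction modes that
 * would read pixels outside the picture with edge-safe equivalents
 * (DC_127/DC_128/DC_129 variants or single-direction prediction) on the
 * frame borders. */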
static av_always_inline
int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x) {
        return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
    } else {
        return mb_y ? mode : LEFT_DC_PRED8x8;
    }
}

static av_always_inline
int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x) {
        return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
    } else {
        return mb_y ? mode : HOR_PRED8x8;
    }
}

static av_always_inline
int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
{
    if (mode == DC_PRED8x8) {
        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
    } else {
        return mode;
    }
}

static av_always_inline
int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
{
    switch (mode) {
    case DC_PRED8x8:
        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
    case VERT_PRED8x8:
        return !mb_y ? DC_127_PRED8x8 : mode;
    case HOR_PRED8x8:
        return !mb_x ? DC_129_PRED8x8 : mode;
    case PLANE_PRED8x8 /*TM*/:
        return check_tm_pred8x8_mode(mode, mb_x, mb_y);
    }
    return mode;
}

static av_always_inline
int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x) {
        return mb_y ? VERT_VP8_PRED : DC_129_PRED;
    } else {
        return mb_y ? mode : HOR_VP8_PRED;
    }
}

static av_always_inline
int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
{
    switch (mode) {
    case VERT_PRED:
        if (!mb_x && mb_y) {
            *copy_buf = 1;
            return mode;
        }
        /* fall-through */
    case DIAG_DOWN_LEFT_PRED:
    case VERT_LEFT_PRED:
        return !mb_y ? DC_127_PRED : mode;
    case HOR_PRED:
        if (!mb_y) {
            *copy_buf = 1;
            return mode;
        }
        /* fall-through */
    case HOR_UP_PRED:
        return !mb_x ? DC_129_PRED : mode;
    case TM_VP8_PRED:
        return check_tm_pred4x4_mode(mode, mb_x, mb_y);
    case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
    case DIAG_DOWN_RIGHT_PRED:
    case VERT_RIGHT_PRED:
    case HOR_DOWN_PRED:
        if (!mb_y || !mb_x)
            *copy_buf = 1;
        return mode;
    }
    return mode;
}

static av_always_inline
void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                   VP8Macroblock *mb, int mb_x, int mb_y)
{
    AVCodecContext *avctx = s->avctx;
    int x, y, mode, nnz;
    uint32_t tr;

    // for the first row, we need to run xchg_mb_border to init the top edge to 127
    // otherwise, skip it if we aren't going to deblock
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 1);

    if (mb->mode < MODE_I4x4) {
        if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
            mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
        } else {
            mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
        }
        s->hpc.pred16x16[mode](dst[0], s->linesize);
    } else {
        uint8_t *ptr = dst[0];
        uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
        uint8_t tr_top[4] = { 127, 127, 127, 127 };

        // all blocks on the right edge of the macroblock use the bottom edge of
        // the top macroblock for their topright edge
        uint8_t *tr_right = ptr - s->linesize + 16;

        // if we're on the right edge of the frame, said edge is extended
        // from the top macroblock
        if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
            mb_x == s->mb_width-1) {
            tr = tr_right[-1]*0x01010101u;
            tr_right = (uint8_t *)&tr;
        }

        if (mb->skip)
            AV_ZERO128(td->non_zero_count_cache);

        for (y = 0; y < 4; y++) {
            uint8_t *topright = ptr + 4 - s->linesize;
            for (x = 0; x < 4; x++) {
                int copy = 0, linesize = s->linesize;
                uint8_t *dst = ptr+4*x;
                DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];

                if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
                    topright = tr_top;
                } else if (x == 3)
                    topright = tr_right;

                if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
                    mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
                    if (copy) {
                        dst = copy_dst + 12;
                        linesize = 8;
                        if (!(mb_y + y)) {
                            copy_dst[3] = 127U;
                            AV_WN32A(copy_dst+4, 127U * 0x01010101U);
                        } else {
                            AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
                            if (!(mb_x + x)) {
                                copy_dst[3] = 129U;
                            } else {
                                copy_dst[3] = ptr[4*x-s->linesize-1];
                            }
                        }
                        if (!(mb_x + x)) {
                            copy_dst[11] =
                            copy_dst[19] =
                            copy_dst[27] =
                            copy_dst[35] = 129U;
                        } else {
                            copy_dst[11] = ptr[4*x              -1];
                            copy_dst[19] = ptr[4*x+s->linesize  -1];
                            copy_dst[27] = ptr[4*x+s->linesize*2-1];
                            copy_dst[35] = ptr[4*x+s->linesize*3-1];
                        }
                    }
                } else {
                    mode = intra4x4[x];
                }
                s->hpc.pred4x4[mode](dst, topright, linesize);
                if (copy) {
                    AV_COPY32(ptr+4*x              , copy_dst+12);
                    AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
                    AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
                    AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
                }

                nnz = td->non_zero_count_cache[y][x];
                if (nnz) {
                    if (nnz == 1)
                        s->vp8dsp.vp8_idct_dc_add(ptr+4*x, td->block[y][x], s->linesize);
                    else
                        s->vp8dsp.vp8_idct_add(ptr+4*x, td->block[y][x], s->linesize);
                }
                topright += 4;
            }

            ptr      += 4*s->linesize;
            intra4x4 += 4;
        }
    }

    if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
        mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode, mb_x, mb_y);
    } else {
        mode = check_intra_pred8x8_mode(mb->chroma_pred_mode, mb_x, mb_y);
    }
    s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
    s->hpc.pred8x8[mode](dst[2], s->uvlinesize);

    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 0);
}

static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
                                // also function pointer index
    { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
    { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
};

/**
 * luma MC function
 *
 * @param s VP8 decoding context
 * @param dst target buffer for block data at block position
 * @param ref reference picture buffer at origin (0, 0)
 * @param mv motion vector (relative to block position) to get pixel data from
 * @param x_off horizontal position of block from origin (0, 0)
 * @param y_off vertical position of block from origin (0, 0)
 * @param block_w width of block (16, 8 or 4)
 * @param block_h height of block (always same as block_w)
 * @param width width of src/dst plane data
 * @param height height of src/dst plane data
 * @param linesize size of a single line of plane data, including padding
 * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
 */
static av_always_inline
void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
                 ThreadFrame *ref, const VP56mv *mv,
                 int x_off, int y_off, int block_w, int block_h,
                 int width, int height, int linesize,
                 vp8_mc_func mc_func[3][3])
{
    uint8_t *src = ref->f->data[0];

    if (AV_RN32A(mv)) {

        int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
        int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 2;
        y_off += mv->y >> 2;

        // edge emulation
        ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
        src += y_off * linesize + x_off;
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                     x_off - mx_idx, y_off - my_idx, width, height);
            src = td->edge_emu_buffer + mx_idx + linesize * my_idx;
        }
        mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
    } else {
        ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
        mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
    }
}

/**
 * chroma MC function
 *
 * @param s VP8 decoding context
 * @param dst1 target buffer for block data at block position (U plane)
 * @param dst2 target buffer for block data at block position (V plane)
 * @param ref reference picture buffer at origin (0, 0)
 * @param mv motion vector (relative to block position) to get pixel data from
 * @param x_off horizontal position of block from origin (0, 0)
 * @param y_off vertical position of block from origin (0, 0)
 * @param block_w width of block (16, 8 or 4)
 * @param block_h height of block (always same as block_w)
 * @param width width of src/dst plane data
 * @param height height of src/dst plane data
 * @param linesize size of a single line of plane data, including padding
 * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
 */
static av_always_inline
void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1, uint8_t *dst2,
                   ThreadFrame *ref, const VP56mv *mv, int x_off, int y_off,
                   int block_w, int block_h, int width, int height, int linesize,
                   vp8_mc_func mc_func[3][3])
{
    uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];

    if (AV_RN32A(mv)) {
        int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
        int my = mv->y&7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 3;
        y_off += mv->y >> 3;

        // edge emulation
        src1 += y_off * linesize + x_off;
        src2 += y_off * linesize + x_off;
        ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                     x_off - mx_idx, y_off - my_idx, width, height);
            src1 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);

            s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                     x_off - mx_idx, y_off - my_idx, width, height);
            src2 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
        } else {
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
        }
    } else {
        ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
        mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
        mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
    }
}

static av_always_inline
void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                 ThreadFrame *ref_frame, int x_off, int y_off,
                 int bx_off, int by_off,
                 int block_w, int block_h,
                 int width, int height, VP56mv *mv)
{
    VP56mv uvmv = *mv;

    /* Y */
    vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
                ref_frame, mv, x_off + bx_off, y_off + by_off,
                block_w, block_h, width, height, s->linesize,
                s->put_pixels_tab[block_w == 8]);

    /* U/V */
    if (s->profile == 3) {
        uvmv.x &= ~7;
        uvmv.y &= ~7;
    }
    x_off   >>= 1; y_off   >>= 1;
    bx_off  >>= 1; by_off  >>= 1;
    width   >>= 1; height  >>= 1;
    block_w >>= 1; block_h >>= 1;
    vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
                  dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
                  &uvmv, x_off + bx_off, y_off + by_off,
                  block_w, block_h, width, height, s->uvlinesize,
                  s->put_pixels_tab[1 + (block_w == 4)]);
}

/* Fetch pixels for estimated mv 4 macroblocks ahead.
 * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
{
    /* Don't prefetch refs that haven't been used very often this frame. */
    if (s->ref_count[ref-1] > (mb_xy >> 5)) {
        int x_off = mb_x << 4, y_off = mb_y << 4;
        int mx = (mb->mv.x>>2) + x_off + 8;
        int my = (mb->mv.y>>2) + y_off;
        uint8_t **src = s->framep[ref]->tf.f->data;
        int off = mx + (my + (mb_x&3)*4)*s->linesize + 64;
        /* For threading, a ff_thread_await_progress here might be useful, but
         * it actually slows down the decoder.  Since a bad prefetch doesn't
         * generate bad decoder output, we don't run it here. */
        s->vdsp.prefetch(src[0]+off, s->linesize, 4);
        off = (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
        s->vdsp.prefetch(src[1]+off, src[2]-src[1], 2);
    }
}

/**
 * Apply motion vectors to prediction buffer, chapter 18.
 */
static av_always_inline
void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                   VP8Macroblock *mb, int mb_x, int mb_y)
{
    int x_off = mb_x << 4, y_off = mb_y << 4;
    int width = 16*s->mb_width, height = 16*s->mb_height;
    ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
    VP56mv *bmv = mb->bmv;

    switch (mb->partitioning) {
    case VP8_SPLITMVMODE_NONE:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 16, 16, width, height, &mb->mv);
        break;
    case VP8_SPLITMVMODE_4x4: {
        int x, y;
        VP56mv uvmv;

        /* Y */
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                vp8_mc_luma(s, td, dst[0] + 4*y*s->linesize + x*4,
                            ref, &bmv[4*y + x],
                            4*x + x_off, 4*y + y_off, 4, 4,
                            width, height, s->linesize,
                            s->put_pixels_tab[2]);
            }
        }

        /* U/V */
        x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
        for (y = 0; y < 2; y++) {
            for (x = 0; x < 2; x++) {
                uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
                         mb->bmv[ 2*y    * 4 + 2*x+1].x +
                         mb->bmv[(2*y+1) * 4 + 2*x  ].x +
                         mb->bmv[(2*y+1) * 4 + 2*x+1].x;
                uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
                         mb->bmv[ 2*y    * 4 + 2*x+1].y +
                         mb->bmv[(2*y+1) * 4 + 2*x  ].y +
                         mb->bmv[(2*y+1) * 4 + 2*x+1].y;
                uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
                uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
                if (s->profile == 3) {
                    uvmv.x &= ~7;
                    uvmv.y &= ~7;
                }
                vp8_mc_chroma(s, td, dst[1] + 4*y*s->uvlinesize + x*4,
                              dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
                              4*x + x_off, 4*y + y_off, 4, 4,
                              width, height, s->uvlinesize,
                              s->put_pixels_tab[2]);
            }
        }
        break;
    }
    case VP8_SPLITMVMODE_16x8:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 16, 8, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 8, 16, 8, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x16:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 8, 16, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 0, 8, 16, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x8:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 8, 8, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 0, 8, 8, width, height, &bmv[1]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 8, 8, 8, width, height, &bmv[2]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 8, 8, 8, width, height, &bmv[3]);
        break;
    }
}

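/* Add the inverse-transformed residual to the prediction. nnz4 packs four
 * per-block non-zero counts into one 32-bit load: zero means the whole group
 * of blocks is skipped, and when every count is at most one (DC-only) the
 * dc_add4 fast path handles all four blocks at once. */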
static av_always_inline void idct_mb(VP8Context *s, VP8ThreadData *td,
                                     uint8_t *dst[3], VP8Macroblock *mb)
{
    int x, y, ch;

    if (mb->mode != MODE_I4x4) {
        uint8_t *y_dst = dst[0];
        for (y = 0; y < 4; y++) {
            uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
            if (nnz4) {
                if (nnz4&~0x01010101) {
                    for (x = 0; x < 4; x++) {
                        if ((uint8_t)nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, td->block[y][x], s->linesize);
                        else if((uint8_t)nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(y_dst+4*x, td->block[y][x], s->linesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            break;
                    }
                } else {
                    s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
                }
            }
            y_dst += 4*s->linesize;
        }
    }

    for (ch = 0; ch < 2; ch++) {
        uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4+ch]);
        if (nnz4) {
            uint8_t *ch_dst = dst[1+ch];
            if (nnz4&~0x01010101) {
                for (y = 0; y < 2; y++) {
                    for (x = 0; x < 2; x++) {
                        if ((uint8_t)nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
                        else if((uint8_t)nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            goto chroma_idct_end;
                    }
                    ch_dst += 4*s->uvlinesize;
                }
            } else {
                s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4+ch], s->uvlinesize);
            }
        }
chroma_idct_end: ;
    }
}

static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f)
{
    int interior_limit, filter_level;

    if (s->segmentation.enabled) {
        filter_level = s->segmentation.filter_level[mb->segment];
        if (!s->segmentation.absolute_vals)
            filter_level += s->filter.level;
    } else
        filter_level = s->filter.level;

    if (s->lf_delta.enabled) {
        filter_level += s->lf_delta.ref[mb->ref_frame];
        filter_level += s->lf_delta.mode[mb->mode];
    }

    filter_level = av_clip_uintp2(filter_level, 6);

    interior_limit = filter_level;
    if (s->filter.sharpness) {
        interior_limit >>= (s->filter.sharpness + 3) >> 2;
        interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
    }
    interior_limit = FFMAX(interior_limit, 1);

    f->filter_level = filter_level;
    f->inner_limit  = interior_limit;
    f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
}

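/* Apply the normal loop filter to one macroblock: the outer macroblock edges
 * use the stronger mbedge limit, inner 4x4 edges the weaker bedge limit, and
 * the high-edge-variance threshold comes from a per-filter-level lookup. */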
static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
{
    int mbedge_lim, bedge_lim, hev_thresh;
    int filter_level = f->filter_level;
    int inner_limit  = f->inner_limit;
    int inner_filter = f->inner_filter;
    int linesize     = s->linesize;
    int uvlinesize   = s->uvlinesize;
    static const uint8_t hev_thresh_lut[2][64] = {
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
          3, 3, 3, 3 },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2 }
    };

    if (!filter_level)
        return;

    bedge_lim  = 2*filter_level + inner_limit;
    mbedge_lim = bedge_lim + 4;

    hev_thresh = hev_thresh_lut[s->keyframe][filter_level];

    if (mb_x) {
        s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    if (inner_filter) {
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
                                             uvlinesize, bedge_lim,
                                             inner_limit, hev_thresh);
    }

    if (mb_y) {
        s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    if (inner_filter) {
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
                                             linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
                                             linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
                                             linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
                                             dst[2] + 4 * uvlinesize,
                                             uvlinesize, bedge_lim,
                                             inner_limit, hev_thresh);
    }
}

static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
{
    int mbedge_lim, bedge_lim;
    int filter_level = f->filter_level;
    int inner_limit  = f->inner_limit;
    int inner_filter = f->inner_filter;
    int linesize     = s->linesize;

    if (!filter_level)
        return;

    bedge_lim  = 2*filter_level + inner_limit;
    mbedge_lim = bedge_lim + 4;

    if (mb_x)
        s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
    if (inner_filter) {
        s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
        s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
        s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
    }

    if (mb_y)
        s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
    if (inner_filter) {
        s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
        s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
        s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
    }
}

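/* With the sliced-threading macroblock layout (layout == 1), modes and motion
 * vectors for the whole frame are decoded in a single up-front pass here,
 * using the same per-row MARGIN-clamped MV ranges as the row decoders. */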
#define MARGIN (16 << 2)
static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
                                   VP8Frame *prev_frame)
{
    VP8Context *s = avctx->priv_data;
    int mb_x, mb_y;

    s->mv_min.y = -MARGIN;
    s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
    for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
        VP8Macroblock *mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
        int mb_xy = mb_y*s->mb_width;

        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);

        s->mv_min.x = -MARGIN;
        s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
        for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
            if (mb_y == 0)
                AV_WN32A((mb-s->mb_width-1)->intra4x4_pred_mode_top, DC_PRED*0x01010101);
            decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
                           prev_frame && prev_frame->seg_map ?
                           prev_frame->seg_map->data + mb_xy : NULL, 1);
            s->mv_min.x -= 64;
            s->mv_max.x -= 64;
        }
        s->mv_min.y -= 64;
        s->mv_max.y -= 64;
    }
}

#if HAVE_THREADS
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)\
    do {\
        int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);\
        if (otd->thread_mb_pos < tmp) {\
            pthread_mutex_lock(&otd->lock);\
            td->wait_mb_pos = tmp;\
            do {\
                if (otd->thread_mb_pos >= tmp)\
                    break;\
                pthread_cond_wait(&otd->cond, &otd->lock);\
            } while (1);\
            td->wait_mb_pos = INT_MAX;\
            pthread_mutex_unlock(&otd->lock);\
        }\
    } while(0);

#define update_pos(td, mb_y, mb_x)\
    do {\
        int pos              = (mb_y << 16) | (mb_x & 0xFFFF);\
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && (num_jobs > 1);\
        int is_null          = (next_td == NULL) || (prev_td == NULL);\
        int pos_check        = (is_null) ? 1 :\
                                (next_td != td && pos >= next_td->wait_mb_pos) ||\
                                (prev_td != td && pos >= prev_td->wait_mb_pos);\
        td->thread_mb_pos = pos;\
        if (sliced_threading && pos_check) {\
            pthread_mutex_lock(&td->lock);\
            pthread_cond_broadcast(&td->cond);\
            pthread_mutex_unlock(&td->lock);\
        }\
    } while(0);
#else
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)
#define update_pos(td, mb_y, mb_x)
#endif

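/* Decode one macroblock row without loop filtering. Under sliced threading,
 * rows are interleaved across threads, so check_thread_pos blocks until the
 * thread decoding the row above has advanced far enough to the right. */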
static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                                        int jobnr, int threadnr)
{
    VP8Context *s = avctx->priv_data;
    VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
    int mb_y = td->thread_mb_pos>>16;
    int i, y, mb_x, mb_xy = mb_y*s->mb_width;
    int num_jobs = s->num_jobs;
    VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
    VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
    VP8Macroblock *mb;
    uint8_t *dst[3] = {
        curframe->tf.f->data[0] + 16*mb_y*s->linesize,
        curframe->tf.f->data[1] +  8*mb_y*s->uvlinesize,
        curframe->tf.f->data[2] +  8*mb_y*s->uvlinesize
    };
    if (mb_y == 0) prev_td = td;
    else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
    if (mb_y == s->mb_height-1) next_td = td;
    else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
    if (s->mb_layout == 1)
        mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
    else {
        mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
        memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
    }

    memset(td->left_nnz, 0, sizeof(td->left_nnz));
    // left edge of 129 for intra prediction
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
        for (i = 0; i < 3; i++)
            for (y = 0; y < 16>>!!i; y++)
                dst[i][y*curframe->tf.f->linesize[i]-1] = 129;
        if (mb_y == 1) {
            s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
        }
    }

    s->mv_min.x = -MARGIN;
    s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;

    for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
        // Wait for previous thread to read mb_x+2, and reach mb_y-1.
        if (prev_td != td) {
            if (threadnr != 0) {
                check_thread_pos(td, prev_td, mb_x+1, mb_y-1);
            } else {
                check_thread_pos(td, prev_td, (s->mb_width+3) + (mb_x+1), mb_y-1);
            }
        }

        s->vdsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
        s->vdsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);

        if (!s->mb_layout)
            decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
                           prev_frame && prev_frame->seg_map ?
                           prev_frame->seg_map->data + mb_xy : NULL, 0);

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);

        if (!mb->skip)
            decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz);

        if (mb->mode <= MODE_I4x4)
            intra_predict(s, td, dst, mb, mb_x, mb_y);
        else
            inter_predict(s, td, dst, mb, mb_x, mb_y);

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);

        if (!mb->skip) {
            idct_mb(s, td, dst, mb);
        } else {
            AV_ZERO64(td->left_nnz);
            AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned

            // Reset DC block predictors if they would exist if the mb had coefficients
            if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
                td->left_nnz[8]     = 0;
                s->top_nnz[mb_x][8] = 0;
            }
        }

        if (s->deblock_filter)
            filter_level_for_mb(s, mb, &td->filter_strength[mb_x]);

        if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs-1) {
            if (s->filter.simple)
                backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
            else
                backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
        }

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);

        dst[0] += 16;
        dst[1] += 8;
        dst[2] += 8;
        s->mv_min.x -= 64;
        s->mv_max.x -= 64;

        if (mb_x == s->mb_width+1) {
            update_pos(td, mb_y, s->mb_width+3);
        } else {
            update_pos(td, mb_y, mb_x);
        }
    }
}

1766 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
1767 int jobnr, int threadnr)
1769 VP8Context *s = avctx->priv_data;
1770 VP8ThreadData *td = &s->thread_data[threadnr];
1771 int mb_x, mb_y = td->thread_mb_pos>>16, num_jobs = s->num_jobs;
1772 AVFrame *curframe = s->curframe->tf.f;
1773 VP8Macroblock *mb;
1774 VP8ThreadData *prev_td, *next_td;
1775 uint8_t *dst[3] = {
1776 curframe->data[0] + 16*mb_y*s->linesize,
1777 curframe->data[1] + 8*mb_y*s->uvlinesize,
1778 curframe->data[2] + 8*mb_y*s->uvlinesize

    if (s->mb_layout == 1)
        mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
    else
        mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;

    if (mb_y == 0) prev_td = td;
    else           prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
    if (mb_y == s->mb_height-1) next_td = td;
    else                        next_td = &s->thread_data[(jobnr + 1)%num_jobs];
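
    /* Filtering row mb_y modifies border pixels that row mb_y+1 still reads
     * for prediction, and reads pixels that row mb_y-1's filter pass
     * modifies, so each iteration below waits until the neighbouring rows'
     * threads have progressed far enough. */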
    for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
        VP8FilterStrength *f = &td->filter_strength[mb_x];
        if (prev_td != td) {
            check_thread_pos(td, prev_td, (mb_x+1) + (s->mb_width+3), mb_y-1);
        }
        if (next_td != td)
            if (next_td != &s->thread_data[0]) {
                check_thread_pos(td, next_td, mb_x+1, mb_y+1);
            }

        if (num_jobs == 1) {
            if (s->filter.simple)
                backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
            else
                backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
        }

        if (s->filter.simple)
            filter_mb_simple(s, dst[0], f, mb_x, mb_y);
        else
            filter_mb(s, dst, f, mb_x, mb_y);
        dst[0] += 16;
        dst[1] += 8;
        dst[2] += 8;

        update_pos(td, mb_y, (s->mb_width+3) + mb_x);
    }
}
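
/* Entry point for one slice-threading job: job jobnr decodes (and, when the
 * loop filter is enabled, filters) macroblock rows jobnr, jobnr + num_jobs,
 * jobnr + 2*num_jobs, ... */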
static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
                                    int jobnr, int threadnr)
{
    VP8Context *s = avctx->priv_data;
    VP8ThreadData *td = &s->thread_data[jobnr];
    VP8ThreadData *next_td = NULL, *prev_td = NULL;
    VP8Frame *curframe = s->curframe;
    int mb_y, num_jobs = s->num_jobs;
    td->thread_nr = threadnr;
    for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
        td->thread_mb_pos = mb_y<<16;
        vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
        if (s->deblock_filter)
            vp8_filter_mb_row(avctx, tdata, jobnr, threadnr);
        update_pos(td, mb_y, INT_MAX & 0xFFFF);

        s->mv_min.y -= 64;
        s->mv_max.y -= 64;

        if (avctx->active_thread_type == FF_THREAD_FRAME)
            ff_thread_report_progress(&curframe->tf, mb_y, 0);
    }

    return 0;
}

static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                            AVPacket *avpkt)
{
    VP8Context *s = avctx->priv_data;
    int ret, i, referenced, num_jobs;
    enum AVDiscard skip_thresh;
    VP8Frame *av_uninit(curframe), *prev_frame;

    if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
        goto err;

    prev_frame = s->framep[VP56_FRAME_CURRENT];
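
    /* A frame is "referenced" if later frames may use it as their last,
     * golden or altref reference; only unreferenced frames can be dropped
     * at the AVDISCARD_NONREF level below. */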
    referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
                                || s->update_altref == VP56_FRAME_CURRENT;

    skip_thresh = !referenced ? AVDISCARD_NONREF :
                    !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;

    if (avctx->skip_frame >= skip_thresh) {
        s->invisible = 1;
        memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
        goto skip_decode;
    }
    s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;

    // release no longer referenced frames
    for (i = 0; i < 5; i++)
        if (s->frames[i].tf.f->data[0] &&
            &s->frames[i] != prev_frame &&
            &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
            vp8_release_frame(s, &s->frames[i]);

    // find a free buffer
    for (i = 0; i < 5; i++)
        if (&s->frames[i] != prev_frame &&
            &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
            curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
            break;
        }
    if (i == 5) {
        av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
        abort();
    }
    if (curframe->tf.f->data[0])
        vp8_release_frame(s, curframe);

    // Given that arithmetic probabilities are updated every frame, it's quite likely
    // that the values we have on a random interframe are complete junk if we didn't
    // start decode on a keyframe. So just don't display anything rather than junk.
    if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
                         !s->framep[VP56_FRAME_GOLDEN] ||
                         !s->framep[VP56_FRAME_GOLDEN2])) {
        av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
        ret = AVERROR_INVALIDDATA;
        goto err;
    }

    curframe->tf.f->key_frame = s->keyframe;
    curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
    if ((ret = vp8_alloc_frame(s, curframe, referenced))) {
        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
        goto err;
    }

    // check if golden and altref are swapped
    if (s->update_altref != VP56_FRAME_NONE) {
        s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
    } else {
        s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
    }
    if (s->update_golden != VP56_FRAME_NONE) {
        s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
    } else {
        s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
    }
    if (s->update_last) {
        s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
    } else {
        s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
    }
    s->next_framep[VP56_FRAME_CURRENT]      = curframe;
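
    /* Reference bookkeeping for this frame is complete; with frame
     * threading, signalling setup completion here lets the next frame's
     * decoding thread start on the following packet. */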

    ff_thread_finish_setup(avctx);

    s->linesize   = curframe->tf.f->linesize[0];
    s->uvlinesize = curframe->tf.f->linesize[1];

    if (!s->thread_data[0].edge_emu_buffer)
        for (i = 0; i < MAX_THREADS; i++)
            s->thread_data[i].edge_emu_buffer = av_malloc(21*s->linesize);

    memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
    /* Zero macroblock structures for top/top-left prediction from outside the frame. */
    if (!s->mb_layout)
        memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
    if (!s->mb_layout && s->keyframe)
        memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);

    // top edge of 127 for intra prediction
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
        s->top_border[0][15] = s->top_border[0][23] = 127;
        s->top_border[0][31] = 127;
        memset(s->top_border[1], 127, s->mb_width*sizeof(*s->top_border));
    }
    memset(s->ref_count, 0, sizeof(s->ref_count));

    // Make sure the previous frame has read its segmentation map,
    // if we re-use the same map.
    if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
        ff_thread_await_progress(&prev_frame->tf, 1, 0);
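
    /* With slice threading (mb_layout == 1), macroblock modes and motion
     * vectors for the whole frame are parsed up front in a single pass, so
     * the per-row jobs only have to decode coefficients. */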
    if (s->mb_layout == 1)
        vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);

    if (avctx->active_thread_type == FF_THREAD_FRAME)
        num_jobs = 1;
    else
        num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
    s->num_jobs   = num_jobs;
    s->curframe   = curframe;
    s->prev_frame = prev_frame;
    s->mv_min.y   = -MARGIN;
    s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
    for (i = 0; i < MAX_THREADS; i++) {
        s->thread_data[i].thread_mb_pos = 0;
        s->thread_data[i].wait_mb_pos   = INT_MAX;
    }
    avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL, num_jobs);
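
    /* Mark the frame as fully decoded for any frame threads waiting on it. */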
    ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
    memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);

skip_decode:
    // if future frames don't use the updated probabilities,
    // reset them to the values we saved
    if (!s->update_probabilities)
        s->prob[0] = s->prob[1];

    if (!s->invisible) {
        if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
            return ret;
        *got_frame = 1;
    }

    return avpkt->size;
err:
    memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
    return ret;
}

static av_cold int vp8_decode_free(AVCodecContext *avctx)
{
    VP8Context *s = avctx->priv_data;
    int i;

    vp8_decode_flush_impl(avctx, 1);
    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
        av_frame_free(&s->frames[i].tf.f);

    return 0;
}

static av_cold int vp8_init_frames(VP8Context *s)
{
    int i;
    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
        s->frames[i].tf.f = av_frame_alloc();
        if (!s->frames[i].tf.f)
            return AVERROR(ENOMEM);
    }
    return 0;
}

static av_cold int vp8_decode_init(AVCodecContext *avctx)
{
    VP8Context *s = avctx->priv_data;
    int ret;

    s->avctx = avctx;
    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
    avctx->internal->allocate_progress = 1;

    ff_videodsp_init(&s->vdsp, 8);
    ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
    ff_vp8dsp_init(&s->vp8dsp);

    if ((ret = vp8_init_frames(s)) < 0) {
        vp8_decode_free(avctx);
        return ret;
    }

    return 0;
}

static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
{
    VP8Context *s = avctx->priv_data;
    int ret;

    s->avctx = avctx;

    if ((ret = vp8_init_frames(s)) < 0) {
        vp8_decode_free(avctx);
        return ret;
    }

    return 0;
}
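
/* Translate a frame pointer from the source thread's context into the
 * corresponding slot of this thread's frames[] array. */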
#define REBASE(pic) \
    pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL

static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
{
    VP8Context *s = dst->priv_data, *s_src = src->priv_data;
    int i;

    if (s->macroblocks_base &&
        (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
        free_buffers(s);
        s->mb_width  = s_src->mb_width;
        s->mb_height = s_src->mb_height;
    }
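
    /* Carry over the entropy-coder state: prob[!update_probabilities] is
     * the probability set that persists into the next frame. */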
    s->prob[0] = s_src->prob[!s_src->update_probabilities];
    s->segmentation = s_src->segmentation;
    s->lf_delta = s_src->lf_delta;
    memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
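
    /* Take new references on the source thread's frames instead of copying
     * pixel data; with refcounted frames both contexts can safely share the
     * same buffers. */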
    for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
        if (s_src->frames[i].tf.f->data[0]) {
            int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
            if (ret < 0)
                return ret;
        }
    }

    s->framep[0] = REBASE(s_src->next_framep[0]);
    s->framep[1] = REBASE(s_src->next_framep[1]);
    s->framep[2] = REBASE(s_src->next_framep[2]);
    s->framep[3] = REBASE(s_src->next_framep[3]);

    return 0;
}

AVCodec ff_vp8_decoder = {
    .name                  = "vp8",
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP8,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = vp8_decode_init,
    .close                 = vp8_decode_free,
    .decode                = vp8_decode_frame,
    .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
    .flush                 = vp8_decode_flush,
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
};