2 * VP7/VP8 compatible video decoder
4 * Copyright (C) 2010 David Conrad
5 * Copyright (C) 2010 Ronald S. Bultje
6 * Copyright (C) 2010 Fiona Glaser
7 * Copyright (C) 2012 Daniel Kang
8 * Copyright (C) 2014 Peter Ross
10 * This file is part of Libav.
12 * Libav is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
17 * Libav is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with Libav; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 #include "libavutil/imgutils.h"
33 #include "rectangle.h"
42 static void free_buffers(VP8Context
*s
)
46 for (i
= 0; i
< MAX_THREADS
; i
++) {
48 pthread_cond_destroy(&s
->thread_data
[i
].cond
);
49 pthread_mutex_destroy(&s
->thread_data
[i
].lock
);
51 av_freep(&s
->thread_data
[i
].filter_strength
);
53 av_freep(&s
->thread_data
);
54 av_freep(&s
->macroblocks_base
);
55 av_freep(&s
->intra4x4_pred_mode_top
);
56 av_freep(&s
->top_nnz
);
57 av_freep(&s
->top_border
);
59 s
->macroblocks
= NULL
;
62 static int vp8_alloc_frame(VP8Context
*s
, VP8Frame
*f
, int ref
)
65 if ((ret
= ff_thread_get_buffer(s
->avctx
, &f
->tf
,
66 ref
? AV_GET_BUFFER_FLAG_REF
: 0)) < 0)
68 if (!(f
->seg_map
= av_buffer_allocz(s
->mb_width
* s
->mb_height
)))
70 if (s
->avctx
->hwaccel
) {
71 const AVHWAccel
*hwaccel
= s
->avctx
->hwaccel
;
72 if (hwaccel
->frame_priv_data_size
) {
73 f
->hwaccel_priv_buf
= av_buffer_allocz(hwaccel
->frame_priv_data_size
);
74 if (!f
->hwaccel_priv_buf
)
76 f
->hwaccel_picture_private
= f
->hwaccel_priv_buf
->data
;
82 av_buffer_unref(&f
->seg_map
);
83 ff_thread_release_buffer(s
->avctx
, &f
->tf
);
84 return AVERROR(ENOMEM
);
87 static void vp8_release_frame(VP8Context
*s
, VP8Frame
*f
)
89 av_buffer_unref(&f
->seg_map
);
90 av_buffer_unref(&f
->hwaccel_priv_buf
);
91 f
->hwaccel_picture_private
= NULL
;
92 ff_thread_release_buffer(s
->avctx
, &f
->tf
);
#if CONFIG_VP8_DECODER
/**
 * Make dst a new reference to src (used by frame threading).
 * dst is released first; on failure dst is left released.
 *
 * @return 0 on success, negative AVERROR on failure
 */
static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
{
    int ret;

    vp8_release_frame(s, dst);

    if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
        return ret;
    if (src->seg_map &&
        !(dst->seg_map = av_buffer_ref(src->seg_map))) {
        vp8_release_frame(s, dst);
        return AVERROR(ENOMEM);
    }
    if (src->hwaccel_picture_private) {
        dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
        if (!dst->hwaccel_priv_buf)
            return AVERROR(ENOMEM);
        dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
    }

    return 0;
}
#endif /* CONFIG_VP8_DECODER */
120 static void vp8_decode_flush_impl(AVCodecContext
*avctx
, int free_mem
)
122 VP8Context
*s
= avctx
->priv_data
;
125 for (i
= 0; i
< FF_ARRAY_ELEMS(s
->frames
); i
++)
126 vp8_release_frame(s
, &s
->frames
[i
]);
127 memset(s
->framep
, 0, sizeof(s
->framep
));
133 static void vp8_decode_flush(AVCodecContext
*avctx
)
135 vp8_decode_flush_impl(avctx
, 0);
138 static VP8Frame
*vp8_find_free_buffer(VP8Context
*s
)
140 VP8Frame
*frame
= NULL
;
143 // find a free buffer
144 for (i
= 0; i
< 5; i
++)
145 if (&s
->frames
[i
] != s
->framep
[VP56_FRAME_CURRENT
] &&
146 &s
->frames
[i
] != s
->framep
[VP56_FRAME_PREVIOUS
] &&
147 &s
->frames
[i
] != s
->framep
[VP56_FRAME_GOLDEN
] &&
148 &s
->frames
[i
] != s
->framep
[VP56_FRAME_GOLDEN2
]) {
149 frame
= &s
->frames
[i
];
153 av_log(s
->avctx
, AV_LOG_FATAL
, "Ran out of free frames!\n");
156 if (frame
->tf
.f
->buf
[0])
157 vp8_release_frame(s
, frame
);
162 static av_always_inline
163 int update_dimensions(VP8Context
*s
, int width
, int height
, int is_vp7
)
165 AVCodecContext
*avctx
= s
->avctx
;
168 if (width
!= s
->avctx
->width
||
169 height
!= s
->avctx
->height
) {
170 vp8_decode_flush_impl(s
->avctx
, 1);
172 ret
= ff_set_dimensions(s
->avctx
, width
, height
);
177 s
->mb_width
= (s
->avctx
->coded_width
+ 15) / 16;
178 s
->mb_height
= (s
->avctx
->coded_height
+ 15) / 16;
180 s
->mb_layout
= is_vp7
|| avctx
->active_thread_type
== FF_THREAD_SLICE
&&
181 FFMIN(s
->num_coeff_partitions
, avctx
->thread_count
) > 1;
182 if (!s
->mb_layout
) { // Frame threading and one thread
183 s
->macroblocks_base
= av_mallocz((s
->mb_width
+ s
->mb_height
* 2 + 1) *
184 sizeof(*s
->macroblocks
));
185 s
->intra4x4_pred_mode_top
= av_mallocz(s
->mb_width
* 4);
186 } else // Sliced threading
187 s
->macroblocks_base
= av_mallocz((s
->mb_width
+ 2) * (s
->mb_height
+ 2) *
188 sizeof(*s
->macroblocks
));
189 s
->top_nnz
= av_mallocz(s
->mb_width
* sizeof(*s
->top_nnz
));
190 s
->top_border
= av_mallocz((s
->mb_width
+ 1) * sizeof(*s
->top_border
));
191 s
->thread_data
= av_mallocz(MAX_THREADS
* sizeof(VP8ThreadData
));
193 if (!s
->macroblocks_base
|| !s
->top_nnz
|| !s
->top_border
||
194 !s
->thread_data
|| (!s
->intra4x4_pred_mode_top
&& !s
->mb_layout
)) {
196 return AVERROR(ENOMEM
);
199 for (i
= 0; i
< MAX_THREADS
; i
++) {
200 s
->thread_data
[i
].filter_strength
=
201 av_mallocz(s
->mb_width
* sizeof(*s
->thread_data
[0].filter_strength
));
202 if (!s
->thread_data
[i
].filter_strength
) {
204 return AVERROR(ENOMEM
);
207 pthread_mutex_init(&s
->thread_data
[i
].lock
, NULL
);
208 pthread_cond_init(&s
->thread_data
[i
].cond
, NULL
);
212 s
->macroblocks
= s
->macroblocks_base
+ 1;
217 static int vp7_update_dimensions(VP8Context
*s
, int width
, int height
)
219 return update_dimensions(s
, width
, height
, IS_VP7
);
222 static int vp8_update_dimensions(VP8Context
*s
, int width
, int height
)
224 return update_dimensions(s
, width
, height
, IS_VP8
);
227 static void parse_segment_info(VP8Context
*s
)
229 VP56RangeCoder
*c
= &s
->c
;
232 s
->segmentation
.update_map
= vp8_rac_get(c
);
233 s
->segmentation
.update_feature_data
= vp8_rac_get(c
);
235 if (s
->segmentation
.update_feature_data
) {
236 s
->segmentation
.absolute_vals
= vp8_rac_get(c
);
238 for (i
= 0; i
< 4; i
++)
239 s
->segmentation
.base_quant
[i
] = vp8_rac_get_sint(c
, 7);
241 for (i
= 0; i
< 4; i
++)
242 s
->segmentation
.filter_level
[i
] = vp8_rac_get_sint(c
, 6);
244 if (s
->segmentation
.update_map
)
245 for (i
= 0; i
< 3; i
++)
246 s
->prob
->segmentid
[i
] = vp8_rac_get(c
) ? vp8_rac_get_uint(c
, 8) : 255;
249 static void update_lf_deltas(VP8Context
*s
)
251 VP56RangeCoder
*c
= &s
->c
;
254 for (i
= 0; i
< 4; i
++) {
255 if (vp8_rac_get(c
)) {
256 s
->lf_delta
.ref
[i
] = vp8_rac_get_uint(c
, 6);
259 s
->lf_delta
.ref
[i
] = -s
->lf_delta
.ref
[i
];
263 for (i
= MODE_I4x4
; i
<= VP8_MVMODE_SPLIT
; i
++) {
264 if (vp8_rac_get(c
)) {
265 s
->lf_delta
.mode
[i
] = vp8_rac_get_uint(c
, 6);
268 s
->lf_delta
.mode
[i
] = -s
->lf_delta
.mode
[i
];
273 static int setup_partitions(VP8Context
*s
, const uint8_t *buf
, int buf_size
)
275 const uint8_t *sizes
= buf
;
278 s
->num_coeff_partitions
= 1 << vp8_rac_get_uint(&s
->c
, 2);
280 buf
+= 3 * (s
->num_coeff_partitions
- 1);
281 buf_size
-= 3 * (s
->num_coeff_partitions
- 1);
285 for (i
= 0; i
< s
->num_coeff_partitions
- 1; i
++) {
286 int size
= AV_RL24(sizes
+ 3 * i
);
287 if (buf_size
- size
< 0)
289 s
->coeff_partition_size
[i
] = size
;
291 ff_vp56_init_range_decoder(&s
->coeff_partition
[i
], buf
, size
);
296 s
->coeff_partition_size
[i
] = buf_size
;
297 ff_vp56_init_range_decoder(&s
->coeff_partition
[i
], buf
, buf_size
);
302 static void vp7_get_quants(VP8Context
*s
)
304 VP56RangeCoder
*c
= &s
->c
;
306 int yac_qi
= vp8_rac_get_uint(c
, 7);
307 int ydc_qi
= vp8_rac_get(c
) ? vp8_rac_get_uint(c
, 7) : yac_qi
;
308 int y2dc_qi
= vp8_rac_get(c
) ? vp8_rac_get_uint(c
, 7) : yac_qi
;
309 int y2ac_qi
= vp8_rac_get(c
) ? vp8_rac_get_uint(c
, 7) : yac_qi
;
310 int uvdc_qi
= vp8_rac_get(c
) ? vp8_rac_get_uint(c
, 7) : yac_qi
;
311 int uvac_qi
= vp8_rac_get(c
) ? vp8_rac_get_uint(c
, 7) : yac_qi
;
313 s
->qmat
[0].luma_qmul
[0] = vp7_ydc_qlookup
[ydc_qi
];
314 s
->qmat
[0].luma_qmul
[1] = vp7_yac_qlookup
[yac_qi
];
315 s
->qmat
[0].luma_dc_qmul
[0] = vp7_y2dc_qlookup
[y2dc_qi
];
316 s
->qmat
[0].luma_dc_qmul
[1] = vp7_y2ac_qlookup
[y2ac_qi
];
317 s
->qmat
[0].chroma_qmul
[0] = FFMIN(vp7_ydc_qlookup
[uvdc_qi
], 132);
318 s
->qmat
[0].chroma_qmul
[1] = vp7_yac_qlookup
[uvac_qi
];
321 static void get_quants(VP8Context
*s
)
323 VP56RangeCoder
*c
= &s
->c
;
326 s
->quant
.yac_qi
= vp8_rac_get_uint(c
, 7);
327 s
->quant
.ydc_delta
= vp8_rac_get_sint(c
, 4);
328 s
->quant
.y2dc_delta
= vp8_rac_get_sint(c
, 4);
329 s
->quant
.y2ac_delta
= vp8_rac_get_sint(c
, 4);
330 s
->quant
.uvdc_delta
= vp8_rac_get_sint(c
, 4);
331 s
->quant
.uvac_delta
= vp8_rac_get_sint(c
, 4);
333 for (i
= 0; i
< 4; i
++) {
334 if (s
->segmentation
.enabled
) {
335 base_qi
= s
->segmentation
.base_quant
[i
];
336 if (!s
->segmentation
.absolute_vals
)
337 base_qi
+= s
->quant
.yac_qi
;
339 base_qi
= s
->quant
.yac_qi
;
341 s
->qmat
[i
].luma_qmul
[0] = vp8_dc_qlookup
[av_clip_uintp2(base_qi
+ s
->quant
.ydc_delta
, 7)];
342 s
->qmat
[i
].luma_qmul
[1] = vp8_ac_qlookup
[av_clip_uintp2(base_qi
, 7)];
343 s
->qmat
[i
].luma_dc_qmul
[0] = vp8_dc_qlookup
[av_clip_uintp2(base_qi
+ s
->quant
.y2dc_delta
, 7)] * 2;
344 /* 101581>>16 is equivalent to 155/100 */
345 s
->qmat
[i
].luma_dc_qmul
[1] = vp8_ac_qlookup
[av_clip_uintp2(base_qi
+ s
->quant
.y2ac_delta
, 7)] * 101581 >> 16;
346 s
->qmat
[i
].chroma_qmul
[0] = vp8_dc_qlookup
[av_clip_uintp2(base_qi
+ s
->quant
.uvdc_delta
, 7)];
347 s
->qmat
[i
].chroma_qmul
[1] = vp8_ac_qlookup
[av_clip_uintp2(base_qi
+ s
->quant
.uvac_delta
, 7)];
349 s
->qmat
[i
].luma_dc_qmul
[1] = FFMAX(s
->qmat
[i
].luma_dc_qmul
[1], 8);
350 s
->qmat
[i
].chroma_qmul
[0] = FFMIN(s
->qmat
[i
].chroma_qmul
[0], 132);
355 * Determine which buffers golden and altref should be updated with after this frame.
356 * The spec isn't clear here, so I'm going by my understanding of what libvpx does
358 * Intra frames update all 3 references
359 * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
360 * If the update (golden|altref) flag is set, it's updated with the current frame
361 * if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
362 * If the flag is not set, the number read means:
364 * 1: VP56_FRAME_PREVIOUS
365 * 2: update golden with altref, or update altref with golden
367 static VP56Frame
ref_to_update(VP8Context
*s
, int update
, VP56Frame ref
)
369 VP56RangeCoder
*c
= &s
->c
;
372 return VP56_FRAME_CURRENT
;
374 switch (vp8_rac_get_uint(c
, 2)) {
376 return VP56_FRAME_PREVIOUS
;
378 return (ref
== VP56_FRAME_GOLDEN
) ? VP56_FRAME_GOLDEN2
: VP56_FRAME_GOLDEN
;
380 return VP56_FRAME_NONE
;
383 static void vp78_reset_probability_tables(VP8Context
*s
)
386 for (i
= 0; i
< 4; i
++)
387 for (j
= 0; j
< 16; j
++)
388 memcpy(s
->prob
->token
[i
][j
], vp8_token_default_probs
[i
][vp8_coeff_band
[j
]],
389 sizeof(s
->prob
->token
[i
][j
]));
392 static void vp78_update_probability_tables(VP8Context
*s
)
394 VP56RangeCoder
*c
= &s
->c
;
397 for (i
= 0; i
< 4; i
++)
398 for (j
= 0; j
< 8; j
++)
399 for (k
= 0; k
< 3; k
++)
400 for (l
= 0; l
< NUM_DCT_TOKENS
-1; l
++)
401 if (vp56_rac_get_prob_branchy(c
, vp8_token_update_probs
[i
][j
][k
][l
])) {
402 int prob
= vp8_rac_get_uint(c
, 8);
403 for (m
= 0; vp8_coeff_band_indexes
[j
][m
] >= 0; m
++)
404 s
->prob
->token
[i
][vp8_coeff_band_indexes
[j
][m
]][k
][l
] = prob
;
408 #define VP7_MVC_SIZE 17
409 #define VP8_MVC_SIZE 19
411 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context
*s
,
414 VP56RangeCoder
*c
= &s
->c
;
418 for (i
= 0; i
< 4; i
++)
419 s
->prob
->pred16x16
[i
] = vp8_rac_get_uint(c
, 8);
421 for (i
= 0; i
< 3; i
++)
422 s
->prob
->pred8x8c
[i
] = vp8_rac_get_uint(c
, 8);
424 // 17.2 MV probability update
425 for (i
= 0; i
< 2; i
++)
426 for (j
= 0; j
< mvc_size
; j
++)
427 if (vp56_rac_get_prob_branchy(c
, vp8_mv_update_prob
[i
][j
]))
428 s
->prob
->mvc
[i
][j
] = vp8_rac_get_nn(c
);
431 static void update_refs(VP8Context
*s
)
433 VP56RangeCoder
*c
= &s
->c
;
435 int update_golden
= vp8_rac_get(c
);
436 int update_altref
= vp8_rac_get(c
);
438 s
->update_golden
= ref_to_update(s
, update_golden
, VP56_FRAME_GOLDEN
);
439 s
->update_altref
= ref_to_update(s
, update_altref
, VP56_FRAME_GOLDEN2
);
442 static void copy_luma(AVFrame
*dst
, AVFrame
*src
, int width
, int height
)
446 for (j
= 1; j
< 3; j
++) {
447 for (i
= 0; i
< height
/ 2; i
++)
448 memcpy(dst
->data
[j
] + i
* dst
->linesize
[j
],
449 src
->data
[j
] + i
* src
->linesize
[j
], width
/ 2);
/* Apply the VP7 fade: out = clip(y + y*beta/256 + alpha) per pixel.
 * dst and src share the same linesize; operates in place when dst == src. */
static void fade(uint8_t *dst, uint8_t *src,
                 int width, int height, ptrdiff_t linesize,
                 int alpha, int beta)
{
    int i, j;
    for (j = 0; j < height; j++) {
        for (i = 0; i < width; i++) {
            uint8_t y = src[j * linesize + i];
            dst[j * linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
        }
    }
}
467 static int vp7_fade_frame(VP8Context
*s
, VP56RangeCoder
*c
)
469 int alpha
= (int8_t) vp8_rac_get_uint(c
, 8);
470 int beta
= (int8_t) vp8_rac_get_uint(c
, 8);
473 if (!s
->keyframe
&& (alpha
|| beta
)) {
474 int width
= s
->mb_width
* 16;
475 int height
= s
->mb_height
* 16;
478 if (!s
->framep
[VP56_FRAME_PREVIOUS
])
479 return AVERROR_INVALIDDATA
;
482 src
= s
->framep
[VP56_FRAME_PREVIOUS
]->tf
.f
;
484 /* preserve the golden frame, write a new previous frame */
485 if (s
->framep
[VP56_FRAME_GOLDEN
] == s
->framep
[VP56_FRAME_PREVIOUS
]) {
486 s
->framep
[VP56_FRAME_PREVIOUS
] = vp8_find_free_buffer(s
);
487 if ((ret
= vp8_alloc_frame(s
, s
->framep
[VP56_FRAME_PREVIOUS
], 1)) < 0)
490 dst
= s
->framep
[VP56_FRAME_PREVIOUS
]->tf
.f
;
492 copy_luma(dst
, src
, width
, height
);
495 fade(dst
->data
[0], src
->data
[0],
496 width
, height
, dst
->linesize
[0], alpha
, beta
);
502 static int vp7_decode_frame_header(VP8Context
*s
, const uint8_t *buf
, int buf_size
)
504 VP56RangeCoder
*c
= &s
->c
;
505 int part1_size
, hscale
, vscale
, i
, j
, ret
;
506 int width
= s
->avctx
->width
;
507 int height
= s
->avctx
->height
;
510 return AVERROR_INVALIDDATA
;
513 s
->profile
= (buf
[0] >> 1) & 7;
514 if (s
->profile
> 1) {
515 avpriv_request_sample(s
->avctx
, "Unknown profile %d", s
->profile
);
516 return AVERROR_INVALIDDATA
;
519 s
->keyframe
= !(buf
[0] & 1);
521 part1_size
= AV_RL24(buf
) >> 4;
523 buf
+= 4 - s
->profile
;
524 buf_size
-= 4 - s
->profile
;
526 if (buf_size
< part1_size
) {
527 return AVERROR_INVALIDDATA
;
530 memcpy(s
->put_pixels_tab
, s
->vp8dsp
.put_vp8_epel_pixels_tab
, sizeof(s
->put_pixels_tab
));
532 ff_vp56_init_range_decoder(c
, buf
, part1_size
);
534 buf_size
-= part1_size
;
536 /* A. Dimension information (keyframes only) */
538 width
= vp8_rac_get_uint(c
, 12);
539 height
= vp8_rac_get_uint(c
, 12);
540 hscale
= vp8_rac_get_uint(c
, 2);
541 vscale
= vp8_rac_get_uint(c
, 2);
542 if (hscale
|| vscale
)
543 avpriv_request_sample(s
->avctx
, "Upscaling");
545 s
->update_golden
= s
->update_altref
= VP56_FRAME_CURRENT
;
546 vp78_reset_probability_tables(s
);
547 memcpy(s
->prob
->pred16x16
, vp8_pred16x16_prob_inter
,
548 sizeof(s
->prob
->pred16x16
));
549 memcpy(s
->prob
->pred8x8c
, vp8_pred8x8c_prob_inter
,
550 sizeof(s
->prob
->pred8x8c
));
551 for (i
= 0; i
< 2; i
++)
552 memcpy(s
->prob
->mvc
[i
], vp7_mv_default_prob
[i
],
553 sizeof(vp7_mv_default_prob
[i
]));
554 memset(&s
->segmentation
, 0, sizeof(s
->segmentation
));
555 memset(&s
->lf_delta
, 0, sizeof(s
->lf_delta
));
556 memcpy(s
->prob
[0].scan
, ff_zigzag_scan
, sizeof(s
->prob
[0].scan
));
559 if (s
->keyframe
|| s
->profile
> 0)
560 memset(s
->inter_dc_pred
, 0 , sizeof(s
->inter_dc_pred
));
562 /* B. Decoding information for all four macroblock-level features */
563 for (i
= 0; i
< 4; i
++) {
564 s
->feature_enabled
[i
] = vp8_rac_get(c
);
565 if (s
->feature_enabled
[i
]) {
566 s
->feature_present_prob
[i
] = vp8_rac_get_uint(c
, 8);
568 for (j
= 0; j
< 3; j
++)
569 s
->feature_index_prob
[i
][j
] =
570 vp8_rac_get(c
) ? vp8_rac_get_uint(c
, 8) : 255;
572 if (vp7_feature_value_size
[s
->profile
][i
])
573 for (j
= 0; j
< 4; j
++)
574 s
->feature_value
[i
][j
] =
575 vp8_rac_get(c
) ? vp8_rac_get_uint(c
, vp7_feature_value_size
[s
->profile
][i
]) : 0;
579 s
->segmentation
.enabled
= 0;
580 s
->segmentation
.update_map
= 0;
581 s
->lf_delta
.enabled
= 0;
583 s
->num_coeff_partitions
= 1;
584 ff_vp56_init_range_decoder(&s
->coeff_partition
[0], buf
, buf_size
);
586 if (!s
->macroblocks_base
|| /* first frame */
587 width
!= s
->avctx
->width
|| height
!= s
->avctx
->height
||
588 (width
+ 15) / 16 != s
->mb_width
|| (height
+ 15) / 16 != s
->mb_height
) {
589 if ((ret
= vp7_update_dimensions(s
, width
, height
)) < 0)
593 /* C. Dequantization indices */
596 /* D. Golden frame update flag (a Flag) for interframes only */
598 s
->update_golden
= vp8_rac_get(c
) ? VP56_FRAME_CURRENT
: VP56_FRAME_NONE
;
599 s
->sign_bias
[VP56_FRAME_GOLDEN
] = 0;
603 s
->update_probabilities
= 1;
606 if (s
->profile
> 0) {
607 s
->update_probabilities
= vp8_rac_get(c
);
608 if (!s
->update_probabilities
)
609 s
->prob
[1] = s
->prob
[0];
612 s
->fade_present
= vp8_rac_get(c
);
615 /* E. Fading information for previous frame */
616 if (s
->fade_present
&& vp8_rac_get(c
)) {
617 if ((ret
= vp7_fade_frame(s
,c
)) < 0)
621 /* F. Loop filter type */
623 s
->filter
.simple
= vp8_rac_get(c
);
625 /* G. DCT coefficient ordering specification */
627 for (i
= 1; i
< 16; i
++)
628 s
->prob
[0].scan
[i
] = ff_zigzag_scan
[vp8_rac_get_uint(c
, 4)];
630 /* H. Loop filter levels */
632 s
->filter
.simple
= vp8_rac_get(c
);
633 s
->filter
.level
= vp8_rac_get_uint(c
, 6);
634 s
->filter
.sharpness
= vp8_rac_get_uint(c
, 3);
636 /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
637 vp78_update_probability_tables(s
);
639 s
->mbskip_enabled
= 0;
641 /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
643 s
->prob
->intra
= vp8_rac_get_uint(c
, 8);
644 s
->prob
->last
= vp8_rac_get_uint(c
, 8);
645 vp78_update_pred16x16_pred8x8_mvc_probabilities(s
, VP7_MVC_SIZE
);
651 static int vp8_decode_frame_header(VP8Context
*s
, const uint8_t *buf
, int buf_size
)
653 VP56RangeCoder
*c
= &s
->c
;
654 int header_size
, hscale
, vscale
, ret
;
655 int width
= s
->avctx
->width
;
656 int height
= s
->avctx
->height
;
658 s
->keyframe
= !(buf
[0] & 1);
659 s
->profile
= (buf
[0]>>1) & 7;
660 s
->invisible
= !(buf
[0] & 0x10);
661 header_size
= AV_RL24(buf
) >> 5;
665 s
->header_partition_size
= header_size
;
668 av_log(s
->avctx
, AV_LOG_WARNING
, "Unknown profile %d\n", s
->profile
);
671 memcpy(s
->put_pixels_tab
, s
->vp8dsp
.put_vp8_epel_pixels_tab
,
672 sizeof(s
->put_pixels_tab
));
673 else // profile 1-3 use bilinear, 4+ aren't defined so whatever
674 memcpy(s
->put_pixels_tab
, s
->vp8dsp
.put_vp8_bilinear_pixels_tab
,
675 sizeof(s
->put_pixels_tab
));
677 if (header_size
> buf_size
- 7 * s
->keyframe
) {
678 av_log(s
->avctx
, AV_LOG_ERROR
, "Header size larger than data provided\n");
679 return AVERROR_INVALIDDATA
;
683 if (AV_RL24(buf
) != 0x2a019d) {
684 av_log(s
->avctx
, AV_LOG_ERROR
,
685 "Invalid start code 0x%x\n", AV_RL24(buf
));
686 return AVERROR_INVALIDDATA
;
688 width
= AV_RL16(buf
+ 3) & 0x3fff;
689 height
= AV_RL16(buf
+ 5) & 0x3fff;
690 hscale
= buf
[4] >> 6;
691 vscale
= buf
[6] >> 6;
695 if (hscale
|| vscale
)
696 avpriv_request_sample(s
->avctx
, "Upscaling");
698 s
->update_golden
= s
->update_altref
= VP56_FRAME_CURRENT
;
699 vp78_reset_probability_tables(s
);
700 memcpy(s
->prob
->pred16x16
, vp8_pred16x16_prob_inter
,
701 sizeof(s
->prob
->pred16x16
));
702 memcpy(s
->prob
->pred8x8c
, vp8_pred8x8c_prob_inter
,
703 sizeof(s
->prob
->pred8x8c
));
704 memcpy(s
->prob
->mvc
, vp8_mv_default_prob
,
705 sizeof(s
->prob
->mvc
));
706 memset(&s
->segmentation
, 0, sizeof(s
->segmentation
));
707 memset(&s
->lf_delta
, 0, sizeof(s
->lf_delta
));
710 ff_vp56_init_range_decoder(c
, buf
, header_size
);
712 buf_size
-= header_size
;
715 s
->colorspace
= vp8_rac_get(c
);
717 av_log(s
->avctx
, AV_LOG_WARNING
, "Unspecified colorspace\n");
718 s
->fullrange
= vp8_rac_get(c
);
721 if ((s
->segmentation
.enabled
= vp8_rac_get(c
)))
722 parse_segment_info(s
);
724 s
->segmentation
.update_map
= 0; // FIXME: move this to some init function?
726 s
->filter
.simple
= vp8_rac_get(c
);
727 s
->filter
.level
= vp8_rac_get_uint(c
, 6);
728 s
->filter
.sharpness
= vp8_rac_get_uint(c
, 3);
730 if ((s
->lf_delta
.enabled
= vp8_rac_get(c
))) {
731 s
->lf_delta
.update
= vp8_rac_get(c
);
732 if (s
->lf_delta
.update
)
736 if (setup_partitions(s
, buf
, buf_size
)) {
737 av_log(s
->avctx
, AV_LOG_ERROR
, "Invalid partitions\n");
738 return AVERROR_INVALIDDATA
;
741 if (!s
->macroblocks_base
|| /* first frame */
742 width
!= s
->avctx
->width
|| height
!= s
->avctx
->height
)
743 if ((ret
= vp8_update_dimensions(s
, width
, height
)) < 0)
750 s
->sign_bias
[VP56_FRAME_GOLDEN
] = vp8_rac_get(c
);
751 s
->sign_bias
[VP56_FRAME_GOLDEN2
/* altref */] = vp8_rac_get(c
);
754 // if we aren't saving this frame's probabilities for future frames,
755 // make a copy of the current probabilities
756 if (!(s
->update_probabilities
= vp8_rac_get(c
)))
757 s
->prob
[1] = s
->prob
[0];
759 s
->update_last
= s
->keyframe
|| vp8_rac_get(c
);
761 vp78_update_probability_tables(s
);
763 if ((s
->mbskip_enabled
= vp8_rac_get(c
)))
764 s
->prob
->mbskip
= vp8_rac_get_uint(c
, 8);
767 s
->prob
->intra
= vp8_rac_get_uint(c
, 8);
768 s
->prob
->last
= vp8_rac_get_uint(c
, 8);
769 s
->prob
->golden
= vp8_rac_get_uint(c
, 8);
770 vp78_update_pred16x16_pred8x8_mvc_probabilities(s
, VP8_MVC_SIZE
);
773 // Record the entropy coder state here so that hwaccels can use it.
774 s
->c
.code_word
= vp56_rac_renorm(&s
->c
);
775 s
->coder_state_at_header_end
.input
= s
->c
.buffer
- (-s
->c
.bits
/ 8);
776 s
->coder_state_at_header_end
.range
= s
->c
.high
;
777 s
->coder_state_at_header_end
.value
= s
->c
.code_word
>> 16;
778 s
->coder_state_at_header_end
.bit_count
= -s
->c
.bits
% 8;
783 static av_always_inline
784 void clamp_mv(VP8Context
*s
, VP56mv
*dst
, const VP56mv
*src
)
786 dst
->x
= av_clip(src
->x
, s
->mv_min
.x
, s
->mv_max
.x
);
787 dst
->y
= av_clip(src
->y
, s
->mv_min
.y
, s
->mv_max
.y
);
791 * Motion vector coding, 17.1.
793 static int read_mv_component(VP56RangeCoder
*c
, const uint8_t *p
, int vp7
)
797 if (vp56_rac_get_prob_branchy(c
, p
[0])) {
800 for (i
= 0; i
< 3; i
++)
801 x
+= vp56_rac_get_prob(c
, p
[9 + i
]) << i
;
802 for (i
= (vp7
? 7 : 9); i
> 3; i
--)
803 x
+= vp56_rac_get_prob(c
, p
[9 + i
]) << i
;
804 if (!(x
& (vp7
? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c
, p
[12]))
808 const uint8_t *ps
= p
+ 2;
809 bit
= vp56_rac_get_prob(c
, *ps
);
812 bit
= vp56_rac_get_prob(c
, *ps
);
815 x
+= vp56_rac_get_prob(c
, *ps
);
818 return (x
&& vp56_rac_get_prob(c
, p
[1])) ? -x
: x
;
821 static av_always_inline
822 const uint8_t *get_submv_prob(uint32_t left
, uint32_t top
, int is_vp7
)
825 return vp7_submv_prob
;
828 return vp8_submv_prob
[4 - !!left
];
830 return vp8_submv_prob
[2];
831 return vp8_submv_prob
[1 - !!left
];
835 * Split motion vector prediction, 16.4.
836 * @returns the number of motion vectors parsed (2, 4 or 16)
838 static av_always_inline
839 int decode_splitmvs(VP8Context
*s
, VP56RangeCoder
*c
, VP8Macroblock
*mb
,
840 int layout
, int is_vp7
)
844 VP8Macroblock
*top_mb
;
845 VP8Macroblock
*left_mb
= &mb
[-1];
846 const uint8_t *mbsplits_left
= vp8_mbsplits
[left_mb
->partitioning
];
847 const uint8_t *mbsplits_top
, *mbsplits_cur
, *firstidx
;
849 VP56mv
*left_mv
= left_mb
->bmv
;
850 VP56mv
*cur_mv
= mb
->bmv
;
852 if (!layout
) // layout is inlined, s->mb_layout is not
855 top_mb
= &mb
[-s
->mb_width
- 1];
856 mbsplits_top
= vp8_mbsplits
[top_mb
->partitioning
];
857 top_mv
= top_mb
->bmv
;
859 if (vp56_rac_get_prob_branchy(c
, vp8_mbsplit_prob
[0])) {
860 if (vp56_rac_get_prob_branchy(c
, vp8_mbsplit_prob
[1]))
861 part_idx
= VP8_SPLITMVMODE_16x8
+ vp56_rac_get_prob(c
, vp8_mbsplit_prob
[2]);
863 part_idx
= VP8_SPLITMVMODE_8x8
;
865 part_idx
= VP8_SPLITMVMODE_4x4
;
868 num
= vp8_mbsplit_count
[part_idx
];
869 mbsplits_cur
= vp8_mbsplits
[part_idx
],
870 firstidx
= vp8_mbfirstidx
[part_idx
];
871 mb
->partitioning
= part_idx
;
873 for (n
= 0; n
< num
; n
++) {
875 uint32_t left
, above
;
876 const uint8_t *submv_prob
;
879 left
= AV_RN32A(&left_mv
[mbsplits_left
[k
+ 3]]);
881 left
= AV_RN32A(&cur_mv
[mbsplits_cur
[k
- 1]]);
883 above
= AV_RN32A(&top_mv
[mbsplits_top
[k
+ 12]]);
885 above
= AV_RN32A(&cur_mv
[mbsplits_cur
[k
- 4]]);
887 submv_prob
= get_submv_prob(left
, above
, is_vp7
);
889 if (vp56_rac_get_prob_branchy(c
, submv_prob
[0])) {
890 if (vp56_rac_get_prob_branchy(c
, submv_prob
[1])) {
891 if (vp56_rac_get_prob_branchy(c
, submv_prob
[2])) {
892 mb
->bmv
[n
].y
= mb
->mv
.y
+
893 read_mv_component(c
, s
->prob
->mvc
[0], is_vp7
);
894 mb
->bmv
[n
].x
= mb
->mv
.x
+
895 read_mv_component(c
, s
->prob
->mvc
[1], is_vp7
);
897 AV_ZERO32(&mb
->bmv
[n
]);
900 AV_WN32A(&mb
->bmv
[n
], above
);
903 AV_WN32A(&mb
->bmv
[n
], left
);
/**
 * The vp7 reference decoder uses a padding macroblock column (added to right
 * edge of the frame) to guard against illegal macroblock offsets. The
 * algorithm has bugs that permit offsets to straddle the padding column.
 * This function replicates those bugs.
 *
 * @param[out] edge_x macroblock x address
 * @param[out] edge_y macroblock y address
 *
 * @return macroblock offset legal (boolean)
 */
static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
                                   int xoffset, int yoffset, int boundary,
                                   int *edge_x, int *edge_y)
{
    int vwidth = mb_width + 1;
    int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
    if (new < boundary || new % vwidth == vwidth - 1)
        return 0;
    *edge_y = new / vwidth;
    *edge_x = new % vwidth;
    return 1;
}
934 static const VP56mv
*get_bmv_ptr(const VP8Macroblock
*mb
, int subblock
)
936 return &mb
->bmv
[mb
->mode
== VP8_MVMODE_SPLIT
? vp8_mbsplits
[mb
->partitioning
][subblock
] : 0];
939 static av_always_inline
940 void vp7_decode_mvs(VP8Context
*s
, VP8Macroblock
*mb
,
941 int mb_x
, int mb_y
, int layout
)
943 VP8Macroblock
*mb_edge
[12];
944 enum { CNT_ZERO
, CNT_NEAREST
, CNT_NEAR
};
945 enum { VP8_EDGE_TOP
, VP8_EDGE_LEFT
, VP8_EDGE_TOPLEFT
};
948 uint8_t cnt
[3] = { 0 };
949 VP56RangeCoder
*c
= &s
->c
;
952 AV_ZERO32(&near_mv
[0]);
953 AV_ZERO32(&near_mv
[1]);
954 AV_ZERO32(&near_mv
[2]);
956 for (i
= 0; i
< VP7_MV_PRED_COUNT
; i
++) {
957 const VP7MVPred
* pred
= &vp7_mv_pred
[i
];
960 if (vp7_calculate_mb_offset(mb_x
, mb_y
, s
->mb_width
, pred
->xoffset
,
961 pred
->yoffset
, !s
->profile
, &edge_x
, &edge_y
)) {
962 VP8Macroblock
*edge
= mb_edge
[i
] = (s
->mb_layout
== 1)
963 ? s
->macroblocks_base
+ 1 + edge_x
+
964 (s
->mb_width
+ 1) * (edge_y
+ 1)
965 : s
->macroblocks
+ edge_x
+
966 (s
->mb_height
- edge_y
- 1) * 2;
967 uint32_t mv
= AV_RN32A(get_bmv_ptr(edge
, vp7_mv_pred
[i
].subblock
));
969 if (AV_RN32A(&near_mv
[CNT_NEAREST
])) {
970 if (mv
== AV_RN32A(&near_mv
[CNT_NEAREST
])) {
972 } else if (AV_RN32A(&near_mv
[CNT_NEAR
])) {
973 if (mv
!= AV_RN32A(&near_mv
[CNT_NEAR
]))
977 AV_WN32A(&near_mv
[CNT_NEAR
], mv
);
981 AV_WN32A(&near_mv
[CNT_NEAREST
], mv
);
990 cnt
[idx
] += vp7_mv_pred
[i
].score
;
993 mb
->partitioning
= VP8_SPLITMVMODE_NONE
;
995 if (vp56_rac_get_prob_branchy(c
, vp7_mode_contexts
[cnt
[CNT_ZERO
]][0])) {
996 mb
->mode
= VP8_MVMODE_MV
;
998 if (vp56_rac_get_prob_branchy(c
, vp7_mode_contexts
[cnt
[CNT_NEAREST
]][1])) {
1000 if (vp56_rac_get_prob_branchy(c
, vp7_mode_contexts
[cnt
[CNT_NEAR
]][2])) {
1002 if (cnt
[CNT_NEAREST
] > cnt
[CNT_NEAR
])
1003 AV_WN32A(&mb
->mv
, cnt
[CNT_ZERO
] > cnt
[CNT_NEAREST
] ? 0 : AV_RN32A(&near_mv
[CNT_NEAREST
]));
1005 AV_WN32A(&mb
->mv
, cnt
[CNT_ZERO
] > cnt
[CNT_NEAR
] ? 0 : AV_RN32A(&near_mv
[CNT_NEAR
]));
1007 if (vp56_rac_get_prob_branchy(c
, vp7_mode_contexts
[cnt
[CNT_NEAR
]][3])) {
1008 mb
->mode
= VP8_MVMODE_SPLIT
;
1009 mb
->mv
= mb
->bmv
[decode_splitmvs(s
, c
, mb
, layout
, IS_VP7
) - 1];
1011 mb
->mv
.y
+= read_mv_component(c
, s
->prob
->mvc
[0], IS_VP7
);
1012 mb
->mv
.x
+= read_mv_component(c
, s
->prob
->mvc
[1], IS_VP7
);
1013 mb
->bmv
[0] = mb
->mv
;
1016 mb
->mv
= near_mv
[CNT_NEAR
];
1017 mb
->bmv
[0] = mb
->mv
;
1020 mb
->mv
= near_mv
[CNT_NEAREST
];
1021 mb
->bmv
[0] = mb
->mv
;
1024 mb
->mode
= VP8_MVMODE_ZERO
;
1026 mb
->bmv
[0] = mb
->mv
;
1030 static av_always_inline
1031 void vp8_decode_mvs(VP8Context
*s
, VP8Macroblock
*mb
,
1032 int mb_x
, int mb_y
, int layout
)
1034 VP8Macroblock
*mb_edge
[3] = { 0 /* top */,
1037 enum { CNT_ZERO
, CNT_NEAREST
, CNT_NEAR
, CNT_SPLITMV
};
1038 enum { VP8_EDGE_TOP
, VP8_EDGE_LEFT
, VP8_EDGE_TOPLEFT
};
1040 int cur_sign_bias
= s
->sign_bias
[mb
->ref_frame
];
1041 int8_t *sign_bias
= s
->sign_bias
;
1043 uint8_t cnt
[4] = { 0 };
1044 VP56RangeCoder
*c
= &s
->c
;
1046 if (!layout
) { // layout is inlined (s->mb_layout is not)
1047 mb_edge
[0] = mb
+ 2;
1048 mb_edge
[2] = mb
+ 1;
1050 mb_edge
[0] = mb
- s
->mb_width
- 1;
1051 mb_edge
[2] = mb
- s
->mb_width
- 2;
1054 AV_ZERO32(&near_mv
[0]);
1055 AV_ZERO32(&near_mv
[1]);
1056 AV_ZERO32(&near_mv
[2]);
1058 /* Process MB on top, left and top-left */
1059 #define MV_EDGE_CHECK(n) \
1061 VP8Macroblock *edge = mb_edge[n]; \
1062 int edge_ref = edge->ref_frame; \
1063 if (edge_ref != VP56_FRAME_CURRENT) { \
1064 uint32_t mv = AV_RN32A(&edge->mv); \
1066 if (cur_sign_bias != sign_bias[edge_ref]) { \
1067 /* SWAR negate of the values in mv. */ \
1069 mv = ((mv & 0x7fff7fff) + \
1070 0x00010001) ^ (mv & 0x80008000); \
1072 if (!n || mv != AV_RN32A(&near_mv[idx])) \
1073 AV_WN32A(&near_mv[++idx], mv); \
1074 cnt[idx] += 1 + (n != 2); \
1076 cnt[CNT_ZERO] += 1 + (n != 2); \
1084 mb
->partitioning
= VP8_SPLITMVMODE_NONE
;
1085 if (vp56_rac_get_prob_branchy(c
, vp8_mode_contexts
[cnt
[CNT_ZERO
]][0])) {
1086 mb
->mode
= VP8_MVMODE_MV
;
1088 /* If we have three distinct MVs, merge first and last if they're the same */
1089 if (cnt
[CNT_SPLITMV
] &&
1090 AV_RN32A(&near_mv
[1 + VP8_EDGE_TOP
]) == AV_RN32A(&near_mv
[1 + VP8_EDGE_TOPLEFT
]))
1091 cnt
[CNT_NEAREST
] += 1;
1093 /* Swap near and nearest if necessary */
1094 if (cnt
[CNT_NEAR
] > cnt
[CNT_NEAREST
]) {
1095 FFSWAP(uint8_t, cnt
[CNT_NEAREST
], cnt
[CNT_NEAR
]);
1096 FFSWAP( VP56mv
, near_mv
[CNT_NEAREST
], near_mv
[CNT_NEAR
]);
1099 if (vp56_rac_get_prob_branchy(c
, vp8_mode_contexts
[cnt
[CNT_NEAREST
]][1])) {
1100 if (vp56_rac_get_prob_branchy(c
, vp8_mode_contexts
[cnt
[CNT_NEAR
]][2])) {
1101 /* Choose the best mv out of 0,0 and the nearest mv */
1102 clamp_mv(s
, &mb
->mv
, &near_mv
[CNT_ZERO
+ (cnt
[CNT_NEAREST
] >= cnt
[CNT_ZERO
])]);
1103 cnt
[CNT_SPLITMV
] = ((mb_edge
[VP8_EDGE_LEFT
]->mode
== VP8_MVMODE_SPLIT
) +
1104 (mb_edge
[VP8_EDGE_TOP
]->mode
== VP8_MVMODE_SPLIT
)) * 2 +
1105 (mb_edge
[VP8_EDGE_TOPLEFT
]->mode
== VP8_MVMODE_SPLIT
);
1107 if (vp56_rac_get_prob_branchy(c
, vp8_mode_contexts
[cnt
[CNT_SPLITMV
]][3])) {
1108 mb
->mode
= VP8_MVMODE_SPLIT
;
1109 mb
->mv
= mb
->bmv
[decode_splitmvs(s
, c
, mb
, layout
, IS_VP8
) - 1];
1111 mb
->mv
.y
+= read_mv_component(c
, s
->prob
->mvc
[0], IS_VP8
);
1112 mb
->mv
.x
+= read_mv_component(c
, s
->prob
->mvc
[1], IS_VP8
);
1113 mb
->bmv
[0] = mb
->mv
;
1116 clamp_mv(s
, &mb
->mv
, &near_mv
[CNT_NEAR
]);
1117 mb
->bmv
[0] = mb
->mv
;
1120 clamp_mv(s
, &mb
->mv
, &near_mv
[CNT_NEAREST
]);
1121 mb
->bmv
[0] = mb
->mv
;
1124 mb
->mode
= VP8_MVMODE_ZERO
;
1126 mb
->bmv
[0] = mb
->mv
;
1130 static av_always_inline
1131 void decode_intra4x4_modes(VP8Context
*s
, VP56RangeCoder
*c
, VP8Macroblock
*mb
,
1132 int mb_x
, int keyframe
, int layout
)
1134 uint8_t *intra4x4
= mb
->intra4x4_pred_mode_mb
;
1137 VP8Macroblock
*mb_top
= mb
- s
->mb_width
- 1;
1138 memcpy(mb
->intra4x4_pred_mode_top
, mb_top
->intra4x4_pred_mode_top
, 4);
1143 uint8_t *const left
= s
->intra4x4_pred_mode_left
;
1145 top
= mb
->intra4x4_pred_mode_top
;
1147 top
= s
->intra4x4_pred_mode_top
+ 4 * mb_x
;
1148 for (y
= 0; y
< 4; y
++) {
1149 for (x
= 0; x
< 4; x
++) {
1151 ctx
= vp8_pred4x4_prob_intra
[top
[x
]][left
[y
]];
1152 *intra4x4
= vp8_rac_get_tree(c
, vp8_pred4x4_tree
, ctx
);
1153 left
[y
] = top
[x
] = *intra4x4
;
1159 for (i
= 0; i
< 16; i
++)
1160 intra4x4
[i
] = vp8_rac_get_tree(c
, vp8_pred4x4_tree
,
1161 vp8_pred4x4_prob_inter
);
1165 static av_always_inline
1166 void decode_mb_mode(VP8Context
*s
, VP8Macroblock
*mb
, int mb_x
, int mb_y
,
1167 uint8_t *segment
, uint8_t *ref
, int layout
, int is_vp7
)
1169 VP56RangeCoder
*c
= &s
->c
;
1170 static const char * const vp7_feature_name
[] = { "q-index",
1172 "partial-golden-update",
1177 for (i
= 0; i
< 4; i
++) {
1178 if (s
->feature_enabled
[i
]) {
1179 if (vp56_rac_get_prob(c
, s
->feature_present_prob
[i
])) {
1180 int index
= vp8_rac_get_tree(c
, vp7_feature_index_tree
,
1181 s
->feature_index_prob
[i
]);
1182 av_log(s
->avctx
, AV_LOG_WARNING
,
1183 "Feature %s present in macroblock (value 0x%x)\n",
1184 vp7_feature_name
[i
], s
->feature_value
[i
][index
]);
1188 } else if (s
->segmentation
.update_map
)
1189 *segment
= vp8_rac_get_tree(c
, vp8_segmentid_tree
, s
->prob
->segmentid
);
1190 else if (s
->segmentation
.enabled
)
1191 *segment
= ref
? *ref
: *segment
;
1192 mb
->segment
= *segment
;
1194 mb
->skip
= s
->mbskip_enabled
? vp56_rac_get_prob(c
, s
->prob
->mbskip
) : 0;
1197 mb
->mode
= vp8_rac_get_tree(c
, vp8_pred16x16_tree_intra
,
1198 vp8_pred16x16_prob_intra
);
1200 if (mb
->mode
== MODE_I4x4
) {
1201 decode_intra4x4_modes(s
, c
, mb
, mb_x
, 1, layout
);
1203 const uint32_t modes
= (is_vp7
? vp7_pred4x4_mode
1204 : vp8_pred4x4_mode
)[mb
->mode
] * 0x01010101u
;
1205 if (s
->mb_layout
== 1)
1206 AV_WN32A(mb
->intra4x4_pred_mode_top
, modes
);
1208 AV_WN32A(s
->intra4x4_pred_mode_top
+ 4 * mb_x
, modes
);
1209 AV_WN32A(s
->intra4x4_pred_mode_left
, modes
);
1212 mb
->chroma_pred_mode
= vp8_rac_get_tree(c
, vp8_pred8x8c_tree
,
1213 vp8_pred8x8c_prob_intra
);
1214 mb
->ref_frame
= VP56_FRAME_CURRENT
;
1215 } else if (vp56_rac_get_prob_branchy(c
, s
->prob
->intra
)) {
1217 if (vp56_rac_get_prob_branchy(c
, s
->prob
->last
))
1219 (!is_vp7
&& vp56_rac_get_prob(c
, s
->prob
->golden
)) ? VP56_FRAME_GOLDEN2
/* altref */
1220 : VP56_FRAME_GOLDEN
;
1222 mb
->ref_frame
= VP56_FRAME_PREVIOUS
;
1223 s
->ref_count
[mb
->ref_frame
- 1]++;
1225 // motion vectors, 16.3
1227 vp7_decode_mvs(s
, mb
, mb_x
, mb_y
, layout
);
1229 vp8_decode_mvs(s
, mb
, mb_x
, mb_y
, layout
);
1232 mb
->mode
= vp8_rac_get_tree(c
, vp8_pred16x16_tree_inter
, s
->prob
->pred16x16
);
1234 if (mb
->mode
== MODE_I4x4
)
1235 decode_intra4x4_modes(s
, c
, mb
, mb_x
, 0, layout
);
1237 mb
->chroma_pred_mode
= vp8_rac_get_tree(c
, vp8_pred8x8c_tree
,
1239 mb
->ref_frame
= VP56_FRAME_CURRENT
;
1240 mb
->partitioning
= VP8_SPLITMVMODE_NONE
;
1241 AV_ZERO32(&mb
->bmv
[0]);
1246 * @param r arithmetic bitstream reader context
1247 * @param block destination for block coefficients
1248 * @param probs probabilities to use when reading trees from the bitstream
1249 * @param i initial coeff index, 0 unless a separate DC block is coded
1250 * @param qmul array holding the dc/ac dequant factor at position 0/1
1252 * @return 0 if no coeffs were decoded
1253 * otherwise, the index of the last coeff decoded plus one
1255 static av_always_inline
1256 int decode_block_coeffs_internal(VP56RangeCoder
*r
, int16_t block
[16],
1257 uint8_t probs
[16][3][NUM_DCT_TOKENS
- 1],
1258 int i
, uint8_t *token_prob
, int16_t qmul
[2],
1259 const uint8_t scan
[16], int vp7
)
1261 VP56RangeCoder c
= *r
;
1266 if (!vp56_rac_get_prob_branchy(&c
, token_prob
[0])) // DCT_EOB
1270 if (!vp56_rac_get_prob_branchy(&c
, token_prob
[1])) { // DCT_0
1272 break; // invalid input; blocks should end with EOB
1273 token_prob
= probs
[i
][0];
1279 if (!vp56_rac_get_prob_branchy(&c
, token_prob
[2])) { // DCT_1
1281 token_prob
= probs
[i
+ 1][1];
1283 if (!vp56_rac_get_prob_branchy(&c
, token_prob
[3])) { // DCT 2,3,4
1284 coeff
= vp56_rac_get_prob_branchy(&c
, token_prob
[4]);
1286 coeff
+= vp56_rac_get_prob(&c
, token_prob
[5]);
1290 if (!vp56_rac_get_prob_branchy(&c
, token_prob
[6])) {
1291 if (!vp56_rac_get_prob_branchy(&c
, token_prob
[7])) { // DCT_CAT1
1292 coeff
= 5 + vp56_rac_get_prob(&c
, vp8_dct_cat1_prob
[0]);
1293 } else { // DCT_CAT2
1295 coeff
+= vp56_rac_get_prob(&c
, vp8_dct_cat2_prob
[0]) << 1;
1296 coeff
+= vp56_rac_get_prob(&c
, vp8_dct_cat2_prob
[1]);
1298 } else { // DCT_CAT3 and up
1299 int a
= vp56_rac_get_prob(&c
, token_prob
[8]);
1300 int b
= vp56_rac_get_prob(&c
, token_prob
[9 + a
]);
1301 int cat
= (a
<< 1) + b
;
1302 coeff
= 3 + (8 << cat
);
1303 coeff
+= vp8_rac_get_coeff(&c
, ff_vp8_dct_cat_prob
[cat
]);
1306 token_prob
= probs
[i
+ 1][2];
1308 block
[scan
[i
]] = (vp8_rac_get(&c
) ? -coeff
: coeff
) * qmul
[!!i
];
1315 static av_always_inline
1316 int inter_predict_dc(int16_t block
[16], int16_t pred
[2])
1318 int16_t dc
= block
[0];
1326 if (!pred
[0] | !dc
| ((int32_t)pred
[0] ^ (int32_t)dc
) >> 31) {
1327 block
[0] = pred
[0] = dc
;
1332 block
[0] = pred
[0] = dc
;
1338 static int vp7_decode_block_coeffs_internal(VP56RangeCoder
*r
,
1340 uint8_t probs
[16][3][NUM_DCT_TOKENS
- 1],
1341 int i
, uint8_t *token_prob
,
1343 const uint8_t scan
[16])
1345 return decode_block_coeffs_internal(r
, block
, probs
, i
,
1346 token_prob
, qmul
, scan
, IS_VP7
);
1349 #ifndef vp8_decode_block_coeffs_internal
1350 static int vp8_decode_block_coeffs_internal(VP56RangeCoder
*r
,
1352 uint8_t probs
[16][3][NUM_DCT_TOKENS
- 1],
1353 int i
, uint8_t *token_prob
,
1356 return decode_block_coeffs_internal(r
, block
, probs
, i
,
1357 token_prob
, qmul
, ff_zigzag_scan
, IS_VP8
);
1362 * @param c arithmetic bitstream reader context
1363 * @param block destination for block coefficients
1364 * @param probs probabilities to use when reading trees from the bitstream
1365 * @param i initial coeff index, 0 unless a separate DC block is coded
1366 * @param zero_nhood the initial prediction context for number of surrounding
1367 * all-zero blocks (only left/top, so 0-2)
1368 * @param qmul array holding the dc/ac dequant factor at position 0/1
1370 * @return 0 if no coeffs were decoded
1371 * otherwise, the index of the last coeff decoded plus one
1373 static av_always_inline
1374 int decode_block_coeffs(VP56RangeCoder
*c
, int16_t block
[16],
1375 uint8_t probs
[16][3][NUM_DCT_TOKENS
- 1],
1376 int i
, int zero_nhood
, int16_t qmul
[2],
1377 const uint8_t scan
[16], int vp7
)
1379 uint8_t *token_prob
= probs
[i
][zero_nhood
];
1380 if (!vp56_rac_get_prob_branchy(c
, token_prob
[0])) // DCT_EOB
1382 return vp7
? vp7_decode_block_coeffs_internal(c
, block
, probs
, i
,
1383 token_prob
, qmul
, scan
)
1384 : vp8_decode_block_coeffs_internal(c
, block
, probs
, i
,
1388 static av_always_inline
1389 void decode_mb_coeffs(VP8Context
*s
, VP8ThreadData
*td
, VP56RangeCoder
*c
,
1390 VP8Macroblock
*mb
, uint8_t t_nnz
[9], uint8_t l_nnz
[9],
1393 int i
, x
, y
, luma_start
= 0, luma_ctx
= 3;
1394 int nnz_pred
, nnz
, nnz_total
= 0;
1395 int segment
= mb
->segment
;
1398 if (mb
->mode
!= MODE_I4x4
&& (is_vp7
|| mb
->mode
!= VP8_MVMODE_SPLIT
)) {
1399 nnz_pred
= t_nnz
[8] + l_nnz
[8];
1401 // decode DC values and do hadamard
1402 nnz
= decode_block_coeffs(c
, td
->block_dc
, s
->prob
->token
[1], 0,
1403 nnz_pred
, s
->qmat
[segment
].luma_dc_qmul
,
1404 ff_zigzag_scan
, is_vp7
);
1405 l_nnz
[8] = t_nnz
[8] = !!nnz
;
1407 if (is_vp7
&& mb
->mode
> MODE_I4x4
) {
1408 nnz
|= inter_predict_dc(td
->block_dc
,
1409 s
->inter_dc_pred
[mb
->ref_frame
- 1]);
1416 s
->vp8dsp
.vp8_luma_dc_wht_dc(td
->block
, td
->block_dc
);
1418 s
->vp8dsp
.vp8_luma_dc_wht(td
->block
, td
->block_dc
);
1425 for (y
= 0; y
< 4; y
++)
1426 for (x
= 0; x
< 4; x
++) {
1427 nnz_pred
= l_nnz
[y
] + t_nnz
[x
];
1428 nnz
= decode_block_coeffs(c
, td
->block
[y
][x
],
1429 s
->prob
->token
[luma_ctx
],
1430 luma_start
, nnz_pred
,
1431 s
->qmat
[segment
].luma_qmul
,
1432 s
->prob
[0].scan
, is_vp7
);
1433 /* nnz+block_dc may be one more than the actual last index,
1434 * but we don't care */
1435 td
->non_zero_count_cache
[y
][x
] = nnz
+ block_dc
;
1436 t_nnz
[x
] = l_nnz
[y
] = !!nnz
;
1441 // TODO: what to do about dimensions? 2nd dim for luma is x,
1442 // but for chroma it's (y<<1)|x
1443 for (i
= 4; i
< 6; i
++)
1444 for (y
= 0; y
< 2; y
++)
1445 for (x
= 0; x
< 2; x
++) {
1446 nnz_pred
= l_nnz
[i
+ 2 * y
] + t_nnz
[i
+ 2 * x
];
1447 nnz
= decode_block_coeffs(c
, td
->block
[i
][(y
<< 1) + x
],
1448 s
->prob
->token
[2], 0, nnz_pred
,
1449 s
->qmat
[segment
].chroma_qmul
,
1450 s
->prob
[0].scan
, is_vp7
);
1451 td
->non_zero_count_cache
[i
][(y
<< 1) + x
] = nnz
;
1452 t_nnz
[i
+ 2 * x
] = l_nnz
[i
+ 2 * y
] = !!nnz
;
1456 // if there were no coded coeffs despite the macroblock not being marked skip,
1457 // we MUST not do the inner loop filter and should not do IDCT
1458 // Since skip isn't used for bitstream prediction, just manually set it.
1463 static av_always_inline
1464 void backup_mb_border(uint8_t *top_border
, uint8_t *src_y
,
1465 uint8_t *src_cb
, uint8_t *src_cr
,
1466 ptrdiff_t linesize
, ptrdiff_t uvlinesize
, int simple
)
1468 AV_COPY128(top_border
, src_y
+ 15 * linesize
);
1470 AV_COPY64(top_border
+ 16, src_cb
+ 7 * uvlinesize
);
1471 AV_COPY64(top_border
+ 24, src_cr
+ 7 * uvlinesize
);
1475 static av_always_inline
1476 void xchg_mb_border(uint8_t *top_border
, uint8_t *src_y
, uint8_t *src_cb
,
1477 uint8_t *src_cr
, ptrdiff_t linesize
, ptrdiff_t uvlinesize
, int mb_x
,
1478 int mb_y
, int mb_width
, int simple
, int xchg
)
1480 uint8_t *top_border_m1
= top_border
- 32; // for TL prediction
1482 src_cb
-= uvlinesize
;
1483 src_cr
-= uvlinesize
;
1485 #define XCHG(a, b, xchg) \
1493 XCHG(top_border_m1
+ 8, src_y
- 8, xchg
);
1494 XCHG(top_border
, src_y
, xchg
);
1495 XCHG(top_border
+ 8, src_y
+ 8, 1);
1496 if (mb_x
< mb_width
- 1)
1497 XCHG(top_border
+ 32, src_y
+ 16, 1);
1499 // only copy chroma for normal loop filter
1500 // or to initialize the top row to 127
1501 if (!simple
|| !mb_y
) {
1502 XCHG(top_border_m1
+ 16, src_cb
- 8, xchg
);
1503 XCHG(top_border_m1
+ 24, src_cr
- 8, xchg
);
1504 XCHG(top_border
+ 16, src_cb
, 1);
1505 XCHG(top_border
+ 24, src_cr
, 1);
1509 static av_always_inline
1510 int check_dc_pred8x8_mode(int mode
, int mb_x
, int mb_y
)
1513 return mb_y
? TOP_DC_PRED8x8
: DC_128_PRED8x8
;
1515 return mb_y
? mode
: LEFT_DC_PRED8x8
;
1518 static av_always_inline
1519 int check_tm_pred8x8_mode(int mode
, int mb_x
, int mb_y
, int vp7
)
1522 return mb_y
? VERT_PRED8x8
: (vp7
? DC_128_PRED8x8
: DC_129_PRED8x8
);
1524 return mb_y
? mode
: HOR_PRED8x8
;
1527 static av_always_inline
1528 int check_intra_pred8x8_mode_emuedge(int mode
, int mb_x
, int mb_y
, int vp7
)
1532 return check_dc_pred8x8_mode(mode
, mb_x
, mb_y
);
1534 return !mb_y
? (vp7
? DC_128_PRED8x8
: DC_127_PRED8x8
) : mode
;
1536 return !mb_x
? (vp7
? DC_128_PRED8x8
: DC_129_PRED8x8
) : mode
;
1537 case PLANE_PRED8x8
: /* TM */
1538 return check_tm_pred8x8_mode(mode
, mb_x
, mb_y
, vp7
);
1543 static av_always_inline
1544 int check_tm_pred4x4_mode(int mode
, int mb_x
, int mb_y
, int vp7
)
1547 return mb_y
? VERT_VP8_PRED
: (vp7
? DC_128_PRED
: DC_129_PRED
);
1549 return mb_y
? mode
: HOR_VP8_PRED
;
1553 static av_always_inline
1554 int check_intra_pred4x4_mode_emuedge(int mode
, int mb_x
, int mb_y
,
1555 int *copy_buf
, int vp7
)
1559 if (!mb_x
&& mb_y
) {
1564 case DIAG_DOWN_LEFT_PRED
:
1565 case VERT_LEFT_PRED
:
1566 return !mb_y
? (vp7
? DC_128_PRED
: DC_127_PRED
) : mode
;
1574 return !mb_x
? (vp7
? DC_128_PRED
: DC_129_PRED
) : mode
;
1576 return check_tm_pred4x4_mode(mode
, mb_x
, mb_y
, vp7
);
1577 case DC_PRED
: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1578 * as 16x16/8x8 DC */
1579 case DIAG_DOWN_RIGHT_PRED
:
1580 case VERT_RIGHT_PRED
:
1589 static av_always_inline
1590 void intra_predict(VP8Context
*s
, VP8ThreadData
*td
, uint8_t *dst
[3],
1591 VP8Macroblock
*mb
, int mb_x
, int mb_y
, int is_vp7
)
1593 int x
, y
, mode
, nnz
;
1596 /* for the first row, we need to run xchg_mb_border to init the top edge
1597 * to 127 otherwise, skip it if we aren't going to deblock */
1598 if (mb_y
&& (s
->deblock_filter
|| !mb_y
) && td
->thread_nr
== 0)
1599 xchg_mb_border(s
->top_border
[mb_x
+ 1], dst
[0], dst
[1], dst
[2],
1600 s
->linesize
, s
->uvlinesize
, mb_x
, mb_y
, s
->mb_width
,
1601 s
->filter
.simple
, 1);
1603 if (mb
->mode
< MODE_I4x4
) {
1604 mode
= check_intra_pred8x8_mode_emuedge(mb
->mode
, mb_x
, mb_y
, is_vp7
);
1605 s
->hpc
.pred16x16
[mode
](dst
[0], s
->linesize
);
1607 uint8_t *ptr
= dst
[0];
1608 uint8_t *intra4x4
= mb
->intra4x4_pred_mode_mb
;
1609 const uint8_t lo
= is_vp7
? 128 : 127;
1610 const uint8_t hi
= is_vp7
? 128 : 129;
1611 uint8_t tr_top
[4] = { lo
, lo
, lo
, lo
};
1613 // all blocks on the right edge of the macroblock use bottom edge
1614 // the top macroblock for their topright edge
1615 uint8_t *tr_right
= ptr
- s
->linesize
+ 16;
1617 // if we're on the right edge of the frame, said edge is extended
1618 // from the top macroblock
1619 if (mb_y
&& mb_x
== s
->mb_width
- 1) {
1620 tr
= tr_right
[-1] * 0x01010101u
;
1621 tr_right
= (uint8_t *) &tr
;
1625 AV_ZERO128(td
->non_zero_count_cache
);
1627 for (y
= 0; y
< 4; y
++) {
1628 uint8_t *topright
= ptr
+ 4 - s
->linesize
;
1629 for (x
= 0; x
< 4; x
++) {
1631 ptrdiff_t linesize
= s
->linesize
;
1632 uint8_t *dst
= ptr
+ 4 * x
;
1633 DECLARE_ALIGNED(4, uint8_t, copy_dst
)[5 * 8];
1635 if ((y
== 0 || x
== 3) && mb_y
== 0) {
1638 topright
= tr_right
;
1640 mode
= check_intra_pred4x4_mode_emuedge(intra4x4
[x
], mb_x
+ x
,
1641 mb_y
+ y
, ©
, is_vp7
);
1643 dst
= copy_dst
+ 12;
1647 AV_WN32A(copy_dst
+ 4, lo
* 0x01010101U
);
1649 AV_COPY32(copy_dst
+ 4, ptr
+ 4 * x
- s
->linesize
);
1653 copy_dst
[3] = ptr
[4 * x
- s
->linesize
- 1];
1662 copy_dst
[11] = ptr
[4 * x
- 1];
1663 copy_dst
[19] = ptr
[4 * x
+ s
->linesize
- 1];
1664 copy_dst
[27] = ptr
[4 * x
+ s
->linesize
* 2 - 1];
1665 copy_dst
[35] = ptr
[4 * x
+ s
->linesize
* 3 - 1];
1668 s
->hpc
.pred4x4
[mode
](dst
, topright
, linesize
);
1670 AV_COPY32(ptr
+ 4 * x
, copy_dst
+ 12);
1671 AV_COPY32(ptr
+ 4 * x
+ s
->linesize
, copy_dst
+ 20);
1672 AV_COPY32(ptr
+ 4 * x
+ s
->linesize
* 2, copy_dst
+ 28);
1673 AV_COPY32(ptr
+ 4 * x
+ s
->linesize
* 3, copy_dst
+ 36);
1676 nnz
= td
->non_zero_count_cache
[y
][x
];
1679 s
->vp8dsp
.vp8_idct_dc_add(ptr
+ 4 * x
,
1680 td
->block
[y
][x
], s
->linesize
);
1682 s
->vp8dsp
.vp8_idct_add(ptr
+ 4 * x
,
1683 td
->block
[y
][x
], s
->linesize
);
1688 ptr
+= 4 * s
->linesize
;
1693 mode
= check_intra_pred8x8_mode_emuedge(mb
->chroma_pred_mode
,
1694 mb_x
, mb_y
, is_vp7
);
1695 s
->hpc
.pred8x8
[mode
](dst
[1], s
->uvlinesize
);
1696 s
->hpc
.pred8x8
[mode
](dst
[2], s
->uvlinesize
);
1698 if (mb_y
&& (s
->deblock_filter
|| !mb_y
) && td
->thread_nr
== 0)
1699 xchg_mb_border(s
->top_border
[mb_x
+ 1], dst
[0], dst
[1], dst
[2],
1700 s
->linesize
, s
->uvlinesize
, mb_x
, mb_y
, s
->mb_width
,
1701 s
->filter
.simple
, 0);
static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
                                // also function pointer index
    { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
    { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
};
1714 * @param s VP8 decoding context
1715 * @param dst target buffer for block data at block position
1716 * @param ref reference picture buffer at origin (0, 0)
1717 * @param mv motion vector (relative to block position) to get pixel data from
1718 * @param x_off horizontal position of block from origin (0, 0)
1719 * @param y_off vertical position of block from origin (0, 0)
1720 * @param block_w width of block (16, 8 or 4)
1721 * @param block_h height of block (always same as block_w)
1722 * @param width width of src/dst plane data
1723 * @param height height of src/dst plane data
1724 * @param linesize size of a single line of plane data, including padding
1725 * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1727 static av_always_inline
1728 void vp8_mc_luma(VP8Context
*s
, VP8ThreadData
*td
, uint8_t *dst
,
1729 ThreadFrame
*ref
, const VP56mv
*mv
,
1730 int x_off
, int y_off
, int block_w
, int block_h
,
1731 int width
, int height
, ptrdiff_t linesize
,
1732 vp8_mc_func mc_func
[3][3])
1734 uint8_t *src
= ref
->f
->data
[0];
1737 ptrdiff_t src_linesize
= linesize
;
1739 int mx
= (mv
->x
<< 1) & 7, mx_idx
= subpel_idx
[0][mx
];
1740 int my
= (mv
->y
<< 1) & 7, my_idx
= subpel_idx
[0][my
];
1742 x_off
+= mv
->x
>> 2;
1743 y_off
+= mv
->y
>> 2;
1746 ff_thread_await_progress(ref
, (3 + y_off
+ block_h
+ subpel_idx
[2][my
]) >> 4, 0);
1747 src
+= y_off
* linesize
+ x_off
;
1748 if (x_off
< mx_idx
|| x_off
>= width
- block_w
- subpel_idx
[2][mx
] ||
1749 y_off
< my_idx
|| y_off
>= height
- block_h
- subpel_idx
[2][my
]) {
1750 s
->vdsp
.emulated_edge_mc(td
->edge_emu_buffer
,
1751 src
- my_idx
* linesize
- mx_idx
,
1752 EDGE_EMU_LINESIZE
, linesize
,
1753 block_w
+ subpel_idx
[1][mx
],
1754 block_h
+ subpel_idx
[1][my
],
1755 x_off
- mx_idx
, y_off
- my_idx
,
1757 src
= td
->edge_emu_buffer
+ mx_idx
+ EDGE_EMU_LINESIZE
* my_idx
;
1758 src_linesize
= EDGE_EMU_LINESIZE
;
1760 mc_func
[my_idx
][mx_idx
](dst
, linesize
, src
, src_linesize
, block_h
, mx
, my
);
1762 ff_thread_await_progress(ref
, (3 + y_off
+ block_h
) >> 4, 0);
1763 mc_func
[0][0](dst
, linesize
, src
+ y_off
* linesize
+ x_off
,
1764 linesize
, block_h
, 0, 0);
1769 * chroma MC function
1771 * @param s VP8 decoding context
1772 * @param dst1 target buffer for block data at block position (U plane)
1773 * @param dst2 target buffer for block data at block position (V plane)
1774 * @param ref reference picture buffer at origin (0, 0)
1775 * @param mv motion vector (relative to block position) to get pixel data from
1776 * @param x_off horizontal position of block from origin (0, 0)
1777 * @param y_off vertical position of block from origin (0, 0)
1778 * @param block_w width of block (16, 8 or 4)
1779 * @param block_h height of block (always same as block_w)
1780 * @param width width of src/dst plane data
1781 * @param height height of src/dst plane data
1782 * @param linesize size of a single line of plane data, including padding
1783 * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1785 static av_always_inline
1786 void vp8_mc_chroma(VP8Context
*s
, VP8ThreadData
*td
, uint8_t *dst1
,
1787 uint8_t *dst2
, ThreadFrame
*ref
, const VP56mv
*mv
,
1788 int x_off
, int y_off
, int block_w
, int block_h
,
1789 int width
, int height
, ptrdiff_t linesize
,
1790 vp8_mc_func mc_func
[3][3])
1792 uint8_t *src1
= ref
->f
->data
[1], *src2
= ref
->f
->data
[2];
1795 int mx
= mv
->x
& 7, mx_idx
= subpel_idx
[0][mx
];
1796 int my
= mv
->y
& 7, my_idx
= subpel_idx
[0][my
];
1798 x_off
+= mv
->x
>> 3;
1799 y_off
+= mv
->y
>> 3;
1802 src1
+= y_off
* linesize
+ x_off
;
1803 src2
+= y_off
* linesize
+ x_off
;
1804 ff_thread_await_progress(ref
, (3 + y_off
+ block_h
+ subpel_idx
[2][my
]) >> 3, 0);
1805 if (x_off
< mx_idx
|| x_off
>= width
- block_w
- subpel_idx
[2][mx
] ||
1806 y_off
< my_idx
|| y_off
>= height
- block_h
- subpel_idx
[2][my
]) {
1807 s
->vdsp
.emulated_edge_mc(td
->edge_emu_buffer
,
1808 src1
- my_idx
* linesize
- mx_idx
,
1809 EDGE_EMU_LINESIZE
, linesize
,
1810 block_w
+ subpel_idx
[1][mx
], block_h
+ subpel_idx
[1][my
],
1811 x_off
- mx_idx
, y_off
- my_idx
, width
, height
);
1812 src1
= td
->edge_emu_buffer
+ mx_idx
+ EDGE_EMU_LINESIZE
* my_idx
;
1813 mc_func
[my_idx
][mx_idx
](dst1
, linesize
, src1
, EDGE_EMU_LINESIZE
, block_h
, mx
, my
);
1815 s
->vdsp
.emulated_edge_mc(td
->edge_emu_buffer
,
1816 src2
- my_idx
* linesize
- mx_idx
,
1817 EDGE_EMU_LINESIZE
, linesize
,
1818 block_w
+ subpel_idx
[1][mx
], block_h
+ subpel_idx
[1][my
],
1819 x_off
- mx_idx
, y_off
- my_idx
, width
, height
);
1820 src2
= td
->edge_emu_buffer
+ mx_idx
+ EDGE_EMU_LINESIZE
* my_idx
;
1821 mc_func
[my_idx
][mx_idx
](dst2
, linesize
, src2
, EDGE_EMU_LINESIZE
, block_h
, mx
, my
);
1823 mc_func
[my_idx
][mx_idx
](dst1
, linesize
, src1
, linesize
, block_h
, mx
, my
);
1824 mc_func
[my_idx
][mx_idx
](dst2
, linesize
, src2
, linesize
, block_h
, mx
, my
);
1827 ff_thread_await_progress(ref
, (3 + y_off
+ block_h
) >> 3, 0);
1828 mc_func
[0][0](dst1
, linesize
, src1
+ y_off
* linesize
+ x_off
, linesize
, block_h
, 0, 0);
1829 mc_func
[0][0](dst2
, linesize
, src2
+ y_off
* linesize
+ x_off
, linesize
, block_h
, 0, 0);
1833 static av_always_inline
1834 void vp8_mc_part(VP8Context
*s
, VP8ThreadData
*td
, uint8_t *dst
[3],
1835 ThreadFrame
*ref_frame
, int x_off
, int y_off
,
1836 int bx_off
, int by_off
, int block_w
, int block_h
,
1837 int width
, int height
, VP56mv
*mv
)
1842 vp8_mc_luma(s
, td
, dst
[0] + by_off
* s
->linesize
+ bx_off
,
1843 ref_frame
, mv
, x_off
+ bx_off
, y_off
+ by_off
,
1844 block_w
, block_h
, width
, height
, s
->linesize
,
1845 s
->put_pixels_tab
[block_w
== 8]);
1848 if (s
->profile
== 3) {
1849 /* this block only applies VP8; it is safe to check
1850 * only the profile, as VP7 profile <= 1 */
1862 vp8_mc_chroma(s
, td
, dst
[1] + by_off
* s
->uvlinesize
+ bx_off
,
1863 dst
[2] + by_off
* s
->uvlinesize
+ bx_off
, ref_frame
,
1864 &uvmv
, x_off
+ bx_off
, y_off
+ by_off
,
1865 block_w
, block_h
, width
, height
, s
->uvlinesize
,
1866 s
->put_pixels_tab
[1 + (block_w
== 4)]);
1869 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1870 * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1871 static av_always_inline
1872 void prefetch_motion(VP8Context
*s
, VP8Macroblock
*mb
, int mb_x
, int mb_y
,
1875 /* Don't prefetch refs that haven't been used very often this frame. */
1876 if (s
->ref_count
[ref
- 1] > (mb_xy
>> 5)) {
1877 int x_off
= mb_x
<< 4, y_off
= mb_y
<< 4;
1878 int mx
= (mb
->mv
.x
>> 2) + x_off
+ 8;
1879 int my
= (mb
->mv
.y
>> 2) + y_off
;
1880 uint8_t **src
= s
->framep
[ref
]->tf
.f
->data
;
1881 int off
= mx
+ (my
+ (mb_x
& 3) * 4) * s
->linesize
+ 64;
1882 /* For threading, a ff_thread_await_progress here might be useful, but
1883 * it actually slows down the decoder. Since a bad prefetch doesn't
1884 * generate bad decoder output, we don't run it here. */
1885 s
->vdsp
.prefetch(src
[0] + off
, s
->linesize
, 4);
1886 off
= (mx
>> 1) + ((my
>> 1) + (mb_x
& 7)) * s
->uvlinesize
+ 64;
1887 s
->vdsp
.prefetch(src
[1] + off
, src
[2] - src
[1], 2);
1892 * Apply motion vectors to prediction buffer, chapter 18.
1894 static av_always_inline
1895 void inter_predict(VP8Context
*s
, VP8ThreadData
*td
, uint8_t *dst
[3],
1896 VP8Macroblock
*mb
, int mb_x
, int mb_y
)
1898 int x_off
= mb_x
<< 4, y_off
= mb_y
<< 4;
1899 int width
= 16 * s
->mb_width
, height
= 16 * s
->mb_height
;
1900 ThreadFrame
*ref
= &s
->framep
[mb
->ref_frame
]->tf
;
1901 VP56mv
*bmv
= mb
->bmv
;
1903 switch (mb
->partitioning
) {
1904 case VP8_SPLITMVMODE_NONE
:
1905 vp8_mc_part(s
, td
, dst
, ref
, x_off
, y_off
,
1906 0, 0, 16, 16, width
, height
, &mb
->mv
);
1908 case VP8_SPLITMVMODE_4x4
: {
1913 for (y
= 0; y
< 4; y
++) {
1914 for (x
= 0; x
< 4; x
++) {
1915 vp8_mc_luma(s
, td
, dst
[0] + 4 * y
* s
->linesize
+ x
* 4,
1916 ref
, &bmv
[4 * y
+ x
],
1917 4 * x
+ x_off
, 4 * y
+ y_off
, 4, 4,
1918 width
, height
, s
->linesize
,
1919 s
->put_pixels_tab
[2]);
1928 for (y
= 0; y
< 2; y
++) {
1929 for (x
= 0; x
< 2; x
++) {
1930 uvmv
.x
= mb
->bmv
[2 * y
* 4 + 2 * x
].x
+
1931 mb
->bmv
[2 * y
* 4 + 2 * x
+ 1].x
+
1932 mb
->bmv
[(2 * y
+ 1) * 4 + 2 * x
].x
+
1933 mb
->bmv
[(2 * y
+ 1) * 4 + 2 * x
+ 1].x
;
1934 uvmv
.y
= mb
->bmv
[2 * y
* 4 + 2 * x
].y
+
1935 mb
->bmv
[2 * y
* 4 + 2 * x
+ 1].y
+
1936 mb
->bmv
[(2 * y
+ 1) * 4 + 2 * x
].y
+
1937 mb
->bmv
[(2 * y
+ 1) * 4 + 2 * x
+ 1].y
;
1938 uvmv
.x
= (uvmv
.x
+ 2 + FF_SIGNBIT(uvmv
.x
)) >> 2;
1939 uvmv
.y
= (uvmv
.y
+ 2 + FF_SIGNBIT(uvmv
.y
)) >> 2;
1940 if (s
->profile
== 3) {
1944 vp8_mc_chroma(s
, td
, dst
[1] + 4 * y
* s
->uvlinesize
+ x
* 4,
1945 dst
[2] + 4 * y
* s
->uvlinesize
+ x
* 4, ref
,
1946 &uvmv
, 4 * x
+ x_off
, 4 * y
+ y_off
, 4, 4,
1947 width
, height
, s
->uvlinesize
,
1948 s
->put_pixels_tab
[2]);
1953 case VP8_SPLITMVMODE_16x8
:
1954 vp8_mc_part(s
, td
, dst
, ref
, x_off
, y_off
,
1955 0, 0, 16, 8, width
, height
, &bmv
[0]);
1956 vp8_mc_part(s
, td
, dst
, ref
, x_off
, y_off
,
1957 0, 8, 16, 8, width
, height
, &bmv
[1]);
1959 case VP8_SPLITMVMODE_8x16
:
1960 vp8_mc_part(s
, td
, dst
, ref
, x_off
, y_off
,
1961 0, 0, 8, 16, width
, height
, &bmv
[0]);
1962 vp8_mc_part(s
, td
, dst
, ref
, x_off
, y_off
,
1963 8, 0, 8, 16, width
, height
, &bmv
[1]);
1965 case VP8_SPLITMVMODE_8x8
:
1966 vp8_mc_part(s
, td
, dst
, ref
, x_off
, y_off
,
1967 0, 0, 8, 8, width
, height
, &bmv
[0]);
1968 vp8_mc_part(s
, td
, dst
, ref
, x_off
, y_off
,
1969 8, 0, 8, 8, width
, height
, &bmv
[1]);
1970 vp8_mc_part(s
, td
, dst
, ref
, x_off
, y_off
,
1971 0, 8, 8, 8, width
, height
, &bmv
[2]);
1972 vp8_mc_part(s
, td
, dst
, ref
, x_off
, y_off
,
1973 8, 8, 8, 8, width
, height
, &bmv
[3]);
1978 static av_always_inline
1979 void idct_mb(VP8Context
*s
, VP8ThreadData
*td
, uint8_t *dst
[3], VP8Macroblock
*mb
)
1983 if (mb
->mode
!= MODE_I4x4
) {
1984 uint8_t *y_dst
= dst
[0];
1985 for (y
= 0; y
< 4; y
++) {
1986 uint32_t nnz4
= AV_RL32(td
->non_zero_count_cache
[y
]);
1988 if (nnz4
& ~0x01010101) {
1989 for (x
= 0; x
< 4; x
++) {
1990 if ((uint8_t) nnz4
== 1)
1991 s
->vp8dsp
.vp8_idct_dc_add(y_dst
+ 4 * x
,
1994 else if ((uint8_t) nnz4
> 1)
1995 s
->vp8dsp
.vp8_idct_add(y_dst
+ 4 * x
,
2003 s
->vp8dsp
.vp8_idct_dc_add4y(y_dst
, td
->block
[y
], s
->linesize
);
2006 y_dst
+= 4 * s
->linesize
;
2010 for (ch
= 0; ch
< 2; ch
++) {
2011 uint32_t nnz4
= AV_RL32(td
->non_zero_count_cache
[4 + ch
]);
2013 uint8_t *ch_dst
= dst
[1 + ch
];
2014 if (nnz4
& ~0x01010101) {
2015 for (y
= 0; y
< 2; y
++) {
2016 for (x
= 0; x
< 2; x
++) {
2017 if ((uint8_t) nnz4
== 1)
2018 s
->vp8dsp
.vp8_idct_dc_add(ch_dst
+ 4 * x
,
2019 td
->block
[4 + ch
][(y
<< 1) + x
],
2021 else if ((uint8_t) nnz4
> 1)
2022 s
->vp8dsp
.vp8_idct_add(ch_dst
+ 4 * x
,
2023 td
->block
[4 + ch
][(y
<< 1) + x
],
2027 goto chroma_idct_end
;
2029 ch_dst
+= 4 * s
->uvlinesize
;
2032 s
->vp8dsp
.vp8_idct_dc_add4uv(ch_dst
, td
->block
[4 + ch
], s
->uvlinesize
);
2040 static av_always_inline
2041 void filter_level_for_mb(VP8Context
*s
, VP8Macroblock
*mb
,
2042 VP8FilterStrength
*f
, int is_vp7
)
2044 int interior_limit
, filter_level
;
2046 if (s
->segmentation
.enabled
) {
2047 filter_level
= s
->segmentation
.filter_level
[mb
->segment
];
2048 if (!s
->segmentation
.absolute_vals
)
2049 filter_level
+= s
->filter
.level
;
2051 filter_level
= s
->filter
.level
;
2053 if (s
->lf_delta
.enabled
) {
2054 filter_level
+= s
->lf_delta
.ref
[mb
->ref_frame
];
2055 filter_level
+= s
->lf_delta
.mode
[mb
->mode
];
2058 filter_level
= av_clip_uintp2(filter_level
, 6);
2060 interior_limit
= filter_level
;
2061 if (s
->filter
.sharpness
) {
2062 interior_limit
>>= (s
->filter
.sharpness
+ 3) >> 2;
2063 interior_limit
= FFMIN(interior_limit
, 9 - s
->filter
.sharpness
);
2065 interior_limit
= FFMAX(interior_limit
, 1);
2067 f
->filter_level
= filter_level
;
2068 f
->inner_limit
= interior_limit
;
2069 f
->inner_filter
= is_vp7
|| !mb
->skip
|| mb
->mode
== MODE_I4x4
||
2070 mb
->mode
== VP8_MVMODE_SPLIT
;
2073 static av_always_inline
2074 void filter_mb(VP8Context
*s
, uint8_t *dst
[3], VP8FilterStrength
*f
,
2075 int mb_x
, int mb_y
, int is_vp7
)
2077 int mbedge_lim
, bedge_lim_y
, bedge_lim_uv
, hev_thresh
;
2078 int filter_level
= f
->filter_level
;
2079 int inner_limit
= f
->inner_limit
;
2080 int inner_filter
= f
->inner_filter
;
2081 ptrdiff_t linesize
= s
->linesize
;
2082 ptrdiff_t uvlinesize
= s
->uvlinesize
;
2083 static const uint8_t hev_thresh_lut
[2][64] = {
2084 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2085 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2086 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2088 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2089 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2090 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2098 bedge_lim_y
= filter_level
;
2099 bedge_lim_uv
= filter_level
* 2;
2100 mbedge_lim
= filter_level
+ 2;
2103 bedge_lim_uv
= filter_level
* 2 + inner_limit
;
2104 mbedge_lim
= bedge_lim_y
+ 4;
2107 hev_thresh
= hev_thresh_lut
[s
->keyframe
][filter_level
];
2110 s
->vp8dsp
.vp8_h_loop_filter16y(dst
[0], linesize
,
2111 mbedge_lim
, inner_limit
, hev_thresh
);
2112 s
->vp8dsp
.vp8_h_loop_filter8uv(dst
[1], dst
[2], uvlinesize
,
2113 mbedge_lim
, inner_limit
, hev_thresh
);
2116 #define H_LOOP_FILTER_16Y_INNER(cond) \
2117 if (cond && inner_filter) { \
2118 s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 4, linesize, \
2119 bedge_lim_y, inner_limit, \
2121 s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 8, linesize, \
2122 bedge_lim_y, inner_limit, \
2124 s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize, \
2125 bedge_lim_y, inner_limit, \
2127 s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4, \
2128 uvlinesize, bedge_lim_uv, \
2129 inner_limit, hev_thresh); \
2132 H_LOOP_FILTER_16Y_INNER(!is_vp7
)
2135 s
->vp8dsp
.vp8_v_loop_filter16y(dst
[0], linesize
,
2136 mbedge_lim
, inner_limit
, hev_thresh
);
2137 s
->vp8dsp
.vp8_v_loop_filter8uv(dst
[1], dst
[2], uvlinesize
,
2138 mbedge_lim
, inner_limit
, hev_thresh
);
2142 s
->vp8dsp
.vp8_v_loop_filter16y_inner(dst
[0] + 4 * linesize
,
2143 linesize
, bedge_lim_y
,
2144 inner_limit
, hev_thresh
);
2145 s
->vp8dsp
.vp8_v_loop_filter16y_inner(dst
[0] + 8 * linesize
,
2146 linesize
, bedge_lim_y
,
2147 inner_limit
, hev_thresh
);
2148 s
->vp8dsp
.vp8_v_loop_filter16y_inner(dst
[0] + 12 * linesize
,
2149 linesize
, bedge_lim_y
,
2150 inner_limit
, hev_thresh
);
2151 s
->vp8dsp
.vp8_v_loop_filter8uv_inner(dst
[1] + 4 * uvlinesize
,
2152 dst
[2] + 4 * uvlinesize
,
2153 uvlinesize
, bedge_lim_uv
,
2154 inner_limit
, hev_thresh
);
2157 H_LOOP_FILTER_16Y_INNER(is_vp7
)
2160 static av_always_inline
2161 void filter_mb_simple(VP8Context
*s
, uint8_t *dst
, VP8FilterStrength
*f
,
2164 int mbedge_lim
, bedge_lim
;
2165 int filter_level
= f
->filter_level
;
2166 int inner_limit
= f
->inner_limit
;
2167 int inner_filter
= f
->inner_filter
;
2168 ptrdiff_t linesize
= s
->linesize
;
2173 bedge_lim
= 2 * filter_level
+ inner_limit
;
2174 mbedge_lim
= bedge_lim
+ 4;
2177 s
->vp8dsp
.vp8_h_loop_filter_simple(dst
, linesize
, mbedge_lim
);
2179 s
->vp8dsp
.vp8_h_loop_filter_simple(dst
+ 4, linesize
, bedge_lim
);
2180 s
->vp8dsp
.vp8_h_loop_filter_simple(dst
+ 8, linesize
, bedge_lim
);
2181 s
->vp8dsp
.vp8_h_loop_filter_simple(dst
+ 12, linesize
, bedge_lim
);
2185 s
->vp8dsp
.vp8_v_loop_filter_simple(dst
, linesize
, mbedge_lim
);
2187 s
->vp8dsp
.vp8_v_loop_filter_simple(dst
+ 4 * linesize
, linesize
, bedge_lim
);
2188 s
->vp8dsp
.vp8_v_loop_filter_simple(dst
+ 8 * linesize
, linesize
, bedge_lim
);
2189 s
->vp8dsp
.vp8_v_loop_filter_simple(dst
+ 12 * linesize
, linesize
, bedge_lim
);
2193 #define MARGIN (16 << 2)
2194 static av_always_inline
2195 void vp78_decode_mv_mb_modes(AVCodecContext
*avctx
, VP8Frame
*curframe
,
2196 VP8Frame
*prev_frame
, int is_vp7
)
2198 VP8Context
*s
= avctx
->priv_data
;
2201 s
->mv_min
.y
= -MARGIN
;
2202 s
->mv_max
.y
= ((s
->mb_height
- 1) << 6) + MARGIN
;
2203 for (mb_y
= 0; mb_y
< s
->mb_height
; mb_y
++) {
2204 VP8Macroblock
*mb
= s
->macroblocks_base
+
2205 ((s
->mb_width
+ 1) * (mb_y
+ 1) + 1);
2206 int mb_xy
= mb_y
* s
->mb_width
;
2208 AV_WN32A(s
->intra4x4_pred_mode_left
, DC_PRED
* 0x01010101);
2210 s
->mv_min
.x
= -MARGIN
;
2211 s
->mv_max
.x
= ((s
->mb_width
- 1) << 6) + MARGIN
;
2212 for (mb_x
= 0; mb_x
< s
->mb_width
; mb_x
++, mb_xy
++, mb
++) {
2214 AV_WN32A((mb
- s
->mb_width
- 1)->intra4x4_pred_mode_top
,
2215 DC_PRED
* 0x01010101);
2216 decode_mb_mode(s
, mb
, mb_x
, mb_y
, curframe
->seg_map
->data
+ mb_xy
,
2217 prev_frame
&& prev_frame
->seg_map
?
2218 prev_frame
->seg_map
->data
+ mb_xy
: NULL
, 1, is_vp7
);
2227 static void vp7_decode_mv_mb_modes(AVCodecContext
*avctx
, VP8Frame
*cur_frame
,
2228 VP8Frame
*prev_frame
)
2230 vp78_decode_mv_mb_modes(avctx
, cur_frame
, prev_frame
, IS_VP7
);
2233 static void vp8_decode_mv_mb_modes(AVCodecContext
*avctx
, VP8Frame
*cur_frame
,
2234 VP8Frame
*prev_frame
)
2236 vp78_decode_mv_mb_modes(avctx
, cur_frame
, prev_frame
, IS_VP8
);
/* NOTE(review): the #if HAVE_THREADS / #else / #endif scaffolding and the
 * do { } while (0) wrappers were dropped by the extraction; they are
 * reconstructed from upstream vp8.c — verify against the repository. */
#if HAVE_THREADS
/* Block this slice thread until thread `otd` has decoded past macroblock
 * (mb_x_check, mb_y_check).  Positions are packed as (mb_y << 16) | mb_x
 * so a single integer comparison orders them. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);                 \
        if (otd->thread_mb_pos < tmp) {                                       \
            pthread_mutex_lock(&otd->lock);                                   \
            td->wait_mb_pos = tmp;                                            \
            do {                                                              \
                if (otd->thread_mb_pos >= tmp)                                \
                    break;                                                    \
                pthread_cond_wait(&otd->cond, &otd->lock);                    \
            } while (1);                                                      \
            td->wait_mb_pos = INT_MAX;                                        \
            pthread_mutex_unlock(&otd->lock);                                 \
        }                                                                     \
    } while (0)

/* Publish this thread's decode position and, under sliced threading, wake
 * any neighbour thread that is waiting for us to pass its wait_mb_pos. */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = (mb_y << 16) | (mb_x & 0xFFFF);                \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1                                  \
                                         : (next_td != td &&                  \
                                            pos >= next_td->wait_mb_pos) ||   \
                                           (prev_td != td &&                  \
                                            pos >= prev_td->wait_mb_pos);     \
        td->thread_mb_pos = pos;                                              \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&td->lock);                                    \
            pthread_cond_broadcast(&td->cond);                                \
            pthread_mutex_unlock(&td->lock);                                  \
        }                                                                     \
    } while (0)
#else
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) while (0)
#define update_pos(td, mb_y, mb_x)                        while (0)
#endif
2279 static void vp8_decode_mb_row_no_filter(AVCodecContext
*avctx
, void *tdata
,
2280 int jobnr
, int threadnr
, int is_vp7
)
2282 VP8Context
*s
= avctx
->priv_data
;
2283 VP8ThreadData
*prev_td
, *next_td
, *td
= &s
->thread_data
[threadnr
];
2284 int mb_y
= td
->thread_mb_pos
>> 16;
2285 int mb_x
, mb_xy
= mb_y
* s
->mb_width
;
2286 int num_jobs
= s
->num_jobs
;
2287 VP8Frame
*curframe
= s
->curframe
, *prev_frame
= s
->prev_frame
;
2288 VP56RangeCoder
*c
= &s
->coeff_partition
[mb_y
& (s
->num_coeff_partitions
- 1)];
2291 curframe
->tf
.f
->data
[0] + 16 * mb_y
* s
->linesize
,
2292 curframe
->tf
.f
->data
[1] + 8 * mb_y
* s
->uvlinesize
,
2293 curframe
->tf
.f
->data
[2] + 8 * mb_y
* s
->uvlinesize
2298 prev_td
= &s
->thread_data
[(jobnr
+ num_jobs
- 1) % num_jobs
];
2299 if (mb_y
== s
->mb_height
- 1)
2302 next_td
= &s
->thread_data
[(jobnr
+ 1) % num_jobs
];
2303 if (s
->mb_layout
== 1)
2304 mb
= s
->macroblocks_base
+ ((s
->mb_width
+ 1) * (mb_y
+ 1) + 1);
2306 // Make sure the previous frame has read its segmentation map,
2307 // if we re-use the same map.
2308 if (prev_frame
&& s
->segmentation
.enabled
&&
2309 !s
->segmentation
.update_map
)
2310 ff_thread_await_progress(&prev_frame
->tf
, mb_y
, 0);
2311 mb
= s
->macroblocks
+ (s
->mb_height
- mb_y
- 1) * 2;
2312 memset(mb
- 1, 0, sizeof(*mb
)); // zero left macroblock
2313 AV_WN32A(s
->intra4x4_pred_mode_left
, DC_PRED
* 0x01010101);
2316 if (!is_vp7
|| mb_y
== 0)
2317 memset(td
->left_nnz
, 0, sizeof(td
->left_nnz
));
2319 s
->mv_min
.x
= -MARGIN
;
2320 s
->mv_max
.x
= ((s
->mb_width
- 1) << 6) + MARGIN
;
2322 for (mb_x
= 0; mb_x
< s
->mb_width
; mb_x
++, mb_xy
++, mb
++) {
2323 // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2324 if (prev_td
!= td
) {
2325 if (threadnr
!= 0) {
2326 check_thread_pos(td
, prev_td
,
2327 mb_x
+ (is_vp7
? 2 : 1),
2328 mb_y
- (is_vp7
? 2 : 1));
2330 check_thread_pos(td
, prev_td
,
2331 mb_x
+ (is_vp7
? 2 : 1) + s
->mb_width
+ 3,
2332 mb_y
- (is_vp7
? 2 : 1));
2336 s
->vdsp
.prefetch(dst
[0] + (mb_x
& 3) * 4 * s
->linesize
+ 64,
2338 s
->vdsp
.prefetch(dst
[1] + (mb_x
& 7) * s
->uvlinesize
+ 64,
2339 dst
[2] - dst
[1], 2);
2342 decode_mb_mode(s
, mb
, mb_x
, mb_y
, curframe
->seg_map
->data
+ mb_xy
,
2343 prev_frame
&& prev_frame
->seg_map
?
2344 prev_frame
->seg_map
->data
+ mb_xy
: NULL
, 0, is_vp7
);
2346 prefetch_motion(s
, mb
, mb_x
, mb_y
, mb_xy
, VP56_FRAME_PREVIOUS
);
2349 decode_mb_coeffs(s
, td
, c
, mb
, s
->top_nnz
[mb_x
], td
->left_nnz
, is_vp7
);
2351 if (mb
->mode
<= MODE_I4x4
)
2352 intra_predict(s
, td
, dst
, mb
, mb_x
, mb_y
, is_vp7
);
2354 inter_predict(s
, td
, dst
, mb
, mb_x
, mb_y
);
2356 prefetch_motion(s
, mb
, mb_x
, mb_y
, mb_xy
, VP56_FRAME_GOLDEN
);
2359 idct_mb(s
, td
, dst
, mb
);
2361 AV_ZERO64(td
->left_nnz
);
2362 AV_WN64(s
->top_nnz
[mb_x
], 0); // array of 9, so unaligned
2364 /* Reset DC block predictors if they would exist
2365 * if the mb had coefficients */
2366 if (mb
->mode
!= MODE_I4x4
&& mb
->mode
!= VP8_MVMODE_SPLIT
) {
2367 td
->left_nnz
[8] = 0;
2368 s
->top_nnz
[mb_x
][8] = 0;
2372 if (s
->deblock_filter
)
2373 filter_level_for_mb(s
, mb
, &td
->filter_strength
[mb_x
], is_vp7
);
2375 if (s
->deblock_filter
&& num_jobs
!= 1 && threadnr
== num_jobs
- 1) {
2376 if (s
->filter
.simple
)
2377 backup_mb_border(s
->top_border
[mb_x
+ 1], dst
[0],
2378 NULL
, NULL
, s
->linesize
, 0, 1);
2380 backup_mb_border(s
->top_border
[mb_x
+ 1], dst
[0],
2381 dst
[1], dst
[2], s
->linesize
, s
->uvlinesize
, 0);
2384 prefetch_motion(s
, mb
, mb_x
, mb_y
, mb_xy
, VP56_FRAME_GOLDEN2
);
2392 if (mb_x
== s
->mb_width
+ 1) {
2393 update_pos(td
, mb_y
, s
->mb_width
+ 3);
2395 update_pos(td
, mb_y
, mb_x
);
2400 static void vp8_filter_mb_row(AVCodecContext
*avctx
, void *tdata
,
2401 int jobnr
, int threadnr
, int is_vp7
)
2403 VP8Context
*s
= avctx
->priv_data
;
2404 VP8ThreadData
*td
= &s
->thread_data
[threadnr
];
2405 int mb_x
, mb_y
= td
->thread_mb_pos
>> 16, num_jobs
= s
->num_jobs
;
2406 AVFrame
*curframe
= s
->curframe
->tf
.f
;
2408 VP8ThreadData
*prev_td
, *next_td
;
2410 curframe
->data
[0] + 16 * mb_y
* s
->linesize
,
2411 curframe
->data
[1] + 8 * mb_y
* s
->uvlinesize
,
2412 curframe
->data
[2] + 8 * mb_y
* s
->uvlinesize
2415 if (s
->mb_layout
== 1)
2416 mb
= s
->macroblocks_base
+ ((s
->mb_width
+ 1) * (mb_y
+ 1) + 1);
2418 mb
= s
->macroblocks
+ (s
->mb_height
- mb_y
- 1) * 2;
2423 prev_td
= &s
->thread_data
[(jobnr
+ num_jobs
- 1) % num_jobs
];
2424 if (mb_y
== s
->mb_height
- 1)
2427 next_td
= &s
->thread_data
[(jobnr
+ 1) % num_jobs
];
2429 for (mb_x
= 0; mb_x
< s
->mb_width
; mb_x
++, mb
++) {
2430 VP8FilterStrength
*f
= &td
->filter_strength
[mb_x
];
2432 check_thread_pos(td
, prev_td
,
2433 (mb_x
+ 1) + (s
->mb_width
+ 3), mb_y
- 1);
2435 if (next_td
!= &s
->thread_data
[0])
2436 check_thread_pos(td
, next_td
, mb_x
+ 1, mb_y
+ 1);
2438 if (num_jobs
== 1) {
2439 if (s
->filter
.simple
)
2440 backup_mb_border(s
->top_border
[mb_x
+ 1], dst
[0],
2441 NULL
, NULL
, s
->linesize
, 0, 1);
2443 backup_mb_border(s
->top_border
[mb_x
+ 1], dst
[0],
2444 dst
[1], dst
[2], s
->linesize
, s
->uvlinesize
, 0);
2447 if (s
->filter
.simple
)
2448 filter_mb_simple(s
, dst
[0], f
, mb_x
, mb_y
);
2450 filter_mb(s
, dst
, f
, mb_x
, mb_y
, is_vp7
);
2455 update_pos(td
, mb_y
, (s
->mb_width
+ 3) + mb_x
);
2459 static av_always_inline
2460 int vp78_decode_mb_row_sliced(AVCodecContext
*avctx
, void *tdata
, int jobnr
,
2461 int threadnr
, int is_vp7
)
2463 VP8Context
*s
= avctx
->priv_data
;
2464 VP8ThreadData
*td
= &s
->thread_data
[jobnr
];
2465 VP8ThreadData
*next_td
= NULL
, *prev_td
= NULL
;
2466 VP8Frame
*curframe
= s
->curframe
;
2467 int mb_y
, num_jobs
= s
->num_jobs
;
2469 td
->thread_nr
= threadnr
;
2470 for (mb_y
= jobnr
; mb_y
< s
->mb_height
; mb_y
+= num_jobs
) {
2471 if (mb_y
>= s
->mb_height
)
2473 td
->thread_mb_pos
= mb_y
<< 16;
2474 vp8_decode_mb_row_no_filter(avctx
, tdata
, jobnr
, threadnr
, is_vp7
);
2475 if (s
->deblock_filter
)
2476 vp8_filter_mb_row(avctx
, tdata
, jobnr
, threadnr
, is_vp7
);
2477 update_pos(td
, mb_y
, INT_MAX
& 0xFFFF);
2482 if (avctx
->active_thread_type
== FF_THREAD_FRAME
)
2483 ff_thread_report_progress(&curframe
->tf
, mb_y
, 0);
2489 static int vp7_decode_mb_row_sliced(AVCodecContext
*avctx
, void *tdata
,
2490 int jobnr
, int threadnr
)
2492 return vp78_decode_mb_row_sliced(avctx
, tdata
, jobnr
, threadnr
, IS_VP7
);
2495 static int vp8_decode_mb_row_sliced(AVCodecContext
*avctx
, void *tdata
,
2496 int jobnr
, int threadnr
)
2498 return vp78_decode_mb_row_sliced(avctx
, tdata
, jobnr
, threadnr
, IS_VP8
);
2501 static av_always_inline
2502 int vp78_decode_frame(AVCodecContext
*avctx
, void *data
, int *got_frame
,
2503 AVPacket
*avpkt
, int is_vp7
)
2505 VP8Context
*s
= avctx
->priv_data
;
2506 int ret
, i
, referenced
, num_jobs
;
2507 enum AVDiscard skip_thresh
;
2508 VP8Frame
*av_uninit(curframe
), *prev_frame
;
2511 ret
= vp7_decode_frame_header(s
, avpkt
->data
, avpkt
->size
);
2513 ret
= vp8_decode_frame_header(s
, avpkt
->data
, avpkt
->size
);
2518 if (s
->actually_webp
) {
2519 // avctx->pix_fmt already set in caller.
2520 } else if (!is_vp7
&& s
->pix_fmt
== AV_PIX_FMT_NONE
) {
2521 enum AVPixelFormat pix_fmts
[] = {
2522 #if CONFIG_VP8_VAAPI_HWACCEL
2529 s
->pix_fmt
= ff_get_format(s
->avctx
, pix_fmts
);
2530 if (s
->pix_fmt
< 0) {
2531 ret
= AVERROR(EINVAL
);
2534 avctx
->pix_fmt
= s
->pix_fmt
;
2537 prev_frame
= s
->framep
[VP56_FRAME_CURRENT
];
2539 referenced
= s
->update_last
|| s
->update_golden
== VP56_FRAME_CURRENT
||
2540 s
->update_altref
== VP56_FRAME_CURRENT
;
2542 skip_thresh
= !referenced
? AVDISCARD_NONREF
2543 : !s
->keyframe
? AVDISCARD_NONKEY
2546 if (avctx
->skip_frame
>= skip_thresh
) {
2548 memcpy(&s
->next_framep
[0], &s
->framep
[0], sizeof(s
->framep
[0]) * 4);
2551 s
->deblock_filter
= s
->filter
.level
&& avctx
->skip_loop_filter
< skip_thresh
;
2553 // release no longer referenced frames
2554 for (i
= 0; i
< 5; i
++)
2555 if (s
->frames
[i
].tf
.f
->data
[0] &&
2556 &s
->frames
[i
] != prev_frame
&&
2557 &s
->frames
[i
] != s
->framep
[VP56_FRAME_PREVIOUS
] &&
2558 &s
->frames
[i
] != s
->framep
[VP56_FRAME_GOLDEN
] &&
2559 &s
->frames
[i
] != s
->framep
[VP56_FRAME_GOLDEN2
])
2560 vp8_release_frame(s
, &s
->frames
[i
]);
2562 curframe
= s
->framep
[VP56_FRAME_CURRENT
] = vp8_find_free_buffer(s
);
2565 avctx
->colorspace
= AVCOL_SPC_BT470BG
;
2567 avctx
->color_range
= AVCOL_RANGE_JPEG
;
2569 avctx
->color_range
= AVCOL_RANGE_MPEG
;
2571 /* Given that arithmetic probabilities are updated every frame, it's quite
2572 * likely that the values we have on a random interframe are complete
2573 * junk if we didn't start decode on a keyframe. So just don't display
2574 * anything rather than junk. */
2575 if (!s
->keyframe
&& (!s
->framep
[VP56_FRAME_PREVIOUS
] ||
2576 !s
->framep
[VP56_FRAME_GOLDEN
] ||
2577 !s
->framep
[VP56_FRAME_GOLDEN2
])) {
2578 av_log(avctx
, AV_LOG_WARNING
,
2579 "Discarding interframe without a prior keyframe!\n");
2580 ret
= AVERROR_INVALIDDATA
;
2584 curframe
->tf
.f
->key_frame
= s
->keyframe
;
2585 curframe
->tf
.f
->pict_type
= s
->keyframe
? AV_PICTURE_TYPE_I
2586 : AV_PICTURE_TYPE_P
;
2587 if ((ret
= vp8_alloc_frame(s
, curframe
, referenced
))) {
2588 av_log(avctx
, AV_LOG_ERROR
, "get_buffer() failed!\n");
2592 // check if golden and altref are swapped
2593 if (s
->update_altref
!= VP56_FRAME_NONE
)
2594 s
->next_framep
[VP56_FRAME_GOLDEN2
] = s
->framep
[s
->update_altref
];
2596 s
->next_framep
[VP56_FRAME_GOLDEN2
] = s
->framep
[VP56_FRAME_GOLDEN2
];
2598 if (s
->update_golden
!= VP56_FRAME_NONE
)
2599 s
->next_framep
[VP56_FRAME_GOLDEN
] = s
->framep
[s
->update_golden
];
2601 s
->next_framep
[VP56_FRAME_GOLDEN
] = s
->framep
[VP56_FRAME_GOLDEN
];
2604 s
->next_framep
[VP56_FRAME_PREVIOUS
] = curframe
;
2606 s
->next_framep
[VP56_FRAME_PREVIOUS
] = s
->framep
[VP56_FRAME_PREVIOUS
];
2608 s
->next_framep
[VP56_FRAME_CURRENT
] = curframe
;
2610 ff_thread_finish_setup(avctx
);
2612 if (avctx
->hwaccel
) {
2613 ret
= avctx
->hwaccel
->start_frame(avctx
, avpkt
->data
, avpkt
->size
);
2617 ret
= avctx
->hwaccel
->decode_slice(avctx
, avpkt
->data
, avpkt
->size
);
2621 ret
= avctx
->hwaccel
->end_frame(avctx
);
2626 s
->linesize
= curframe
->tf
.f
->linesize
[0];
2627 s
->uvlinesize
= curframe
->tf
.f
->linesize
[1];
2629 memset(s
->top_nnz
, 0, s
->mb_width
* sizeof(*s
->top_nnz
));
2630 /* Zero macroblock structures for top/top-left prediction
2631 * from outside the frame. */
2633 memset(s
->macroblocks
+ s
->mb_height
* 2 - 1, 0,
2634 (s
->mb_width
+ 1) * sizeof(*s
->macroblocks
));
2635 if (!s
->mb_layout
&& s
->keyframe
)
2636 memset(s
->intra4x4_pred_mode_top
, DC_PRED
, s
->mb_width
* 4);
2638 memset(s
->ref_count
, 0, sizeof(s
->ref_count
));
2640 if (s
->mb_layout
== 1) {
2641 // Make sure the previous frame has read its segmentation map,
2642 // if we re-use the same map.
2643 if (prev_frame
&& s
->segmentation
.enabled
&&
2644 !s
->segmentation
.update_map
)
2645 ff_thread_await_progress(&prev_frame
->tf
, 1, 0);
2647 vp7_decode_mv_mb_modes(avctx
, curframe
, prev_frame
);
2649 vp8_decode_mv_mb_modes(avctx
, curframe
, prev_frame
);
2652 if (avctx
->active_thread_type
== FF_THREAD_FRAME
)
2655 num_jobs
= FFMIN(s
->num_coeff_partitions
, avctx
->thread_count
);
2656 s
->num_jobs
= num_jobs
;
2657 s
->curframe
= curframe
;
2658 s
->prev_frame
= prev_frame
;
2659 s
->mv_min
.y
= -MARGIN
;
2660 s
->mv_max
.y
= ((s
->mb_height
- 1) << 6) + MARGIN
;
2661 for (i
= 0; i
< MAX_THREADS
; i
++) {
2662 s
->thread_data
[i
].thread_mb_pos
= 0;
2663 s
->thread_data
[i
].wait_mb_pos
= INT_MAX
;
2667 avctx
->execute2(avctx
, vp7_decode_mb_row_sliced
, s
->thread_data
, NULL
,
2670 avctx
->execute2(avctx
, vp8_decode_mb_row_sliced
, s
->thread_data
, NULL
,
2674 ff_thread_report_progress(&curframe
->tf
, INT_MAX
, 0);
2675 memcpy(&s
->framep
[0], &s
->next_framep
[0], sizeof(s
->framep
[0]) * 4);
2678 // if future frames don't use the updated probabilities,
2679 // reset them to the values we saved
2680 if (!s
->update_probabilities
)
2681 s
->prob
[0] = s
->prob
[1];
2683 if (!s
->invisible
) {
2684 if ((ret
= av_frame_ref(data
, curframe
->tf
.f
)) < 0)
2691 memcpy(&s
->next_framep
[0], &s
->framep
[0], sizeof(s
->framep
[0]) * 4);
2695 int ff_vp8_decode_frame(AVCodecContext
*avctx
, void *data
, int *got_frame
,
2698 return vp78_decode_frame(avctx
, data
, got_frame
, avpkt
, IS_VP8
);
#if CONFIG_VP7_DECODER
/* VP7 decode entry point (compiled only with the VP7 decoder enabled). */
static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                            AVPacket *avpkt)
{
    return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP7);
}
#endif /* CONFIG_VP7_DECODER */
2709 av_cold
int ff_vp8_decode_free(AVCodecContext
*avctx
)
2711 VP8Context
*s
= avctx
->priv_data
;
2714 vp8_decode_flush_impl(avctx
, 1);
2715 for (i
= 0; i
< FF_ARRAY_ELEMS(s
->frames
); i
++)
2716 av_frame_free(&s
->frames
[i
].tf
.f
);
2721 static av_cold
int vp8_init_frames(VP8Context
*s
)
2724 for (i
= 0; i
< FF_ARRAY_ELEMS(s
->frames
); i
++) {
2725 s
->frames
[i
].tf
.f
= av_frame_alloc();
2726 if (!s
->frames
[i
].tf
.f
)
2727 return AVERROR(ENOMEM
);
2732 static av_always_inline
2733 int vp78_decode_init(AVCodecContext
*avctx
, int is_vp7
)
2735 VP8Context
*s
= avctx
->priv_data
;
2739 s
->pix_fmt
= AV_PIX_FMT_NONE
;
2740 avctx
->pix_fmt
= AV_PIX_FMT_YUV420P
;
2741 avctx
->internal
->allocate_progress
= 1;
2743 ff_videodsp_init(&s
->vdsp
, 8);
2745 ff_vp78dsp_init(&s
->vp8dsp
);
2746 if (CONFIG_VP7_DECODER
&& is_vp7
) {
2747 ff_h264_pred_init(&s
->hpc
, AV_CODEC_ID_VP7
, 8, 1);
2748 ff_vp7dsp_init(&s
->vp8dsp
);
2749 } else if (CONFIG_VP8_DECODER
&& !is_vp7
) {
2750 ff_h264_pred_init(&s
->hpc
, AV_CODEC_ID_VP8
, 8, 1);
2751 ff_vp8dsp_init(&s
->vp8dsp
);
2754 /* does not change for VP8 */
2755 memcpy(s
->prob
[0].scan
, ff_zigzag_scan
, sizeof(s
->prob
[0].scan
));
2757 if ((ret
= vp8_init_frames(s
)) < 0) {
2758 ff_vp8_decode_free(avctx
);
#if CONFIG_VP7_DECODER
/* VP7 init wrapper. */
static int vp7_decode_init(AVCodecContext *avctx)
{
    return vp78_decode_init(avctx, IS_VP7);
}
#endif /* CONFIG_VP7_DECODER */
2772 av_cold
int ff_vp8_decode_init(AVCodecContext
*avctx
)
2774 return vp78_decode_init(avctx
, IS_VP8
);
2777 #if CONFIG_VP8_DECODER
2778 static av_cold
int vp8_decode_init_thread_copy(AVCodecContext
*avctx
)
2780 VP8Context
*s
= avctx
->priv_data
;
2785 if ((ret
= vp8_init_frames(s
)) < 0) {
2786 ff_vp8_decode_free(avctx
);
2793 #define REBASE(pic) pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
2795 static int vp8_decode_update_thread_context(AVCodecContext
*dst
,
2796 const AVCodecContext
*src
)
2798 VP8Context
*s
= dst
->priv_data
, *s_src
= src
->priv_data
;
2801 if (s
->macroblocks_base
&&
2802 (s_src
->mb_width
!= s
->mb_width
|| s_src
->mb_height
!= s
->mb_height
)) {
2804 s
->mb_width
= s_src
->mb_width
;
2805 s
->mb_height
= s_src
->mb_height
;
2808 s
->prob
[0] = s_src
->prob
[!s_src
->update_probabilities
];
2809 s
->segmentation
= s_src
->segmentation
;
2810 s
->lf_delta
= s_src
->lf_delta
;
2811 memcpy(s
->sign_bias
, s_src
->sign_bias
, sizeof(s
->sign_bias
));
2813 for (i
= 0; i
< FF_ARRAY_ELEMS(s_src
->frames
); i
++) {
2814 if (s_src
->frames
[i
].tf
.f
->data
[0]) {
2815 int ret
= vp8_ref_frame(s
, &s
->frames
[i
], &s_src
->frames
[i
]);
2821 s
->framep
[0] = REBASE(s_src
->next_framep
[0]);
2822 s
->framep
[1] = REBASE(s_src
->next_framep
[1]);
2823 s
->framep
[2] = REBASE(s_src
->next_framep
[2]);
2824 s
->framep
[3] = REBASE(s_src
->next_framep
[3]);
2828 #endif /* CONFIG_VP8_DECODER */
#if CONFIG_VP7_DECODER
/* NOTE(review): the .name initializer line was lost by the extraction and
 * is reconstructed from upstream — verify. */
AVCodec ff_vp7_decoder = {
    .name           = "vp7",
    .long_name      = NULL_IF_CONFIG_SMALL("On2 VP7"),
    .type           = AVMEDIA_TYPE_VIDEO,
    .id             = AV_CODEC_ID_VP7,
    .priv_data_size = sizeof(VP8Context),
    .init           = vp7_decode_init,
    .close          = ff_vp8_decode_free,
    .decode         = vp7_decode_frame,
    .capabilities   = AV_CODEC_CAP_DR1,
    .flush          = vp8_decode_flush,
};
#endif /* CONFIG_VP7_DECODER */
#if CONFIG_VP8_DECODER
/* Fix: the trailing #endif comment said CONFIG_VP7_DECODER although this
 * block is guarded by CONFIG_VP8_DECODER — corrected below.
 * NOTE(review): the .name initializer and the HWACCEL_VAAPI(vp8) entry in
 * hw_configs were lost by the extraction and are reconstructed from
 * upstream — verify. */
AVCodec ff_vp8_decoder = {
    .name                  = "vp8",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP8,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = ff_vp8_decode_init,
    .close                 = ff_vp8_decode_free,
    .decode                = ff_vp8_decode_frame,
    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                             AV_CODEC_CAP_SLICE_THREADS,
    .hw_configs            = (const AVCodecHWConfigInternal *[]) {
#if CONFIG_VP8_VAAPI_HWACCEL
                                 HWACCEL_VAAPI(vp8),
#endif
                                 NULL
                             },
    .flush                 = vp8_decode_flush,
    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
};
#endif /* CONFIG_VP8_DECODER */