// Source: third_party/libwebp/enc/quant.c (Chromium third-party snapshot,
// blob 9130a41609baa781ca1a6633a56e2c0a2659fbe1).
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // Quantization
12 // Author: Skal (pascal.massimino@gmail.com)
14 #include <assert.h>
15 #include <math.h>
16 #include <stdlib.h> // for abs()
18 #include "./vp8enci.h"
19 #include "./cost.h"
21 #define DO_TRELLIS_I4 1
22 #define DO_TRELLIS_I16 1 // not a huge gain, but ok at low bitrate.
23 #define DO_TRELLIS_UV 0 // disable trellis for UV. Risky. Not worth.
24 #define USE_TDISTO 1
26 #define MID_ALPHA 64 // neutral value for susceptibility
27 #define MIN_ALPHA 30 // lowest usable value for susceptibility
28 #define MAX_ALPHA 100 // higher meaningful value for susceptibility
30 #define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP
31 // power-law modulation. Must be strictly less than 1.
33 #define I4_PENALTY 4000 // Rate-penalty for quick i4/i16 decision
35 // number of non-zero coeffs below which we consider the block very flat
36 // (and apply a penalty to complex predictions)
37 #define FLATNESS_LIMIT_I16 10 // I16 mode
38 #define FLATNESS_LIMIT_I4 3 // I4 mode
39 #define FLATNESS_LIMIT_UV 2 // UV mode
40 #define FLATNESS_PENALTY 140 // roughly ~1bit per block
42 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
44 // #define DEBUG_BLOCK
46 //------------------------------------------------------------------------------
#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

// Debug-only dump of one macroblock: source vs reconstructed samples,
// per-plane deltas, the RD score fields and the quantized levels.
static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 24; ++j) {
    if (j == 16) printf("\n");   // newline before the U/V block
    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]);
    printf("  ");
    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]);
    printf("  ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
    (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    // in i16 mode the DC coefficient lives in y_dc_levels, so skip index 0
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif   // DEBUG_BLOCK
101 //------------------------------------------------------------------------------
103 static WEBP_INLINE int clip(int v, int m, int M) {
104 return v < m ? m : v > M ? M : v;
// Zig-zag scan order mapping coefficient position -> raster index in a 4x4
// block (standard VP8 ordering).
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
// DC quantizer step, indexed by the 7-bit quantizer index [0..127].
static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};
// AC quantizer step, indexed by the 7-bit quantizer index [0..127].
static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};
// Boosted AC quantizer for the Y2 (luma-DC WHT) channel, indexed by the
// 7-bit quantizer index [0..127].
static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};
// Quantization rounding bias per channel: [luma-ac, luma-dc, chroma][dc, ac].
static const uint8_t kBiasMatrices[3][2] = {
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};
// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
182 //------------------------------------------------------------------------------
183 // Initialize quantization parameters in VP8Matrix
185 // Returns the average quantizer
186 static int ExpandMatrix(VP8Matrix* const m, int type) {
187 int i, sum;
188 for (i = 0; i < 2; ++i) {
189 const int is_ac_coeff = (i > 0);
190 const int bias = kBiasMatrices[type][is_ac_coeff];
191 m->iq_[i] = (1 << QFIX) / m->q_[i];
192 m->bias_[i] = BIAS(bias);
193 // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
194 // * zero if coeff <= zthresh
195 // * non-zero if coeff > zthresh
196 m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
198 for (i = 2; i < 16; ++i) {
199 m->q_[i] = m->q_[1];
200 m->iq_[i] = m->iq_[1];
201 m->bias_[i] = m->bias_[1];
202 m->zthresh_[i] = m->zthresh_[1];
204 for (sum = 0, i = 0; i < 16; ++i) {
205 if (type == 0) { // we only use sharpening for AC luma coeffs
206 m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
207 } else {
208 m->sharpen_[i] = 0;
210 sum += m->q_[i];
212 return (sum + 8) >> 4;
215 static void SetupMatrices(VP8Encoder* enc) {
216 int i;
217 const int tlambda_scale =
218 (enc->method_ >= 4) ? enc->config_->sns_strength
219 : 0;
220 const int num_segments = enc->segment_hdr_.num_segments_;
221 for (i = 0; i < num_segments; ++i) {
222 VP8SegmentInfo* const m = &enc->dqm_[i];
223 const int q = m->quant_;
224 int q4, q16, quv;
225 m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
226 m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];
228 m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
229 m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
231 m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
232 m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
234 q4 = ExpandMatrix(&m->y1_, 0);
235 q16 = ExpandMatrix(&m->y2_, 1);
236 quv = ExpandMatrix(&m->uv_, 2);
238 m->lambda_i4_ = (3 * q4 * q4) >> 7;
239 m->lambda_i16_ = (3 * q16 * q16);
240 m->lambda_uv_ = (3 * quv * quv) >> 6;
241 m->lambda_mode_ = (1 * q4 * q4) >> 7;
242 m->lambda_trellis_i4_ = (7 * q4 * q4) >> 3;
243 m->lambda_trellis_i16_ = (q16 * q16) >> 2;
244 m->lambda_trellis_uv_ = (quv *quv) << 1;
245 m->tlambda_ = (tlambda_scale * q4) >> 5;
247 m->min_disto_ = 10 * m->y1_.q_[0]; // quantization-aware min disto
248 m->max_edge_ = 0;
252 //------------------------------------------------------------------------------
253 // Initialize filtering parameters
255 // Very small filter-strength values have close to no visual effect. So we can
256 // save a little decoding-CPU by turning filtering off for these.
257 #define FSTRENGTH_CUTOFF 2
259 static void SetupFilterStrength(VP8Encoder* const enc) {
260 int i;
261 // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
262 const int level0 = 5 * enc->config_->filter_strength;
263 for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
264 VP8SegmentInfo* const m = &enc->dqm_[i];
265 // We focus on the quantization of AC coeffs.
266 const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
267 const int base_strength =
268 VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
269 // Segments with lower complexity ('beta') will be less filtered.
270 const int f = base_strength * level0 / (256 + m->beta_);
271 m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
273 // We record the initial strength (mainly for the case of 1-segment only).
274 enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
275 enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
276 enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
279 //------------------------------------------------------------------------------
281 // Note: if you change the values below, remember that the max range
282 // allowed by the syntax for DQ_UV is [-16,16].
283 #define MAX_DQ_UV (6)
284 #define MIN_DQ_UV (-4)
286 // We want to emulate jpeg-like behaviour where the expected "good" quality
287 // is around q=75. Internally, our "good" middle is around c=50. So we
288 // map accordingly using linear piece-wise function
289 static double QualityToCompression(double c) {
290 const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
291 // The file size roughly scales as pow(quantizer, 3.). Actually, the
292 // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
293 // in the mid-quant range. So we scale the compressibility inversely to
294 // this power-law: quant ~= compression ^ 1/3. This law holds well for
295 // low quant. Finer modeling for high-quant would make use of kAcTable[]
296 // more explicitly.
297 const double v = pow(linear_c, 1 / 3.);
298 return v;
301 static double QualityToJPEGCompression(double c, double alpha) {
302 // We map the complexity 'alpha' and quality setting 'c' to a compression
303 // exponent empirically matched to the compression curve of libjpeg6b.
304 // On average, the WebP output size will be roughly similar to that of a
305 // JPEG file compressed with same quality factor.
306 const double amin = 0.30;
307 const double amax = 0.85;
308 const double exp_min = 0.4;
309 const double exp_max = 0.9;
310 const double slope = (exp_min - exp_max) / (amax - amin);
311 // Linearly interpolate 'expn' from exp_min to exp_max
312 // in the [amin, amax] range.
313 const double expn = (alpha > amax) ? exp_min
314 : (alpha < amin) ? exp_max
315 : exp_max + slope * (alpha - amin);
316 const double v = pow(c, expn);
317 return v;
320 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
321 const VP8SegmentInfo* const S2) {
322 return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
325 static void SimplifySegments(VP8Encoder* const enc) {
326 int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
327 const int num_segments = enc->segment_hdr_.num_segments_;
328 int num_final_segments = 1;
329 int s1, s2;
330 for (s1 = 1; s1 < num_segments; ++s1) { // find similar segments
331 const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
332 int found = 0;
333 // check if we already have similar segment
334 for (s2 = 0; s2 < num_final_segments; ++s2) {
335 const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
336 if (SegmentsAreEquivalent(S1, S2)) {
337 found = 1;
338 break;
341 map[s1] = s2;
342 if (!found) {
343 if (num_final_segments != s1) {
344 enc->dqm_[num_final_segments] = enc->dqm_[s1];
346 ++num_final_segments;
349 if (num_final_segments < num_segments) { // Remap
350 int i = enc->mb_w_ * enc->mb_h_;
351 while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
352 enc->segment_hdr_.num_segments_ = num_final_segments;
353 // Replicate the trailing segment infos (it's mostly cosmetics)
354 for (i = num_final_segments; i < num_segments; ++i) {
355 enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
360 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
361 int i;
362 int dq_uv_ac, dq_uv_dc;
363 const int num_segments = enc->segment_hdr_.num_segments_;
364 const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
365 const double Q = quality / 100.;
366 const double c_base = enc->config_->emulate_jpeg_size ?
367 QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
368 QualityToCompression(Q);
369 for (i = 0; i < num_segments; ++i) {
370 // We modulate the base coefficient to accommodate for the quantization
371 // susceptibility and allow denser segments to be quantized more.
372 const double expn = 1. - amp * enc->dqm_[i].alpha_;
373 const double c = pow(c_base, expn);
374 const int q = (int)(127. * (1. - c));
375 assert(expn > 0.);
376 enc->dqm_[i].quant_ = clip(q, 0, 127);
379 // purely indicative in the bitstream (except for the 1-segment case)
380 enc->base_quant_ = enc->dqm_[0].quant_;
382 // fill-in values for the unused segments (required by the syntax)
383 for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
384 enc->dqm_[i].quant_ = enc->base_quant_;
387 // uv_alpha_ is normally spread around ~60. The useful range is
388 // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
389 // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
390 dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
391 / (MAX_ALPHA - MIN_ALPHA);
392 // we rescale by the user-defined strength of adaptation
393 dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
394 // and make it safe.
395 dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
396 // We also boost the dc-uv-quant a little, based on sns-strength, since
397 // U/V channels are quite more reactive to high quants (flat DC-blocks
398 // tend to appear, and are unpleasant).
399 dq_uv_dc = -4 * enc->config_->sns_strength / 100;
400 dq_uv_dc = clip(dq_uv_dc, -15, 15); // 4bit-signed max allowed
402 enc->dq_y1_dc_ = 0; // TODO(skal): dq-lum
403 enc->dq_y2_dc_ = 0;
404 enc->dq_y2_ac_ = 0;
405 enc->dq_uv_dc_ = dq_uv_dc;
406 enc->dq_uv_ac_ = dq_uv_ac;
408 SetupFilterStrength(enc); // initialize segments' filtering, eventually
410 if (num_segments > 1) SimplifySegments(enc);
412 SetupMatrices(enc); // finalize quantization matrices
415 //------------------------------------------------------------------------------
416 // Form the predictions in cache
418 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
419 const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
420 const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
422 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
423 const int VP8I4ModeOffsets[NUM_BMODES] = {
424 I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
427 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
428 const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
429 const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
430 VP8EncPredLuma16(it->yuv_p_, left, top);
433 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
434 const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
435 const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
436 VP8EncPredChroma8(it->yuv_p_, left, top);
439 void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
440 VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
443 //------------------------------------------------------------------------------
444 // Quantize
446 // Layout:
447 // +----+
448 // |YYYY| 0
449 // |YYYY| 4
450 // |YYYY| 8
451 // |YYYY| 12
452 // +----+
453 // |UUVV| 16
454 // |UUVV| 20
455 // +----+
457 const int VP8Scan[16] = { // Luma
458 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
459 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
460 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
461 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
464 static const int VP8ScanUV[4 + 4] = {
465 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
466 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
469 //------------------------------------------------------------------------------
470 // Distortion measurement
// Frequency weighting for the texture-distortion metric (luma).
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};
// Frequency weighting for trellis distortion; flat (all 16) when the
// texture-distortion metric is disabled.
static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};
487 // Init/Copy the common fields in score.
488 static void InitScore(VP8ModeScore* const rd) {
489 rd->D = 0;
490 rd->SD = 0;
491 rd->R = 0;
492 rd->H = 0;
493 rd->nz = 0;
494 rd->score = MAX_COST;
497 static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
498 dst->D = src->D;
499 dst->SD = src->SD;
500 dst->R = src->R;
501 dst->H = src->H;
502 dst->nz = src->nz; // note that nz is not accumulated, but just copied.
503 dst->score = src->score;
506 static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
507 dst->D += src->D;
508 dst->SD += src->SD;
509 dst->R += src->R;
510 dst->H += src->H;
511 dst->nz |= src->nz; // here, new nz bits are accumulated.
512 dst->score += src->score;
515 //------------------------------------------------------------------------------
516 // Performs trellis-optimized quantization.
518 // Trellis node
519 typedef struct {
520 int8_t prev; // best previous node
521 int8_t sign; // sign of coeff_i
522 int16_t level; // level
523 } Node;
525 // Score state
526 typedef struct {
527 score_t score; // partial RD score
528 const uint16_t* costs; // shortcut to cost tables
529 } ScoreState;
531 // If a coefficient was quantized to a value Q (using a neutral bias),
532 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
533 // We don't test negative values though.
534 #define MIN_DELTA 0 // how much lower level to try
535 #define MAX_DELTA 1 // how much higher
536 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
537 #define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
538 #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
540 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
541 // TODO: incorporate the "* 256" in the tables?
542 rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD);
545 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
546 score_t distortion) {
547 return rate * lambda + 256 * distortion;
550 static int TrellisQuantizeBlock(const VP8Encoder* const enc,
551 int16_t in[16], int16_t out[16],
552 int ctx0, int coeff_type,
553 const VP8Matrix* const mtx,
554 int lambda) {
555 const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
556 const CostArray* const costs = enc->proba_.level_cost_[coeff_type];
557 const int first = (coeff_type == 0) ? 1 : 0;
558 Node nodes[16][NUM_NODES];
559 ScoreState score_states[2][NUM_NODES];
560 ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
561 ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
562 int best_path[3] = {-1, -1, -1}; // store best-last/best-level/best-previous
563 score_t best_score;
564 int n, m, p, last;
567 score_t cost;
568 const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
569 const int last_proba = probas[VP8EncBands[first]][ctx0][0];
571 // compute the position of the last interesting coefficient
572 last = first - 1;
573 for (n = 15; n >= first; --n) {
574 const int j = kZigzag[n];
575 const int err = in[j] * in[j];
576 if (err > thresh) {
577 last = n;
578 break;
581 // we don't need to go inspect up to n = 16 coeffs. We can just go up
582 // to last + 1 (inclusive) without losing much.
583 if (last < 15) ++last;
585 // compute 'skip' score. This is the max score one can do.
586 cost = VP8BitCost(0, last_proba);
587 best_score = RDScoreTrellis(lambda, cost, 0);
589 // initialize source node.
590 for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
591 const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
592 ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
593 ss_cur[m].costs = costs[VP8EncBands[first]][ctx0];
597 // traverse trellis.
598 for (n = first; n <= last; ++n) {
599 const int j = kZigzag[n];
600 const uint32_t Q = mtx->q_[j];
601 const uint32_t iQ = mtx->iq_[j];
602 const uint32_t B = BIAS(0x00); // neutral bias
603 // note: it's important to take sign of the _original_ coeff,
604 // so we don't have to consider level < 0 afterward.
605 const int sign = (in[j] < 0);
606 const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
607 int level0 = QUANTDIV(coeff0, iQ, B);
608 if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
610 { // Swap current and previous score states
611 ScoreState* const tmp = ss_cur;
612 ss_cur = ss_prev;
613 ss_prev = tmp;
616 // test all alternate level values around level0.
617 for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
618 Node* const cur = &NODE(n, m);
619 int level = level0 + m;
620 const int ctx = (level > 2) ? 2 : level;
621 const int band = VP8EncBands[n + 1];
622 score_t base_score, last_pos_score;
623 score_t best_cur_score = MAX_COST;
624 int best_prev = 0; // default, in case
626 ss_cur[m].score = MAX_COST;
627 ss_cur[m].costs = costs[band][ctx];
628 if (level > MAX_LEVEL || level < 0) { // node is dead?
629 continue;
632 // Compute extra rate cost if last coeff's position is < 15
634 const score_t last_pos_cost =
635 (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
636 last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
640 // Compute delta_error = how much coding this level will
641 // subtract to max_error as distortion.
642 // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
643 const int new_error = coeff0 - level * Q;
644 const int delta_error =
645 kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
646 base_score = RDScoreTrellis(lambda, 0, delta_error);
649 // Inspect all possible non-dead predecessors. Retain only the best one.
650 for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
651 // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
652 // eliminated since their score can't be better than the current best.
653 const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
654 // Examine node assuming it's a non-terminal one.
655 const score_t score =
656 base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
657 if (score < best_cur_score) {
658 best_cur_score = score;
659 best_prev = p;
662 // Store best finding in current node.
663 cur->sign = sign;
664 cur->level = level;
665 cur->prev = best_prev;
666 ss_cur[m].score = best_cur_score;
668 // Now, record best terminal node (and thus best entry in the graph).
669 if (level != 0) {
670 const score_t score = best_cur_score + last_pos_score;
671 if (score < best_score) {
672 best_score = score;
673 best_path[0] = n; // best eob position
674 best_path[1] = m; // best node index
675 best_path[2] = best_prev; // best predecessor
681 // Fresh start
682 memset(in + first, 0, (16 - first) * sizeof(*in));
683 memset(out + first, 0, (16 - first) * sizeof(*out));
684 if (best_path[0] == -1) {
685 return 0; // skip!
689 // Unwind the best path.
690 // Note: best-prev on terminal node is not necessarily equal to the
691 // best_prev for non-terminal. So we patch best_path[2] in.
692 int nz = 0;
693 int best_node = best_path[1];
694 n = best_path[0];
695 NODE(n, best_node).prev = best_path[2]; // force best-prev for terminal
697 for (; n >= first; --n) {
698 const Node* const node = &NODE(n, best_node);
699 const int j = kZigzag[n];
700 out[n] = node->sign ? -node->level : node->level;
701 nz |= node->level;
702 in[j] = out[n] * mtx->q_[j];
703 best_node = node->prev;
705 return (nz != 0);
709 #undef NODE
711 //------------------------------------------------------------------------------
712 // Performs: difference, transform, quantize, back-transform, add
713 // all at once. Output is the reconstructed block in *yuv_out, and the
714 // quantized levels in *levels.
716 static int ReconstructIntra16(VP8EncIterator* const it,
717 VP8ModeScore* const rd,
718 uint8_t* const yuv_out,
719 int mode) {
720 const VP8Encoder* const enc = it->enc_;
721 const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
722 const uint8_t* const src = it->yuv_in_ + Y_OFF;
723 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
724 int nz = 0;
725 int n;
726 int16_t tmp[16][16], dc_tmp[16];
728 for (n = 0; n < 16; ++n) {
729 VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
731 VP8FTransformWHT(tmp[0], dc_tmp);
732 nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
734 if (DO_TRELLIS_I16 && it->do_trellis_) {
735 int x, y;
736 VP8IteratorNzToBytes(it);
737 for (y = 0, n = 0; y < 4; ++y) {
738 for (x = 0; x < 4; ++x, ++n) {
739 const int ctx = it->top_nz_[x] + it->left_nz_[y];
740 const int non_zero =
741 TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
742 &dqm->y1_, dqm->lambda_trellis_i16_);
743 it->top_nz_[x] = it->left_nz_[y] = non_zero;
744 rd->y_ac_levels[n][0] = 0;
745 nz |= non_zero << n;
748 } else {
749 for (n = 0; n < 16; ++n) {
750 // Zero-out the first coeff, so that: a) nz is correct below, and
751 // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
752 tmp[n][0] = 0;
753 nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
754 assert(rd->y_ac_levels[n][0] == 0);
758 // Transform back
759 VP8TransformWHT(dc_tmp, tmp[0]);
760 for (n = 0; n < 16; n += 2) {
761 VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
764 return nz;
767 static int ReconstructIntra4(VP8EncIterator* const it,
768 int16_t levels[16],
769 const uint8_t* const src,
770 uint8_t* const yuv_out,
771 int mode) {
772 const VP8Encoder* const enc = it->enc_;
773 const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
774 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
775 int nz = 0;
776 int16_t tmp[16];
778 VP8FTransform(src, ref, tmp);
779 if (DO_TRELLIS_I4 && it->do_trellis_) {
780 const int x = it->i4_ & 3, y = it->i4_ >> 2;
781 const int ctx = it->top_nz_[x] + it->left_nz_[y];
782 nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
783 dqm->lambda_trellis_i4_);
784 } else {
785 nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
787 VP8ITransform(ref, tmp, yuv_out, 0);
788 return nz;
791 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
792 uint8_t* const yuv_out, int mode) {
793 const VP8Encoder* const enc = it->enc_;
794 const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
795 const uint8_t* const src = it->yuv_in_ + U_OFF;
796 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
797 int nz = 0;
798 int n;
799 int16_t tmp[8][16];
801 for (n = 0; n < 8; ++n) {
802 VP8FTransform(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
804 if (DO_TRELLIS_UV && it->do_trellis_) {
805 int ch, x, y;
806 for (ch = 0, n = 0; ch <= 2; ch += 2) {
807 for (y = 0; y < 2; ++y) {
808 for (x = 0; x < 2; ++x, ++n) {
809 const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
810 const int non_zero =
811 TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
812 &dqm->uv_, dqm->lambda_trellis_uv_);
813 it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
814 nz |= non_zero << n;
818 } else {
819 for (n = 0; n < 8; ++n) {
820 nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
824 for (n = 0; n < 8; n += 2) {
825 VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
827 return (nz << 16);
830 //------------------------------------------------------------------------------
831 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
832 // Pick the mode is lower RD-cost = Rate + lambda * Distortion.
834 static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
835 // We look at the first three AC coefficients to determine what is the average
836 // delta between each sub-4x4 block.
837 const int v0 = abs(DCs[1]);
838 const int v1 = abs(DCs[4]);
839 const int v2 = abs(DCs[5]);
840 int max_v = (v0 > v1) ? v1 : v0;
841 max_v = (v2 > max_v) ? v2 : max_v;
842 if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
// Exchanges the two pointers (*a <-> *b).
static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}
851 static void SwapOut(VP8EncIterator* const it) {
852 SwapPtr(&it->yuv_out_, &it->yuv_out2_);
855 static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
856 score_t score = 0;
857 while (num_blocks-- > 0) { // TODO(skal): refine positional scoring?
858 int i;
859 for (i = 1; i < 16; ++i) { // omit DC, we're only interested in AC
860 score += (levels[i] != 0);
861 if (score > thresh) return 0;
863 levels += 16;
865 return 1;
868 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
869 const int kNumBlocks = 16;
870 VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
871 const int lambda = dqm->lambda_i16_;
872 const int tlambda = dqm->tlambda_;
873 const uint8_t* const src = it->yuv_in_ + Y_OFF;
874 VP8ModeScore rd16;
875 int mode;
877 rd->mode_i16 = -1;
878 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
879 uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF; // scratch buffer
880 int nz;
882 // Reconstruct
883 nz = ReconstructIntra16(it, &rd16, tmp_dst, mode);
885 // Measure RD-score
886 rd16.D = VP8SSE16x16(src, tmp_dst);
887 rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
888 : 0;
889 rd16.H = VP8FixedCostsI16[mode];
890 rd16.R = VP8GetCostLuma16(it, &rd16);
891 if (mode > 0 &&
892 IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
893 // penalty to avoid flat area to be mispredicted by complex mode
894 rd16.R += FLATNESS_PENALTY * kNumBlocks;
897 // Since we always examine Intra16 first, we can overwrite *rd directly.
898 SetRDScore(lambda, &rd16);
899 if (mode == 0 || rd16.score < rd->score) {
900 CopyScore(rd, &rd16);
901 rd->mode_i16 = mode;
902 rd->nz = nz;
903 memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels));
904 memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels));
905 SwapOut(it);
908 SetRDScore(dqm->lambda_mode_, rd); // finalize score for mode decision.
909 VP8SetIntra16Mode(it, rd->mode_i16);
911 // we have a blocky macroblock (only DCs are non-zero) with fairly high
912 // distortion, record max delta so we can later adjust the minimal filtering
913 // strength needed to smooth these blocks out.
914 if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
915 StoreMaxDelta(dqm, rd->y_dc_levels);
919 //------------------------------------------------------------------------------
921 // return the cost array corresponding to the surrounding prediction modes.
922 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
923 const uint8_t modes[16]) {
924 const int preds_w = it->enc_->preds_w_;
925 const int x = (it->i4_ & 3), y = it->i4_ >> 2;
926 const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
927 const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
928 return VP8FixedCostsI4[top][left];
// Exhaustive RD search over the 16 luma 4x4 sub-blocks. For each sub-block,
// tries every i4 prediction mode, keeps the best-scoring reconstruction in
// 'best_blocks' (it->yuv_out2_), and accumulates the total score in rd_best.
// Returns 1 if intra4x4 beats the current intra16x16 score stored in *rd
// (in which case *rd is overwritten with the i4 result), 0 otherwise.
static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;       // RD lambda for i4 mode scoring
  const int tlambda = dqm->tlambda_;        // lambda for the TDisto term
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
  int total_header_bits = 0;                // running i4 mode-signaling cost
  VP8ModeScore rd_best;

  // max_i4_header_bits_ == 0 disables the i4 search altogether.
  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;                     // best score for this sub-block
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;  // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];
      rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        // penalty to avoid a flat area being mispredicted by a complex mode
        rd_tmp.R += FLATNESS_PENALTY * kNumBlocks;
      }

      SetRDScore(lambda, &rd_tmp);
      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        // Swap scratch and best pointers so the winning reconstruction is
        // kept and the loser becomes the next scratch destination.
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    // Early-out: i4 already worse than the i16 score in *rd.
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    // Early-out: the cumulative mode-signaling cost exceeds the budget.
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    // Update non-zero context for the next sub-block's cost evaluation.
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
1013 //------------------------------------------------------------------------------
1015 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
1016 const int kNumBlocks = 8;
1017 const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1018 const int lambda = dqm->lambda_uv_;
1019 const uint8_t* const src = it->yuv_in_ + U_OFF;
1020 uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF; // scratch buffer
1021 uint8_t* const dst0 = it->yuv_out_ + U_OFF;
1022 VP8ModeScore rd_best;
1023 int mode;
1025 rd->mode_uv = -1;
1026 InitScore(&rd_best);
1027 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1028 VP8ModeScore rd_uv;
1030 // Reconstruct
1031 rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
1033 // Compute RD-score
1034 rd_uv.D = VP8SSE16x8(src, tmp_dst);
1035 rd_uv.SD = 0; // TODO: should we call TDisto? it tends to flatten areas.
1036 rd_uv.H = VP8FixedCostsUV[mode];
1037 rd_uv.R = VP8GetCostUV(it, &rd_uv);
1038 if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
1039 rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
1042 SetRDScore(lambda, &rd_uv);
1043 if (mode == 0 || rd_uv.score < rd_best.score) {
1044 CopyScore(&rd_best, &rd_uv);
1045 rd->mode_uv = mode;
1046 memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
1047 memcpy(dst0, tmp_dst, UV_SIZE); // TODO: SwapUVOut() ?
1050 VP8SetIntraUVMode(it, rd->mode_uv);
1051 AddScore(rd, &rd_best);
1054 //------------------------------------------------------------------------------
1055 // Final reconstruction and quantization.
1057 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
1058 const VP8Encoder* const enc = it->enc_;
1059 const int is_i16 = (it->mb_->type_ == 1);
1060 int nz = 0;
1062 if (is_i16) {
1063 nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
1064 } else {
1065 VP8IteratorStartI4(it);
1066 do {
1067 const int mode =
1068 it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
1069 const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
1070 uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_];
1071 VP8MakeIntra4Preds(it);
1072 nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1073 src, dst, mode) << it->i4_;
1074 } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF));
1077 nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_);
1078 rd->nz = nz;
1081 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
1082 static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
1083 const int is_i16 = (it->mb_->type_ == 1);
1084 score_t best_score = MAX_COST;
1086 if (try_both_i4_i16 || is_i16) {
1087 int mode;
1088 int best_mode = -1;
1089 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1090 const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
1091 const uint8_t* const src = it->yuv_in_ + Y_OFF;
1092 const score_t score = VP8SSE16x16(src, ref);
1093 if (score < best_score) {
1094 best_mode = mode;
1095 best_score = score;
1098 VP8SetIntra16Mode(it, best_mode);
1100 if (try_both_i4_i16 || !is_i16) {
1101 uint8_t modes_i4[16];
1102 // We don't evaluate the rate here, but just account for it through a
1103 // constant penalty (i4 mode usually needs more bits compared to i16).
1104 score_t score_i4 = (score_t)I4_PENALTY;
1106 VP8IteratorStartI4(it);
1107 do {
1108 int mode;
1109 int best_sub_mode = -1;
1110 score_t best_sub_score = MAX_COST;
1111 const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
1113 // TODO(skal): we don't really need the prediction pixels here,
1114 // but just the distortion against 'src'.
1115 VP8MakeIntra4Preds(it);
1116 for (mode = 0; mode < NUM_BMODES; ++mode) {
1117 const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
1118 const score_t score = VP8SSE4x4(src, ref);
1119 if (score < best_sub_score) {
1120 best_sub_mode = mode;
1121 best_sub_score = score;
1124 modes_i4[it->i4_] = best_sub_mode;
1125 score_i4 += best_sub_score;
1126 if (score_i4 >= best_score) break;
1127 } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
1128 if (score_i4 < best_score) {
1129 VP8SetIntra4Mode(it, modes_i4);
1134 //------------------------------------------------------------------------------
1135 // Entry point
1137 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
1138 VP8RDLevel rd_opt) {
1139 int is_skipped;
1140 const int method = it->enc_->method_;
1142 InitScore(rd);
1144 // We can perform predictions for Luma16x16 and Chroma8x8 already.
1145 // Luma4x4 predictions needs to be done as-we-go.
1146 VP8MakeLuma16Preds(it);
1147 VP8MakeChroma8Preds(it);
1149 if (rd_opt > RD_OPT_NONE) {
1150 it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1151 PickBestIntra16(it, rd);
1152 if (method >= 2) {
1153 PickBestIntra4(it, rd);
1155 PickBestUV(it, rd);
1156 if (rd_opt == RD_OPT_TRELLIS) { // finish off with trellis-optim now
1157 it->do_trellis_ = 1;
1158 SimpleQuantize(it, rd);
1160 } else {
1161 // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
1162 // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
1163 DistoRefine(it, (method >= 2));
1164 SimpleQuantize(it, rd);
1166 is_skipped = (rd->nz == 0);
1167 VP8SetSkip(it, is_skipped);
1168 return is_skipped;