// Imported from chromium-blink-merge.git: third_party/libwebp/enc/quant.c
// (blob e1d202b5a3b500758645b7a089369e4710b3eeb0)
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Quantization
//
// Author: Skal (pascal.massimino@gmail.com)
14 #include <assert.h>
15 #include <math.h>
16 #include <stdlib.h> // for abs()
18 #include "./vp8enci.h"
19 #include "./cost.h"
21 #define DO_TRELLIS_I4 1
22 #define DO_TRELLIS_I16 1 // not a huge gain, but ok at low bitrate.
23 #define DO_TRELLIS_UV 0 // disable trellis for UV. Risky. Not worth.
24 #define USE_TDISTO 1
26 #define MID_ALPHA 64 // neutral value for susceptibility
27 #define MIN_ALPHA 30 // lowest usable value for susceptibility
28 #define MAX_ALPHA 100 // higher meaningful value for susceptibility
30 #define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP
31 // power-law modulation. Must be strictly less than 1.
33 #define I4_PENALTY 4000 // Rate-penalty for quick i4/i16 decision
35 // number of non-zero coeffs below which we consider the block very flat
36 // (and apply a penalty to complex predictions)
37 #define FLATNESS_LIMIT_I16 10 // I16 mode
38 #define FLATNESS_LIMIT_I4 3 // I4 mode
39 #define FLATNESS_LIMIT_UV 2 // UV mode
40 #define FLATNESS_PENALTY 140 // roughly ~1bit per block
42 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
44 // #define DEBUG_BLOCK
//------------------------------------------------------------------------------

#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

// Debug-only helper: dumps the source samples, the reconstructed samples,
// their absolute delta, and all quantized levels for the macroblock the
// iterator 'it' currently points at.
static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 24; ++j) {
    if (j == 16) printf("\n");   // newline before the U/V block
    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]);
    printf("    ");
    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]);
    printf("    ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    // in i16 mode, the DC coefficients live in y_dc_levels, so skip index 0.
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif   // DEBUG_BLOCK
//------------------------------------------------------------------------------

#ifndef WEBP_INLINE
#define WEBP_INLINE inline   // fallback in case ./vp8enci.h did not define it
#endif

// Clamps 'v' to the closed interval [m, M].
static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}
// Coefficient scan order (zigzag), mapping scan position -> raster index.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

// DC quantizer step, indexed by the 7-bit quantizer index.
static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

// AC quantizer step, indexed by the 7-bit quantizer index.
static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

// AC quantizer step for the Y2 (luma-DC / WHT) plane.
static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
182 //------------------------------------------------------------------------------
183 // Initialize quantization parameters in VP8Matrix
185 // Returns the average quantizer
186 static int ExpandMatrix(VP8Matrix* const m, int type) {
187 int i, sum;
188 for (i = 0; i < 2; ++i) {
189 const int is_ac_coeff = (i > 0);
190 const int bias = kBiasMatrices[type][is_ac_coeff];
191 m->iq_[i] = (1 << QFIX) / m->q_[i];
192 m->bias_[i] = BIAS(bias);
193 // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
194 // * zero if coeff <= zthresh
195 // * non-zero if coeff > zthresh
196 m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
198 for (i = 2; i < 16; ++i) {
199 m->q_[i] = m->q_[1];
200 m->iq_[i] = m->iq_[1];
201 m->bias_[i] = m->bias_[1];
202 m->zthresh_[i] = m->zthresh_[1];
204 for (sum = 0, i = 0; i < 16; ++i) {
205 if (type == 0) { // we only use sharpening for AC luma coeffs
206 m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
207 } else {
208 m->sharpen_[i] = 0;
210 sum += m->q_[i];
212 return (sum + 8) >> 4;
215 static void SetupMatrices(VP8Encoder* enc) {
216 int i;
217 const int tlambda_scale =
218 (enc->method_ >= 4) ? enc->config_->sns_strength
219 : 0;
220 const int num_segments = enc->segment_hdr_.num_segments_;
221 for (i = 0; i < num_segments; ++i) {
222 VP8SegmentInfo* const m = &enc->dqm_[i];
223 const int q = m->quant_;
224 int q4, q16, quv;
225 m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
226 m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];
228 m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
229 m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
231 m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
232 m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
234 q4 = ExpandMatrix(&m->y1_, 0);
235 q16 = ExpandMatrix(&m->y2_, 1);
236 quv = ExpandMatrix(&m->uv_, 2);
238 m->lambda_i4_ = (3 * q4 * q4) >> 7;
239 m->lambda_i16_ = (3 * q16 * q16);
240 m->lambda_uv_ = (3 * quv * quv) >> 6;
241 m->lambda_mode_ = (1 * q4 * q4) >> 7;
242 m->lambda_trellis_i4_ = (7 * q4 * q4) >> 3;
243 m->lambda_trellis_i16_ = (q16 * q16) >> 2;
244 m->lambda_trellis_uv_ = (quv *quv) << 1;
245 m->tlambda_ = (tlambda_scale * q4) >> 5;
247 m->min_disto_ = 10 * m->y1_.q_[0]; // quantization-aware min disto
248 m->max_edge_ = 0;
252 //------------------------------------------------------------------------------
253 // Initialize filtering parameters
255 // Very small filter-strength values have close to no visual effect. So we can
256 // save a little decoding-CPU by turning filtering off for these.
257 #define FSTRENGTH_CUTOFF 2
259 static void SetupFilterStrength(VP8Encoder* const enc) {
260 int i;
261 // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
262 const int level0 = 5 * enc->config_->filter_strength;
263 for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
264 VP8SegmentInfo* const m = &enc->dqm_[i];
265 // We focus on the quantization of AC coeffs.
266 const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
267 const int base_strength =
268 VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
269 // Segments with lower complexity ('beta') will be less filtered.
270 const int f = base_strength * level0 / (256 + m->beta_);
271 m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
273 // We record the initial strength (mainly for the case of 1-segment only).
274 enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
275 enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
276 enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
279 //------------------------------------------------------------------------------
281 // Note: if you change the values below, remember that the max range
282 // allowed by the syntax for DQ_UV is [-16,16].
283 #define MAX_DQ_UV (6)
284 #define MIN_DQ_UV (-4)
286 // We want to emulate jpeg-like behaviour where the expected "good" quality
287 // is around q=75. Internally, our "good" middle is around c=50. So we
288 // map accordingly using linear piece-wise function
289 static double QualityToCompression(double c) {
290 const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
291 // The file size roughly scales as pow(quantizer, 3.). Actually, the
292 // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
293 // in the mid-quant range. So we scale the compressibility inversely to
294 // this power-law: quant ~= compression ^ 1/3. This law holds well for
295 // low quant. Finer modeling for high-quant would make use of kAcTable[]
296 // more explicitly.
297 const double v = pow(linear_c, 1 / 3.);
298 return v;
301 static double QualityToJPEGCompression(double c, double alpha) {
302 // We map the complexity 'alpha' and quality setting 'c' to a compression
303 // exponent empirically matched to the compression curve of libjpeg6b.
304 // On average, the WebP output size will be roughly similar to that of a
305 // JPEG file compressed with same quality factor.
306 const double amin = 0.30;
307 const double amax = 0.85;
308 const double exp_min = 0.4;
309 const double exp_max = 0.9;
310 const double slope = (exp_min - exp_max) / (amax - amin);
311 // Linearly interpolate 'expn' from exp_min to exp_max
312 // in the [amin, amax] range.
313 const double expn = (alpha > amax) ? exp_min
314 : (alpha < amin) ? exp_max
315 : exp_max + slope * (alpha - amin);
316 const double v = pow(c, expn);
317 return v;
320 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
321 const VP8SegmentInfo* const S2) {
322 return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
325 static void SimplifySegments(VP8Encoder* const enc) {
326 int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
327 const int num_segments = enc->segment_hdr_.num_segments_;
328 int num_final_segments = 1;
329 int s1, s2;
330 for (s1 = 1; s1 < num_segments; ++s1) { // find similar segments
331 const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
332 int found = 0;
333 // check if we already have similar segment
334 for (s2 = 0; s2 < num_final_segments; ++s2) {
335 const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
336 if (SegmentsAreEquivalent(S1, S2)) {
337 found = 1;
338 break;
341 map[s1] = s2;
342 if (!found) {
343 if (num_final_segments != s1) {
344 enc->dqm_[num_final_segments] = enc->dqm_[s1];
346 ++num_final_segments;
349 if (num_final_segments < num_segments) { // Remap
350 int i = enc->mb_w_ * enc->mb_h_;
351 while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
352 enc->segment_hdr_.num_segments_ = num_final_segments;
353 // Replicate the trailing segment infos (it's mostly cosmetics)
354 for (i = num_final_segments; i < num_segments; ++i) {
355 enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
360 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
361 int i;
362 int dq_uv_ac, dq_uv_dc;
363 const int num_segments = enc->segment_hdr_.num_segments_;
364 const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
365 const double Q = quality / 100.;
366 const double c_base = enc->config_->emulate_jpeg_size ?
367 QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
368 QualityToCompression(Q);
369 for (i = 0; i < num_segments; ++i) {
370 // We modulate the base coefficient to accommodate for the quantization
371 // susceptibility and allow denser segments to be quantized more.
372 const double expn = 1. - amp * enc->dqm_[i].alpha_;
373 const double c = pow(c_base, expn);
374 const int q = (int)(127. * (1. - c));
375 assert(expn > 0.);
376 enc->dqm_[i].quant_ = clip(q, 0, 127);
379 // purely indicative in the bitstream (except for the 1-segment case)
380 enc->base_quant_ = enc->dqm_[0].quant_;
382 // fill-in values for the unused segments (required by the syntax)
383 for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
384 enc->dqm_[i].quant_ = enc->base_quant_;
387 // uv_alpha_ is normally spread around ~60. The useful range is
388 // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
389 // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
390 dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
391 / (MAX_ALPHA - MIN_ALPHA);
392 // we rescale by the user-defined strength of adaptation
393 dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
394 // and make it safe.
395 dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
396 // We also boost the dc-uv-quant a little, based on sns-strength, since
397 // U/V channels are quite more reactive to high quants (flat DC-blocks
398 // tend to appear, and are displeasant).
399 dq_uv_dc = -4 * enc->config_->sns_strength / 100;
400 dq_uv_dc = clip(dq_uv_dc, -15, 15); // 4bit-signed max allowed
402 enc->dq_y1_dc_ = 0; // TODO(skal): dq-lum
403 enc->dq_y2_dc_ = 0;
404 enc->dq_y2_ac_ = 0;
405 enc->dq_uv_dc_ = dq_uv_dc;
406 enc->dq_uv_ac_ = dq_uv_ac;
408 SetupFilterStrength(enc); // initialize segments' filtering, eventually
410 if (num_segments > 1) SimplifySegments(enc);
412 SetupMatrices(enc); // finalize quantization matrices
415 //------------------------------------------------------------------------------
416 // Form the predictions in cache
418 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
419 const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
420 const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
422 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
423 const int VP8I4ModeOffsets[NUM_BMODES] = {
424 I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
427 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
428 const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
429 const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
430 VP8EncPredLuma16(it->yuv_p_, left, top);
433 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
434 const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
435 const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
436 VP8EncPredChroma8(it->yuv_p_, left, top);
439 void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
440 VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
443 //------------------------------------------------------------------------------
444 // Quantize
446 // Layout:
447 // +----+
448 // |YYYY| 0
449 // |YYYY| 4
450 // |YYYY| 8
451 // |YYYY| 12
452 // +----+
453 // |UUVV| 16
454 // |UUVV| 20
455 // +----+
457 const int VP8Scan[16 + 4 + 4] = {
458 // Luma
459 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
460 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
461 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
462 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
464 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
465 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
468 //------------------------------------------------------------------------------
469 // Distortion measurement
471 static const uint16_t kWeightY[16] = {
472 38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
475 static const uint16_t kWeightTrellis[16] = {
476 #if USE_TDISTO == 0
477 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
478 #else
479 30, 27, 19, 11,
480 27, 24, 17, 10,
481 19, 17, 12, 8,
482 11, 10, 8, 6
483 #endif
486 // Init/Copy the common fields in score.
487 static void InitScore(VP8ModeScore* const rd) {
488 rd->D = 0;
489 rd->SD = 0;
490 rd->R = 0;
491 rd->H = 0;
492 rd->nz = 0;
493 rd->score = MAX_COST;
496 static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
497 dst->D = src->D;
498 dst->SD = src->SD;
499 dst->R = src->R;
500 dst->H = src->H;
501 dst->nz = src->nz; // note that nz is not accumulated, but just copied.
502 dst->score = src->score;
505 static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
506 dst->D += src->D;
507 dst->SD += src->SD;
508 dst->R += src->R;
509 dst->H += src->H;
510 dst->nz |= src->nz; // here, new nz bits are accumulated.
511 dst->score += src->score;
514 //------------------------------------------------------------------------------
515 // Performs trellis-optimized quantization.
517 // Trellis
519 typedef struct {
520 int prev; // best previous
521 int level; // level
522 int sign; // sign of coeff_i
523 score_t cost; // bit cost
524 score_t error; // distortion = sum of (|coeff_i| - level_i * Q_i)^2
525 int ctx; // context (only depends on 'level'. Could be spared.)
526 } Node;
528 // If a coefficient was quantized to a value Q (using a neutral bias),
529 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
530 // We don't test negative values though.
531 #define MIN_DELTA 0 // how much lower level to try
532 #define MAX_DELTA 1 // how much higher
533 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
534 #define NODE(n, l) (nodes[(n) + 1][(l) + MIN_DELTA])
536 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
537 // TODO: incorporate the "* 256" in the tables?
538 rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD);
541 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
542 score_t distortion) {
543 return rate * lambda + 256 * distortion;
546 static int TrellisQuantizeBlock(const VP8EncIterator* const it,
547 int16_t in[16], int16_t out[16],
548 int ctx0, int coeff_type,
549 const VP8Matrix* const mtx,
550 int lambda) {
551 ProbaArray* const last_costs = it->enc_->proba_.coeffs_[coeff_type];
552 CostArray* const costs = it->enc_->proba_.level_cost_[coeff_type];
553 const int first = (coeff_type == 0) ? 1 : 0;
554 Node nodes[17][NUM_NODES];
555 int best_path[3] = {-1, -1, -1}; // store best-last/best-level/best-previous
556 score_t best_score;
557 int best_node;
558 int last = first - 1;
559 int n, m, p, nz;
562 score_t cost;
563 score_t max_error;
564 const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
565 const int last_proba = last_costs[VP8EncBands[first]][ctx0][0];
567 // compute maximal distortion.
568 max_error = 0;
569 for (n = first; n < 16; ++n) {
570 const int j = kZigzag[n];
571 const int err = in[j] * in[j];
572 max_error += kWeightTrellis[j] * err;
573 if (err > thresh) last = n;
575 // we don't need to go inspect up to n = 16 coeffs. We can just go up
576 // to last + 1 (inclusive) without losing much.
577 if (last < 15) ++last;
579 // compute 'skip' score. This is the max score one can do.
580 cost = VP8BitCost(0, last_proba);
581 best_score = RDScoreTrellis(lambda, cost, max_error);
583 // initialize source node.
584 n = first - 1;
585 for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
586 NODE(n, m).cost = 0;
587 NODE(n, m).error = max_error;
588 NODE(n, m).ctx = ctx0;
592 // traverse trellis.
593 for (n = first; n <= last; ++n) {
594 const int j = kZigzag[n];
595 const int Q = mtx->q_[j];
596 const int iQ = mtx->iq_[j];
597 const int B = BIAS(0x00); // neutral bias
598 // note: it's important to take sign of the _original_ coeff,
599 // so we don't have to consider level < 0 afterward.
600 const int sign = (in[j] < 0);
601 const int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
602 int level0 = QUANTDIV(coeff0, iQ, B);
603 if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
605 // test all alternate level values around level0.
606 for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
607 Node* const cur = &NODE(n, m);
608 int delta_error, new_error;
609 score_t cur_score = MAX_COST;
610 int level = level0 + m;
611 int last_proba;
613 cur->sign = sign;
614 cur->level = level;
615 cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2;
616 if (level > MAX_LEVEL || level < 0) { // node is dead?
617 cur->cost = MAX_COST;
618 continue;
620 last_proba = last_costs[VP8EncBands[n + 1]][cur->ctx][0];
622 // Compute delta_error = how much coding this level will
623 // subtract as distortion to max_error
624 new_error = coeff0 - level * Q;
625 delta_error =
626 kWeightTrellis[j] * (coeff0 * coeff0 - new_error * new_error);
628 // Inspect all possible non-dead predecessors. Retain only the best one.
629 for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
630 const Node* const prev = &NODE(n - 1, p);
631 const int prev_ctx = prev->ctx;
632 const uint16_t* const tcost = costs[VP8EncBands[n]][prev_ctx];
633 const score_t total_error = prev->error - delta_error;
634 score_t cost, base_cost, score;
636 if (prev->cost >= MAX_COST) { // dead node?
637 continue;
640 // Base cost of both terminal/non-terminal
641 base_cost = prev->cost + VP8LevelCost(tcost, level);
643 // Examine node assuming it's a non-terminal one.
644 cost = base_cost;
645 if (level && n < 15) {
646 cost += VP8BitCost(1, last_proba);
648 score = RDScoreTrellis(lambda, cost, total_error);
649 if (score < cur_score) {
650 cur_score = score;
651 cur->cost = cost;
652 cur->error = total_error;
653 cur->prev = p;
656 // Now, record best terminal node (and thus best entry in the graph).
657 if (level) {
658 cost = base_cost;
659 if (n < 15) cost += VP8BitCost(0, last_proba);
660 score = RDScoreTrellis(lambda, cost, total_error);
661 if (score < best_score) {
662 best_score = score;
663 best_path[0] = n; // best eob position
664 best_path[1] = m; // best level
665 best_path[2] = p; // best predecessor
672 // Fresh start
673 memset(in + first, 0, (16 - first) * sizeof(*in));
674 memset(out + first, 0, (16 - first) * sizeof(*out));
675 if (best_path[0] == -1) {
676 return 0; // skip!
679 // Unwind the best path.
680 // Note: best-prev on terminal node is not necessarily equal to the
681 // best_prev for non-terminal. So we patch best_path[2] in.
682 n = best_path[0];
683 best_node = best_path[1];
684 NODE(n, best_node).prev = best_path[2]; // force best-prev for terminal
685 nz = 0;
687 for (; n >= first; --n) {
688 const Node* const node = &NODE(n, best_node);
689 const int j = kZigzag[n];
690 out[n] = node->sign ? -node->level : node->level;
691 nz |= (node->level != 0);
692 in[j] = out[n] * mtx->q_[j];
693 best_node = node->prev;
695 return nz;
698 #undef NODE
700 //------------------------------------------------------------------------------
701 // Performs: difference, transform, quantize, back-transform, add
702 // all at once. Output is the reconstructed block in *yuv_out, and the
703 // quantized levels in *levels.
705 static int ReconstructIntra16(VP8EncIterator* const it,
706 VP8ModeScore* const rd,
707 uint8_t* const yuv_out,
708 int mode) {
709 VP8Encoder* const enc = it->enc_;
710 const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
711 const uint8_t* const src = it->yuv_in_ + Y_OFF;
712 VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
713 int nz = 0;
714 int n;
715 int16_t tmp[16][16], dc_tmp[16];
717 for (n = 0; n < 16; ++n) {
718 VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
720 VP8FTransformWHT(tmp[0], dc_tmp);
721 nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
723 if (DO_TRELLIS_I16 && it->do_trellis_) {
724 int x, y;
725 VP8IteratorNzToBytes(it);
726 for (y = 0, n = 0; y < 4; ++y) {
727 for (x = 0; x < 4; ++x, ++n) {
728 const int ctx = it->top_nz_[x] + it->left_nz_[y];
729 const int non_zero =
730 TrellisQuantizeBlock(it, tmp[n], rd->y_ac_levels[n], ctx, 0,
731 &dqm->y1_, dqm->lambda_trellis_i16_);
732 it->top_nz_[x] = it->left_nz_[y] = non_zero;
733 nz |= non_zero << n;
736 } else {
737 for (n = 0; n < 16; ++n) {
738 nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], 1, &dqm->y1_) << n;
742 // Transform back
743 VP8ITransformWHT(dc_tmp, tmp[0]);
744 for (n = 0; n < 16; n += 2) {
745 VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
748 return nz;
751 static int ReconstructIntra4(VP8EncIterator* const it,
752 int16_t levels[16],
753 const uint8_t* const src,
754 uint8_t* const yuv_out,
755 int mode) {
756 const VP8Encoder* const enc = it->enc_;
757 const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
758 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
759 int nz = 0;
760 int16_t tmp[16];
762 VP8FTransform(src, ref, tmp);
763 if (DO_TRELLIS_I4 && it->do_trellis_) {
764 const int x = it->i4_ & 3, y = it->i4_ >> 2;
765 const int ctx = it->top_nz_[x] + it->left_nz_[y];
766 nz = TrellisQuantizeBlock(it, tmp, levels, ctx, 3, &dqm->y1_,
767 dqm->lambda_trellis_i4_);
768 } else {
769 nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_);
771 VP8ITransform(ref, tmp, yuv_out, 0);
772 return nz;
775 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
776 uint8_t* const yuv_out, int mode) {
777 const VP8Encoder* const enc = it->enc_;
778 const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
779 const uint8_t* const src = it->yuv_in_ + U_OFF;
780 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
781 int nz = 0;
782 int n;
783 int16_t tmp[8][16];
785 for (n = 0; n < 8; ++n) {
786 VP8FTransform(src + VP8Scan[16 + n], ref + VP8Scan[16 + n], tmp[n]);
788 if (DO_TRELLIS_UV && it->do_trellis_) {
789 int ch, x, y;
790 for (ch = 0, n = 0; ch <= 2; ch += 2) {
791 for (y = 0; y < 2; ++y) {
792 for (x = 0; x < 2; ++x, ++n) {
793 const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
794 const int non_zero =
795 TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2,
796 &dqm->uv_, dqm->lambda_trellis_uv_);
797 it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
798 nz |= non_zero << n;
802 } else {
803 for (n = 0; n < 8; ++n) {
804 nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], 0, &dqm->uv_) << n;
808 for (n = 0; n < 8; n += 2) {
809 VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n], 1);
811 return (nz << 16);
814 //------------------------------------------------------------------------------
815 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
816 // Pick the mode is lower RD-cost = Rate + lambda * Distortion.
818 static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
819 // We look at the first three AC coefficients to determine what is the average
820 // delta between each sub-4x4 block.
821 const int v0 = abs(DCs[1]);
822 const int v1 = abs(DCs[4]);
823 const int v2 = abs(DCs[5]);
824 int max_v = (v0 > v1) ? v1 : v0;
825 max_v = (v2 > max_v) ? v2 : max_v;
826 if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
829 static void SwapPtr(uint8_t** a, uint8_t** b) {
830 uint8_t* const tmp = *a;
831 *a = *b;
832 *b = tmp;
835 static void SwapOut(VP8EncIterator* const it) {
836 SwapPtr(&it->yuv_out_, &it->yuv_out2_);
839 static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
840 score_t score = 0;
841 while (num_blocks-- > 0) { // TODO(skal): refine positional scoring?
842 int i;
843 for (i = 1; i < 16; ++i) { // omit DC, we're only interested in AC
844 score += (levels[i] != 0);
845 if (score > thresh) return 0;
847 levels += 16;
849 return 1;
852 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
853 const int kNumBlocks = 16;
854 VP8Encoder* const enc = it->enc_;
855 VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
856 const int lambda = dqm->lambda_i16_;
857 const int tlambda = dqm->tlambda_;
858 const uint8_t* const src = it->yuv_in_ + Y_OFF;
859 VP8ModeScore rd16;
860 int mode;
862 rd->mode_i16 = -1;
863 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
864 uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF; // scratch buffer
865 int nz;
867 // Reconstruct
868 nz = ReconstructIntra16(it, &rd16, tmp_dst, mode);
870 // Measure RD-score
871 rd16.D = VP8SSE16x16(src, tmp_dst);
872 rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
873 : 0;
874 rd16.H = VP8FixedCostsI16[mode];
875 rd16.R = VP8GetCostLuma16(it, &rd16);
876 if (mode > 0 &&
877 IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
878 // penalty to avoid flat area to be mispredicted by complex mode
879 rd16.R += FLATNESS_PENALTY * kNumBlocks;
882 // Since we always examine Intra16 first, we can overwrite *rd directly.
883 SetRDScore(lambda, &rd16);
884 if (mode == 0 || rd16.score < rd->score) {
885 CopyScore(rd, &rd16);
886 rd->mode_i16 = mode;
887 rd->nz = nz;
888 memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels));
889 memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels));
890 SwapOut(it);
893 SetRDScore(dqm->lambda_mode_, rd); // finalize score for mode decision.
894 VP8SetIntra16Mode(it, rd->mode_i16);
896 // we have a blocky macroblock (only DCs are non-zero) with fairly high
897 // distortion, record max delta so we can later adjust the minimal filtering
898 // strength needed to smooth these blocks out.
899 if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
900 StoreMaxDelta(dqm, rd->y_dc_levels);
904 //------------------------------------------------------------------------------
906 // return the cost array corresponding to the surrounding prediction modes.
907 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
908 const uint8_t modes[16]) {
909 const int preds_w = it->enc_->preds_w_;
910 const int x = (it->i4_ & 3), y = it->i4_ >> 2;
911 const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
912 const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
913 return VP8FixedCostsI4[top][left];
916 static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
917 const VP8Encoder* const enc = it->enc_;
918 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
919 const int lambda = dqm->lambda_i4_;
920 const int tlambda = dqm->tlambda_;
921 const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
922 uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
923 int total_header_bits = 0;
924 VP8ModeScore rd_best;
926 if (enc->max_i4_header_bits_ == 0) {
927 return 0;
930 InitScore(&rd_best);
931 rd_best.H = 211; // '211' is the value of VP8BitCost(0, 145)
932 SetRDScore(dqm->lambda_mode_, &rd_best);
933 VP8IteratorStartI4(it);
934 do {
935 const int kNumBlocks = 1;
936 VP8ModeScore rd_i4;
937 int mode;
938 int best_mode = -1;
939 const uint8_t* const src = src0 + VP8Scan[it->i4_];
940 const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
941 uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
942 uint8_t* tmp_dst = it->yuv_p_ + I4TMP; // scratch buffer.
944 InitScore(&rd_i4);
945 VP8MakeIntra4Preds(it);
946 for (mode = 0; mode < NUM_BMODES; ++mode) {
947 VP8ModeScore rd_tmp;
948 int16_t tmp_levels[16];
950 // Reconstruct
951 rd_tmp.nz =
952 ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
954 // Compute RD-score
955 rd_tmp.D = VP8SSE4x4(src, tmp_dst);
956 rd_tmp.SD =
957 tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
958 : 0;
959 rd_tmp.H = mode_costs[mode];
960 rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
961 if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
962 rd_tmp.R += FLATNESS_PENALTY * kNumBlocks;
965 SetRDScore(lambda, &rd_tmp);
966 if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
967 CopyScore(&rd_i4, &rd_tmp);
968 best_mode = mode;
969 SwapPtr(&tmp_dst, &best_block);
970 memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels));
973 SetRDScore(dqm->lambda_mode_, &rd_i4);
974 AddScore(&rd_best, &rd_i4);
975 if (rd_best.score >= rd->score) {
976 return 0;
978 total_header_bits += (int)rd_i4.H; // <- equal to mode_costs[best_mode];
979 if (total_header_bits > enc->max_i4_header_bits_) {
980 return 0;
982 // Copy selected samples if not in the right place already.
983 if (best_block != best_blocks + VP8Scan[it->i4_]) {
984 VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
986 rd->modes_i4[it->i4_] = best_mode;
987 it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
988 } while (VP8IteratorRotateI4(it, best_blocks));
990 // finalize state
991 CopyScore(rd, &rd_best);
992 VP8SetIntra4Mode(it, rd->modes_i4);
993 SwapOut(it);
994 memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
995 return 1; // select intra4x4 over intra16x16
998 //------------------------------------------------------------------------------
1000 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
1001 const int kNumBlocks = 8;
1002 const VP8Encoder* const enc = it->enc_;
1003 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
1004 const int lambda = dqm->lambda_uv_;
1005 const uint8_t* const src = it->yuv_in_ + U_OFF;
1006 uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF; // scratch buffer
1007 uint8_t* const dst0 = it->yuv_out_ + U_OFF;
1008 VP8ModeScore rd_best;
1009 int mode;
1011 rd->mode_uv = -1;
1012 InitScore(&rd_best);
1013 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1014 VP8ModeScore rd_uv;
1016 // Reconstruct
1017 rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
1019 // Compute RD-score
1020 rd_uv.D = VP8SSE16x8(src, tmp_dst);
1021 rd_uv.SD = 0; // TODO: should we call TDisto? it tends to flatten areas.
1022 rd_uv.H = VP8FixedCostsUV[mode];
1023 rd_uv.R = VP8GetCostUV(it, &rd_uv);
1024 if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
1025 rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
1028 SetRDScore(lambda, &rd_uv);
1029 if (mode == 0 || rd_uv.score < rd_best.score) {
1030 CopyScore(&rd_best, &rd_uv);
1031 rd->mode_uv = mode;
1032 memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
1033 memcpy(dst0, tmp_dst, UV_SIZE); // TODO: SwapUVOut() ?
1036 VP8SetIntraUVMode(it, rd->mode_uv);
1037 AddScore(rd, &rd_best);
1040 //------------------------------------------------------------------------------
1041 // Final reconstruction and quantization.
1043 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
1044 const VP8Encoder* const enc = it->enc_;
1045 const int is_i16 = (it->mb_->type_ == 1);
1046 int nz = 0;
1048 if (is_i16) {
1049 nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
1050 } else {
1051 VP8IteratorStartI4(it);
1052 do {
1053 const int mode =
1054 it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
1055 const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
1056 uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_];
1057 VP8MakeIntra4Preds(it);
1058 nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1059 src, dst, mode) << it->i4_;
1060 } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF));
1063 nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_);
1064 rd->nz = nz;
1067 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
1068 static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
1069 const int is_i16 = (it->mb_->type_ == 1);
1070 score_t best_score = MAX_COST;
1072 if (try_both_i4_i16 || is_i16) {
1073 int mode;
1074 int best_mode = -1;
1075 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1076 const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
1077 const uint8_t* const src = it->yuv_in_ + Y_OFF;
1078 const score_t score = VP8SSE16x16(src, ref);
1079 if (score < best_score) {
1080 best_mode = mode;
1081 best_score = score;
1084 VP8SetIntra16Mode(it, best_mode);
1086 if (try_both_i4_i16 || !is_i16) {
1087 uint8_t modes_i4[16];
1088 // We don't evaluate the rate here, but just account for it through a
1089 // constant penalty (i4 mode usually needs more bits compared to i16).
1090 score_t score_i4 = (score_t)I4_PENALTY;
1092 VP8IteratorStartI4(it);
1093 do {
1094 int mode;
1095 int best_sub_mode = -1;
1096 score_t best_sub_score = MAX_COST;
1097 const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
1099 // TODO(skal): we don't really need the prediction pixels here,
1100 // but just the distortion against 'src'.
1101 VP8MakeIntra4Preds(it);
1102 for (mode = 0; mode < NUM_BMODES; ++mode) {
1103 const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
1104 const score_t score = VP8SSE4x4(src, ref);
1105 if (score < best_sub_score) {
1106 best_sub_mode = mode;
1107 best_sub_score = score;
1110 modes_i4[it->i4_] = best_sub_mode;
1111 score_i4 += best_sub_score;
1112 if (score_i4 >= best_score) break;
1113 } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
1114 if (score_i4 < best_score) {
1115 VP8SetIntra4Mode(it, modes_i4);
1120 //------------------------------------------------------------------------------
1121 // Entry point
1123 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
1124 VP8RDLevel rd_opt) {
1125 int is_skipped;
1126 const int method = it->enc_->method_;
1128 InitScore(rd);
1130 // We can perform predictions for Luma16x16 and Chroma8x8 already.
1131 // Luma4x4 predictions needs to be done as-we-go.
1132 VP8MakeLuma16Preds(it);
1133 VP8MakeChroma8Preds(it);
1135 if (rd_opt > RD_OPT_NONE) {
1136 it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1137 PickBestIntra16(it, rd);
1138 if (method >= 2) {
1139 PickBestIntra4(it, rd);
1141 PickBestUV(it, rd);
1142 if (rd_opt == RD_OPT_TRELLIS) { // finish off with trellis-optim now
1143 it->do_trellis_ = 1;
1144 SimpleQuantize(it, rd);
1146 } else {
1147 // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
1148 // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
1149 DistoRefine(it, (method >= 2));
1150 SimpleQuantize(it, rd);
1152 is_skipped = (rd->nz == 0);
1153 VP8SetSkip(it, is_skipped);
1154 return is_skipped;