// Source: third_party/libwebp/enc/quant.c (Chromium third-party snapshot,
// blob 9130a41609baa781ca1a6633a56e2c0a2659fbe1).
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // Quantization
12 // Author: Skal (pascal.massimino@gmail.com)
14 #include <assert.h>
15 #include <math.h>
16 #include <stdlib.h> // for abs()
18 #include "./vp8enci.h"
19 #include "./cost.h"
21 #define DO_TRELLIS_I4 1
22 #define DO_TRELLIS_I16 1 // not a huge gain, but ok at low bitrate.
23 #define DO_TRELLIS_UV 0 // disable trellis for UV. Risky. Not worth.
24 #define USE_TDISTO 1
26 #define MID_ALPHA 64 // neutral value for susceptibility
27 #define MIN_ALPHA 30 // lowest usable value for susceptibility
28 #define MAX_ALPHA 100 // higher meaningful value for susceptibility
30 #define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP
31 // power-law modulation. Must be strictly less than 1.
33 #define I4_PENALTY 4000 // Rate-penalty for quick i4/i16 decision
35 // number of non-zero coeffs below which we consider the block very flat
36 // (and apply a penalty to complex predictions)
37 #define FLATNESS_LIMIT_I16 10 // I16 mode
38 #define FLATNESS_LIMIT_I4 3 // I4 mode
39 #define FLATNESS_LIMIT_UV 2 // UV mode
40 #define FLATNESS_PENALTY 140 // roughly ~1bit per block
42 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
44 // #define DEBUG_BLOCK
46 //------------------------------------------------------------------------------
#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

// Debug-only dump of one macroblock: source vs reconstructed samples,
// per-plane deltas, the RD score fields and the quantized levels.
static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 24; ++j) {
    if (j == 16) printf("\n");   // newline before the U/V block
    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]);
    printf("  ");
    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]);
    printf("  ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
    (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    // in i16 mode the DC coefficient lives in y_dc_levels, so skip index 0
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif   // DEBUG_BLOCK
101 //------------------------------------------------------------------------------
103 static WEBP_INLINE int clip(int v, int m, int M) {
104 return v < m ? m : v > M ? M : v;
// Zig-zag scan order mapping coefficient position -> raster index in a 4x4
// block (standard VP8 ordering).
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
// DC quantizer step, indexed by the 7-bit quantizer index [0..127].
static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};
// AC quantizer step, indexed by the 7-bit quantizer index [0..127].
static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};
// Boosted AC quantizer for the Y2 (luma-DC WHT) channel, indexed by the
// 7-bit quantizer index [0..127].
static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};
// Quantization rounding bias per channel: [luma-ac, luma-dc, chroma][dc, ac].
static const uint8_t kBiasMatrices[3][2] = {
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};
// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
182 //------------------------------------------------------------------------------
183 // Initialize quantization parameters in VP8Matrix
185 // Returns the average quantizer
186 static int ExpandMatrix(VP8Matrix* const m, int type) {
187 int i, sum;
188 for (i = 0; i < 2; ++i) {
189 const int is_ac_coeff = (i > 0);
190 const int bias = kBiasMatrices[type][is_ac_coeff];
191 m->iq_[i] = (1 << QFIX) / m->q_[i];
192 m->bias_[i] = BIAS(bias);
193 // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
194 // * zero if coeff <= zthresh
195 // * non-zero if coeff > zthresh
196 m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
198 for (i = 2; i < 16; ++i) {
199 m->q_[i] = m->q_[1];
200 m->iq_[i] = m->iq_[1];
201 m->bias_[i] = m->bias_[1];
202 m->zthresh_[i] = m->zthresh_[1];
204 for (sum = 0, i = 0; i < 16; ++i) {
205 if (type == 0) { // we only use sharpening for AC luma coeffs
206 m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
207 } else {
208 m->sharpen_[i] = 0;
210 sum += m->q_[i];
212 return (sum + 8) >> 4;
215 static void SetupMatrices(VP8Encoder* enc) {
216 int i;
217 const int tlambda_scale =
218 (enc->method_ >= 4) ? enc->config_->sns_strength
219 : 0;
220 const int num_segments = enc->segment_hdr_.num_segments_;
221 for (i = 0; i < num_segments; ++i) {
222 VP8SegmentInfo* const m = &enc->dqm_[i];
223 const int q = m->quant_;
224 int q4, q16, quv;
225 m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
226 m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];
228 m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
229 m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
231 m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
232 m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
234 q4 = ExpandMatrix(&m->y1_, 0);
235 q16 = ExpandMatrix(&m->y2_, 1);
236 quv = ExpandMatrix(&m->uv_, 2);
238 m->lambda_i4_ = (3 * q4 * q4) >> 7;
239 m->lambda_i16_ = (3 * q16 * q16);
240 m->lambda_uv_ = (3 * quv * quv) >> 6;
241 m->lambda_mode_ = (1 * q4 * q4) >> 7;
242 m->lambda_trellis_i4_ = (7 * q4 * q4) >> 3;
243 m->lambda_trellis_i16_ = (q16 * q16) >> 2;
244 m->lambda_trellis_uv_ = (quv *quv) << 1;
245 m->tlambda_ = (tlambda_scale * q4) >> 5;
247 m->min_disto_ = 10 * m->y1_.q_[0]; // quantization-aware min disto
248 m->max_edge_ = 0;
252 //------------------------------------------------------------------------------
253 // Initialize filtering parameters
255 // Very small filter-strength values have close to no visual effect. So we can
256 // save a little decoding-CPU by turning filtering off for these.
257 #define FSTRENGTH_CUTOFF 2
259 static void SetupFilterStrength(VP8Encoder* const enc) {
260 int i;
261 // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
262 const int level0 = 5 * enc->config_->filter_strength;
263 for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
264 VP8SegmentInfo* const m = &enc->dqm_[i];
265 // We focus on the quantization of AC coeffs.
266 const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
267 const int base_strength =
268 VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
269 // Segments with lower complexity ('beta') will be less filtered.
270 const int f = base_strength * level0 / (256 + m->beta_);
271 m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
273 // We record the initial strength (mainly for the case of 1-segment only).
274 enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
275 enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
276 enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
279 //------------------------------------------------------------------------------
281 // Note: if you change the values below, remember that the max range
282 // allowed by the syntax for DQ_UV is [-16,16].
283 #define MAX_DQ_UV (6)
284 #define MIN_DQ_UV (-4)
286 // We want to emulate jpeg-like behaviour where the expected "good" quality
287 // is around q=75. Internally, our "good" middle is around c=50. So we
288 // map accordingly using linear piece-wise function
289 static double QualityToCompression(double c) {
290 const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
291 // The file size roughly scales as pow(quantizer, 3.). Actually, the
292 // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
293 // in the mid-quant range. So we scale the compressibility inversely to
294 // this power-law: quant ~= compression ^ 1/3. This law holds well for
295 // low quant. Finer modeling for high-quant would make use of kAcTable[]
296 // more explicitly.
297 const double v = pow(linear_c, 1 / 3.);
298 return v;
301 static double QualityToJPEGCompression(double c, double alpha) {
302 // We map the complexity 'alpha' and quality setting 'c' to a compression
303 // exponent empirically matched to the compression curve of libjpeg6b.
304 // On average, the WebP output size will be roughly similar to that of a
305 // JPEG file compressed with same quality factor.
306 const double amin = 0.30;
307 const double amax = 0.85;
308 const double exp_min = 0.4;
309 const double exp_max = 0.9;
310 const double slope = (exp_min - exp_max) / (amax - amin);
311 // Linearly interpolate 'expn' from exp_min to exp_max
312 // in the [amin, amax] range.
313 const double expn = (alpha > amax) ? exp_min
314 : (alpha < amin) ? exp_max
315 : exp_max + slope * (alpha - amin);
316 const double v = pow(c, expn);
317 return v;
320 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
321 const VP8SegmentInfo* const S2) {
322 return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
325 static void SimplifySegments(VP8Encoder* const enc) {
326 int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
327 const int num_segments = enc->segment_hdr_.num_segments_;
328 int num_final_segments = 1;
329 int s1, s2;
330 for (s1 = 1; s1 < num_segments; ++s1) { // find similar segments
331 const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
332 int found = 0;
333 // check if we already have similar segment
334 for (s2 = 0; s2 < num_final_segments; ++s2) {
335 const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
336 if (SegmentsAreEquivalent(S1, S2)) {
337 found = 1;
338 break;
341 map[s1] = s2;
342 if (!found) {
343 if (num_final_segments != s1) {
344 enc->dqm_[num_final_segments] = enc->dqm_[s1];
346 ++num_final_segments;
349 if (num_final_segments < num_segments) { // Remap
350 int i = enc->mb_w_ * enc->mb_h_;
351 while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
352 enc->segment_hdr_.num_segments_ = num_final_segments;
353 // Replicate the trailing segment infos (it's mostly cosmetics)
354 for (i = num_final_segments; i < num_segments; ++i) {
355 enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
360 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
361 int i;
362 int dq_uv_ac, dq_uv_dc;
363 const int num_segments = enc->segment_hdr_.num_segments_;
364 const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
365 const double Q = quality / 100.;
366 const double c_base = enc->config_->emulate_jpeg_size ?
367 QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
368 QualityToCompression(Q);
369 for (i = 0; i < num_segments; ++i) {
370 // We modulate the base coefficient to accommodate for the quantization
371 // susceptibility and allow denser segments to be quantized more.
372 const double expn = 1. - amp * enc->dqm_[i].alpha_;
373 const double c = pow(c_base, expn);
374 const int q = (int)(127. * (1. - c));
375 assert(expn > 0.);
376 enc->dqm_[i].quant_ = clip(q, 0, 127);
379 // purely indicative in the bitstream (except for the 1-segment case)
380 enc->base_quant_ = enc->dqm_[0].quant_;
382 // fill-in values for the unused segments (required by the syntax)
383 for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
384 enc->dqm_[i].quant_ = enc->base_quant_;
387 // uv_alpha_ is normally spread around ~60. The useful range is
388 // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
389 // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
390 dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
391 / (MAX_ALPHA - MIN_ALPHA);
392 // we rescale by the user-defined strength of adaptation
393 dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
394 // and make it safe.
395 dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
396 // We also boost the dc-uv-quant a little, based on sns-strength, since
397 // U/V channels are quite more reactive to high quants (flat DC-blocks
398 // tend to appear, and are unpleasant).
399 dq_uv_dc = -4 * enc->config_->sns_strength / 100;
400 dq_uv_dc = clip(dq_uv_dc, -15, 15); // 4bit-signed max allowed
402 enc->dq_y1_dc_ = 0; // TODO(skal): dq-lum
403 enc->dq_y2_dc_ = 0;
404 enc->dq_y2_ac_ = 0;
405 enc->dq_uv_dc_ = dq_uv_dc;
406 enc->dq_uv_ac_ = dq_uv_ac;
408 SetupFilterStrength(enc); // initialize segments' filtering, eventually
410 if (num_segments > 1) SimplifySegments(enc);
412 SetupMatrices(enc); // finalize quantization matrices
415 //------------------------------------------------------------------------------
416 // Form the predictions in cache
418 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
419 const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
420 const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
422 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
423 const int VP8I4ModeOffsets[NUM_BMODES] = {
424 I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
427 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
428 const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
429 const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
430 VP8EncPredLuma16(it->yuv_p_, left, top);
433 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
434 const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
435 const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
436 VP8EncPredChroma8(it->yuv_p_, left, top);
439 void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
440 VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
443 //------------------------------------------------------------------------------
444 // Quantize
446 // Layout:
447 // +----+
448 // |YYYY| 0
449 // |YYYY| 4
450 // |YYYY| 8
451 // |YYYY| 12
452 // +----+
453 // |UUVV| 16
454 // |UUVV| 20
455 // +----+
457 const int VP8Scan[16] = { // Luma
458 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
459 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
460 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
461 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
464 static const int VP8ScanUV[4 + 4] = {
465 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
466 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
469 //------------------------------------------------------------------------------
470 // Distortion measurement
// Frequency weighting for the texture-distortion metric (luma).
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};
// Frequency weighting for trellis distortion; flat (all 16) when the
// texture-distortion metric is disabled.
static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};
487 // Init/Copy the common fields in score.
488 static void InitScore(VP8ModeScore* const rd) {
489 rd->D = 0;
490 rd->SD = 0;
491 rd->R = 0;
492 rd->H = 0;
493 rd->nz = 0;
494 rd->score = MAX_COST;
497 static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
498 dst->D = src->D;
499 dst->SD = src->SD;
500 dst->R = src->R;
501 dst->H = src->H;
502 dst->nz = src->nz; // note that nz is not accumulated, but just copied.
503 dst->score = src->score;
506 static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
507 dst->D += src->D;
508 dst->SD += src->SD;
509 dst->R += src->R;
510 dst->H += src->H;
511 dst->nz |= src->nz; // here, new nz bits are accumulated.
512 dst->score += src->score;
515 //------------------------------------------------------------------------------
516 // Performs trellis-optimized quantization.
518 // Trellis node
519 typedef struct {
520 int8_t prev; // best previous node
521 int8_t sign; // sign of coeff_i
522 int16_t level; // level
523 } Node;
525 // Score state
526 typedef struct {
527 score_t score; // partial RD score
528 const uint16_t* costs; // shortcut to cost tables
529 } ScoreState;
531 // If a coefficient was quantized to a value Q (using a neutral bias),
532 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
533 // We don't test negative values though.
534 #define MIN_DELTA 0 // how much lower level to try
535 #define MAX_DELTA 1 // how much higher
536 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
537 #define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
538 #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
540 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
541 // TODO: incorporate the "* 256" in the tables?
542 rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD);
545 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
546 score_t distortion) {
547 return rate * lambda + 256 * distortion;
550 static int TrellisQuantizeBlock(const VP8Encoder* const enc,
551 int16_t in[16], int16_t out[16],
552 int ctx0, int coeff_type,
553 const VP8Matrix* const mtx,
554 int lambda) {
555 const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
556 const CostArray* const costs = enc->proba_.level_cost_[coeff_type];
557 const int first = (coeff_type == 0) ? 1 : 0;
558 Node nodes[16][NUM_NODES];
559 ScoreState score_states[2][NUM_NODES];
560 ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
561 ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
562 int best_path[3] = {-1, -1, -1}; // store best-last/best-level/best-previous
563 score_t best_score;
564 int n, m, p, last;
567 score_t cost;
568 const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
569 const int last_proba = probas[VP8EncBands[first]][ctx0][0];
571 // compute the position of the last interesting coefficient
572 last = first - 1;
573 for (n = 15; n >= first; --n) {
574 const int j = kZigzag[n];
575 const int err = in[j] * in[j];
576 if (err > thresh) {
577 last = n;
578 break;
581 // we don't need to go inspect up to n = 16 coeffs. We can just go up
582 // to last + 1 (inclusive) without losing much.
583 if (last < 15) ++last;
585 // compute 'skip' score. This is the max score one can do.
586 cost = VP8BitCost(0, last_proba);
587 best_score = RDScoreTrellis(lambda, cost, 0);
589 // initialize source node.
590 for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
591 const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
592 ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
593 ss_cur[m].costs = costs[VP8EncBands[first]][ctx0];
597 // traverse trellis.
598 for (n = first; n <= last; ++n) {
599 const int j = kZigzag[n];
600 const uint32_t Q = mtx->q_[j];
601 const uint32_t iQ = mtx->iq_[j];
602 const uint32_t B = BIAS(0x00); // neutral bias
603 // note: it's important to take sign of the _original_ coeff,
604 // so we don't have to consider level < 0 afterward.
605 const int sign = (in[j] < 0);
606 const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
607 int level0 = QUANTDIV(coeff0, iQ, B);
608 if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
610 { // Swap current and previous score states
611 ScoreState* const tmp = ss_cur;
612 ss_cur = ss_prev;
613 ss_prev = tmp;
616 // test all alternate level values around level0.
617 for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
618 Node* const cur = &NODE(n, m);
619 int level = level0 + m;
620 const int ctx = (level > 2) ? 2 : level;
621 const int band = VP8EncBands[n + 1];
622 score_t base_score, last_pos_score;
623 score_t best_cur_score = MAX_COST;
624 int best_prev = 0; // default, in case
626 ss_cur[m].score = MAX_COST;
627 ss_cur[m].costs = costs[band][ctx];
628 if (level > MAX_LEVEL || level < 0) { // node is dead?
629 continue;
632 // Compute extra rate cost if last coeff's position is < 15
634 const score_t last_pos_cost =
635 (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
636 last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
640 // Compute delta_error = how much coding this level will
641 // subtract to max_error as distortion.
642 // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
643 const int new_error = coeff0 - level * Q;
644 const int delta_error =
645 kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
646 base_score = RDScoreTrellis(lambda, 0, delta_error);
649 // Inspect all possible non-dead predecessors. Retain only the best one.
650 for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
651 // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
652 // eliminated since their score can't be better than the current best.
653 const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
654 // Examine node assuming it's a non-terminal one.
655 const score_t score =
656 base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
657 if (score < best_cur_score) {
658 best_cur_score = score;
659 best_prev = p;
662 // Store best finding in current node.
663 cur->sign = sign;
664 cur->level = level;
665 cur->prev = best_prev;
666 ss_cur[m].score = best_cur_score;
668 // Now, record best terminal node (and thus best entry in the graph).
669 if (level != 0) {
670 const score_t score = best_cur_score + last_pos_score;
671 if (score < best_score) {
672 best_score = score;
673 best_path[0] = n; // best eob position
674 best_path[1] = m; // best node index
675 best_path[2] = best_prev; // best predecessor
681 // Fresh start
682 memset(in + first, 0, (16 - first) * sizeof(*in));
683 memset(out + first, 0, (16 - first) * sizeof(*out));
684 if (best_path[0] == -1) {
685 return 0; // skip!
689 // Unwind the best path.
690 // Note: best-prev on terminal node is not necessarily equal to the
691 // best_prev for non-terminal. So we patch best_path[2] in.
692 int nz = 0;
693 int best_node = best_path[1];
694 n = best_path[0];
695 NODE(n, best_node).prev = best_path[2]; // force best-prev for terminal
697 for (; n >= first; --n) {
698 const Node* const node = &NODE(n, best_node);
699 const int j = kZigzag[n];
700 out[n] = node->sign ? -node->level : node->level;
701 nz |= node->level;
702 in[j] = out[n] * mtx->q_[j];
703 best_node = node->prev;
705 return (nz != 0);
709 #undef NODE
711 //------------------------------------------------------------------------------
712 // Performs: difference, transform, quantize, back-transform, add
713 // all at once. Output is the reconstructed block in *yuv_out, and the
714 // quantized levels in *levels.
716 static int ReconstructIntra16(VP8EncIterator* const it,
717 VP8ModeScore* const rd,
718 uint8_t* const yuv_out,
719 int mode) {
720 const VP8Encoder* const enc = it->enc_;
721 const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
722 const uint8_t* const src = it->yuv_in_ + Y_OFF;
723 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
724 int nz = 0;
725 int n;
726 int16_t tmp[16][16], dc_tmp[16];
728 for (n = 0; n < 16; ++n) {
729 VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
731 VP8FTransformWHT(tmp[0], dc_tmp);
732 nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
734 if (DO_TRELLIS_I16 && it->do_trellis_) {
735 int x, y;
736 VP8IteratorNzToBytes(it);
737 for (y = 0, n = 0; y < 4; ++y) {
738 for (x = 0; x < 4; ++x, ++n) {
739 const int ctx = it->top_nz_[x] + it->left_nz_[y];
740 const int non_zero =
741 TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
742 &dqm->y1_, dqm->lambda_trellis_i16_);
743 it->top_nz_[x] = it->left_nz_[y] = non_zero;
744 rd->y_ac_levels[n][0] = 0;
745 nz |= non_zero << n;
748 } else {
749 for (n = 0; n < 16; ++n) {
750 // Zero-out the first coeff, so that: a) nz is correct below, and
751 // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
752 tmp[n][0] = 0;
753 nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
754 assert(rd->y_ac_levels[n][0] == 0);
758 // Transform back
759 VP8TransformWHT(dc_tmp, tmp[0]);
760 for (n = 0; n < 16; n += 2) {
761 VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
764 return nz;
767 static int ReconstructIntra4(VP8EncIterator* const it,
768 int16_t levels[16],
769 const uint8_t* const src,
770 uint8_t* const yuv_out,
771 int mode) {
772 const VP8Encoder* const enc = it->enc_;
773 const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
774 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
775 int nz = 0;
776 int16_t tmp[16];
778 VP8FTransform(src, ref, tmp);
779 if (DO_TRELLIS_I4 && it->do_trellis_) {
780 const int x = it->i4_ & 3, y = it->i4_ >> 2;
781 const int ctx = it->top_nz_[x] + it->left_nz_[y];
782 nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
783 dqm->lambda_trellis_i4_);
784 } else {
785 nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
787 VP8ITransform(ref, tmp, yuv_out, 0);
788 return nz;
791 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
792 uint8_t* const yuv_out, int mode) {
793 const VP8Encoder* const enc = it->enc_;
794 const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
795 const uint8_t* const src = it->yuv_in_ + U_OFF;
796 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
797 int nz = 0;
798 int n;
799 int16_t tmp[8][16];
801 for (n = 0; n < 8; ++n) {
802 VP8FTransform(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
804 if (DO_TRELLIS_UV && it->do_trellis_) {
805 int ch, x, y;
806 for (ch = 0, n = 0; ch <= 2; ch += 2) {
807 for (y = 0; y < 2; ++y) {
808 for (x = 0; x < 2; ++x, ++n) {
809 const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
810 const int non_zero =
811 TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
812 &dqm->uv_, dqm->lambda_trellis_uv_);
813 it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
814 nz |= non_zero << n;
818 } else {
819 for (n = 0; n < 8; ++n) {
820 nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
824 for (n = 0; n < 8; n += 2) {
825 VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
827 return (nz << 16);
830 //------------------------------------------------------------------------------
831 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
832 // Pick the mode is lower RD-cost = Rate + lambda * Distortion.
834 static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
835 // We look at the first three AC coefficients to determine what is the average
836 // delta between each sub-4x4 block.
837 const int v0 = abs(DCs[1]);
838 const int v1 = abs(DCs[4]);
839 const int v2 = abs(DCs[5]);
840 int max_v = (v0 > v1) ? v1 : v0;
841 max_v = (v2 > max_v) ? v2 : max_v;
842 if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
// Exchanges the two pointers (*a <-> *b).
static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}
851 static void SwapOut(VP8EncIterator* const it) {
852 SwapPtr(&it->yuv_out_, &it->yuv_out2_);
855 static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
856 score_t score = 0;
857 while (num_blocks-- > 0) { // TODO(skal): refine positional scoring?
858 int i;
859 for (i = 1; i < 16; ++i) { // omit DC, we're only interested in AC
860 score += (levels[i] != 0);
861 if (score > thresh) return 0;
863 levels += 16;
865 return 1;
868 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
869 const int kNumBlocks = 16;
870 VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
871 const int lambda = dqm->lambda_i16_;
872 const int tlambda = dqm->tlambda_;
873 const uint8_t* const src = it->yuv_in_ + Y_OFF;
874 VP8ModeScore rd16;
875 int mode;
877 rd->mode_i16 = -1;
878 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
879 uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF; // scratch buffer
880 int nz;
882 // Reconstruct
883 nz = ReconstructIntra16(it, &rd16, tmp_dst, mode);
885 // Measure RD-score
886 rd16.D = VP8SSE16x16(src, tmp_dst);
887 rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
888 : 0;
889 rd16.H = VP8FixedCostsI16[mode];
890 rd16.R = VP8GetCostLuma16(it, &rd16);
891 if (mode > 0 &&
892 IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
893 // penalty to avoid flat area to be mispredicted by complex mode
894 rd16.R += FLATNESS_PENALTY * kNumBlocks;
897 // Since we always examine Intra16 first, we can overwrite *rd directly.
898 SetRDScore(lambda, &rd16);
899 if (mode == 0 || rd16.score < rd->score) {
900 CopyScore(rd, &rd16);
901 rd->mode_i16 = mode;
902 rd->nz = nz;
903 memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels));
904 memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels));
905 SwapOut(it);
908 SetRDScore(dqm->lambda_mode_, rd); // finalize score for mode decision.
909 VP8SetIntra16Mode(it, rd->mode_i16);
911 // we have a blocky macroblock (only DCs are non-zero) with fairly high
912 // distortion, record max delta so we can later adjust the minimal filtering
913 // strength needed to smooth these blocks out.
914 if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
915 StoreMaxDelta(dqm, rd->y_dc_levels);
919 //------------------------------------------------------------------------------
921 // return the cost array corresponding to the surrounding prediction modes.
922 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
923 const uint8_t modes[16]) {
924 const int preds_w = it->enc_->preds_w_;
925 const int x = (it->i4_ & 3), y = it->i4_ >> 2;
926 const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
927 const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
928 return VP8FixedCostsI4[top][left];
// Exhaustive RD search over the 16 luma 4x4 sub-blocks. For each sub-block,
// tries every i4 prediction mode, keeps the best-scoring reconstruction in
// 'best_blocks' (it->yuv_out2_), and accumulates the total score in rd_best.
// Returns 1 if intra4x4 beats the current intra16x16 score stored in *rd
// (in which case *rd is overwritten with the i4 result), 0 otherwise.
static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;       // RD lambda for i4 mode scoring
  const int tlambda = dqm->tlambda_;        // lambda for the TDisto term
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
  int total_header_bits = 0;                // running i4 mode-signaling cost
  VP8ModeScore rd_best;

  // max_i4_header_bits_ == 0 disables the i4 search altogether.
  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;                     // best score for this sub-block
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;  // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];
      rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        // penalty to avoid a flat area being mispredicted by a complex mode
        rd_tmp.R += FLATNESS_PENALTY * kNumBlocks;
      }

      SetRDScore(lambda, &rd_tmp);
      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        // Swap scratch and best pointers so the winning reconstruction is
        // kept and the loser becomes the next scratch destination.
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    // Early-out: i4 already worse than the i16 score in *rd.
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    // Early-out: the cumulative mode-signaling cost exceeds the budget.
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    // Update non-zero context for the next sub-block's cost evaluation.
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
1013 //------------------------------------------------------------------------------
1015 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
1016 const int kNumBlocks = 8;
1017 const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1018 const int lambda = dqm->lambda_uv_;
1019 const uint8_t* const src = it->yuv_in_ + U_OFF;
1020 uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF; // scratch buffer
1021 uint8_t* const dst0 = it->yuv_out_ + U_OFF;
1022 VP8ModeScore rd_best;
1023 int mode;
1025 rd->mode_uv = -1;
1026 InitScore(&rd_best);
1027 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1028 VP8ModeScore rd_uv;
1030 // Reconstruct
1031 rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
1033 // Compute RD-score
1034 rd_uv.D = VP8SSE16x8(src, tmp_dst);
1035 rd_uv.SD = 0; // TODO: should we call TDisto? it tends to flatten areas.
1036 rd_uv.H = VP8FixedCostsUV[mode];
1037 rd_uv.R = VP8GetCostUV(it, &rd_uv);
1038 if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
1039 rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
1042 SetRDScore(lambda, &rd_uv);
1043 if (mode == 0 || rd_uv.score < rd_best.score) {
1044 CopyScore(&rd_best, &rd_uv);
1045 rd->mode_uv = mode;
1046 memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
1047 memcpy(dst0, tmp_dst, UV_SIZE); // TODO: SwapUVOut() ?
1050 VP8SetIntraUVMode(it, rd->mode_uv);
1051 AddScore(rd, &rd_best);
1054 //------------------------------------------------------------------------------
1055 // Final reconstruction and quantization.
1057 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
1058 const VP8Encoder* const enc = it->enc_;
1059 const int is_i16 = (it->mb_->type_ == 1);
1060 int nz = 0;
1062 if (is_i16) {
1063 nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
1064 } else {
1065 VP8IteratorStartI4(it);
1066 do {
1067 const int mode =
1068 it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
1069 const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
1070 uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_];
1071 VP8MakeIntra4Preds(it);
1072 nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1073 src, dst, mode) << it->i4_;
1074 } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF));
1077 nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_);
1078 rd->nz = nz;
1081 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
1082 static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
1083 const int is_i16 = (it->mb_->type_ == 1);
1084 score_t best_score = MAX_COST;
1086 if (try_both_i4_i16 || is_i16) {
1087 int mode;
1088 int best_mode = -1;
1089 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1090 const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
1091 const uint8_t* const src = it->yuv_in_ + Y_OFF;
1092 const score_t score = VP8SSE16x16(src, ref);
1093 if (score < best_score) {
1094 best_mode = mode;
1095 best_score = score;
1098 VP8SetIntra16Mode(it, best_mode);
1100 if (try_both_i4_i16 || !is_i16) {
1101 uint8_t modes_i4[16];
1102 // We don't evaluate the rate here, but just account for it through a
1103 // constant penalty (i4 mode usually needs more bits compared to i16).
1104 score_t score_i4 = (score_t)I4_PENALTY;
1106 VP8IteratorStartI4(it);
1107 do {
1108 int mode;
1109 int best_sub_mode = -1;
1110 score_t best_sub_score = MAX_COST;
1111 const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
1113 // TODO(skal): we don't really need the prediction pixels here,
1114 // but just the distortion against 'src'.
1115 VP8MakeIntra4Preds(it);
1116 for (mode = 0; mode < NUM_BMODES; ++mode) {
1117 const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
1118 const score_t score = VP8SSE4x4(src, ref);
1119 if (score < best_sub_score) {
1120 best_sub_mode = mode;
1121 best_sub_score = score;
1124 modes_i4[it->i4_] = best_sub_mode;
1125 score_i4 += best_sub_score;
1126 if (score_i4 >= best_score) break;
1127 } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
1128 if (score_i4 < best_score) {
1129 VP8SetIntra4Mode(it, modes_i4);
1134 //------------------------------------------------------------------------------
1135 // Entry point
1137 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
1138 VP8RDLevel rd_opt) {
1139 int is_skipped;
1140 const int method = it->enc_->method_;
1142 InitScore(rd);
1144 // We can perform predictions for Luma16x16 and Chroma8x8 already.
1145 // Luma4x4 predictions needs to be done as-we-go.
1146 VP8MakeLuma16Preds(it);
1147 VP8MakeChroma8Preds(it);
1149 if (rd_opt > RD_OPT_NONE) {
1150 it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1151 PickBestIntra16(it, rd);
1152 if (method >= 2) {
1153 PickBestIntra4(it, rd);
1155 PickBestUV(it, rd);
1156 if (rd_opt == RD_OPT_TRELLIS) { // finish off with trellis-optim now
1157 it->do_trellis_ = 1;
1158 SimpleQuantize(it, rd);
1160 } else {
1161 // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
1162 // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
1163 DistoRefine(it, (method >= 2));
1164 SimpleQuantize(it, rd);
1166 is_skipped = (rd->nz == 0);
1167 VP8SetSkip(it, is_skipped);
1168 return is_skipped;