Define RDCOST only once
[libvpx.git] / vp8 / encoder / encodemb.c
blob bb11ab072de73d3b3ca705e8a25278189bb80505
1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
12 #include "vpx_ports/config.h"
13 #include "encodemb.h"
14 #include "vp8/common/reconinter.h"
15 #include "quantize.h"
16 #include "tokenize.h"
17 #include "vp8/common/invtrans.h"
18 #include "vp8/common/recon.h"
19 #include "vp8/common/reconintra.h"
20 #include "dct.h"
21 #include "vpx_mem/vpx_mem.h"
22 #include "rdopt.h"
/* IF_RTCD(x) expands to its argument when CONFIG_RUNTIME_CPU_DETECT is
 * non-zero and to NULL otherwise.
 */
#if CONFIG_RUNTIME_CPU_DETECT
#define IF_RTCD(x) (x)
#else
#define IF_RTCD(x) NULL
#endif
29 void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
31 unsigned char *src_ptr = (*(be->base_src) + be->src);
32 short *diff_ptr = be->src_diff;
33 unsigned char *pred_ptr = bd->predictor;
34 int src_stride = be->src_stride;
36 int r, c;
38 for (r = 0; r < 4; r++)
40 for (c = 0; c < 4; c++)
42 diff_ptr[c] = src_ptr[c] - pred_ptr[c];
45 diff_ptr += pitch;
46 pred_ptr += pitch;
47 src_ptr += src_stride;
/* Computes the residual (source minus predictor) for the two 8x8 chroma
 * blocks of a macroblock.
 *
 * diff   - residual buffer; U results are written at elements 256..319 and
 *          V results at elements 320..383, each as 8 rows of 8.
 * usrc   - U source pixels, rows `stride` apart.
 * vsrc   - V source pixels, rows `stride` apart.
 * pred   - predictor buffer laid out like `diff` (U at 256, V at 320,
 *          8 bytes per row).
 * stride - row stride of both source planes.
 */
void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
{
    short *udiff = diff + 256;
    short *vdiff = diff + 320;
    unsigned char *upred = pred + 256;
    unsigned char *vpred = pred + 320;

    int r, c;

    /* U plane */
    for (r = 0; r < 8; r++)
    {
        for (c = 0; c < 8; c++)
        {
            udiff[c] = usrc[c] - upred[c];
        }

        udiff += 8;
        upred += 8;
        usrc  += stride;
    }

    /* V plane */
    for (r = 0; r < 8; r++)
    {
        for (c = 0; c < 8; c++)
        {
            vdiff[c] = vsrc[c] - vpred[c];
        }

        vdiff += 8;
        vpred += 8;
        vsrc  += stride;
    }
}
/* Computes the 16x16 residual (source minus predictor).
 *
 * diff   - receives 256 residual values, 16 per row.
 * src    - source pixels, rows `stride` apart.
 * pred   - predictor pixels, 16 per row.
 * stride - row stride of the source.
 */
void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride)
{
    int r, c;

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
        {
            diff[c] = src[c] - pred[c];
        }

        diff += 16;
        pred += 16;
        src  += stride;
    }
}
102 static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
104 ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
105 ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
108 static void build_dcblock(MACROBLOCK *x)
110 short *src_diff_ptr = &x->src_diff[384];
111 int i;
113 for (i = 0; i < 16; i++)
115 src_diff_ptr[i] = x->coeff[i * 16];
119 void vp8_transform_mbuv(MACROBLOCK *x)
121 int i;
123 for (i = 16; i < 24; i += 2)
125 x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
126 &x->block[i].coeff[0], 16);
131 void vp8_transform_intra_mby(MACROBLOCK *x)
133 int i;
135 for (i = 0; i < 16; i += 2)
137 x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
138 &x->block[i].coeff[0], 32);
141 // build dc block from 16 y dc values
142 build_dcblock(x);
144 // do 2nd order transform on the dc block
145 x->short_walsh4x4(&x->block[24].src_diff[0],
146 &x->block[24].coeff[0], 8);
151 static void transform_mb(MACROBLOCK *x)
153 int i;
155 for (i = 0; i < 16; i += 2)
157 x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
158 &x->block[i].coeff[0], 32);
161 // build dc block from 16 y dc values
162 if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
163 build_dcblock(x);
165 for (i = 16; i < 24; i += 2)
167 x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
168 &x->block[i].coeff[0], 16);
171 // do 2nd order transform on the dc block
172 if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
173 x->short_walsh4x4(&x->block[24].src_diff[0],
174 &x->block[24].coeff[0], 8);
179 static void transform_mby(MACROBLOCK *x)
181 int i;
183 for (i = 0; i < 16; i += 2)
185 x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
186 &x->block[i].coeff[0], 32);
189 // build dc block from 16 y dc values
190 if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
192 build_dcblock(x);
193 x->short_walsh4x4(&x->block[24].src_diff[0],
194 &x->block[24].coeff[0], 8);
/* Calls vp8_build_inter_predictors_mb_s() on the macroblock's x->e_mbd.
 *
 * NOTE(review): this listing has lost every line that held only punctuation
 * (embedded line numbers 202, 204 and 229-231 are absent here). From this
 * view it cannot be determined whether the three copy loops below are live
 * code or were wrapped in a comment in the real file -- confirm against the
 * upstream source before relying on or restoring them.
 *
 * As written, the loops copy 16 rows of 16 bytes from block[0].predictor,
 * then 8 rows of 8 bytes from block[16].predictor and from
 * block[20].predictor, into each block's destination (*(base_dst) + dst,
 * row stride dst_stride).
 */
199 void vp8_stuff_inter16x16(MACROBLOCK *x)
201 vp8_build_inter_predictors_mb_s(&x->e_mbd);
203 // recon = copy from predictors to destination
205 BLOCKD *b = &x->e_mbd.block[0];
206 unsigned char *pred_ptr = b->predictor;
207 unsigned char *dst_ptr = *(b->base_dst) + b->dst;
208 int stride = b->dst_stride;
210 int i;
211 for(i=0;i<16;i++)
212 vpx_memcpy(dst_ptr+i*stride,pred_ptr+16*i,16);
214 b = &x->e_mbd.block[16];
215 pred_ptr = b->predictor;
216 dst_ptr = *(b->base_dst) + b->dst;
217 stride = b->dst_stride;
219 for(i=0;i<8;i++)
220 vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8);
222 b = &x->e_mbd.block[20];
223 pred_ptr = b->predictor;
224 dst_ptr = *(b->base_dst) + b->dst;
225 stride = b->dst_stride;
227 for(i=0;i<8;i++)
228 vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8);
233 #if !(CONFIG_REALTIME_ONLY)
/* Low 8 bits of (128 + R * RM). DM and D are accepted so the macro can be
 * called with the same argument list as RDCOST, but they are not used.
 * optimize_b() uses this value to break ties between equal RDCOST results.
 */
#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
/* One node of the trellis built by optimize_b(). */
typedef struct vp8_token_state vp8_token_state;

struct vp8_token_state{
  int         rate;   /* accumulated rate from this node to the end of the block */
  int         error;  /* accumulated squared error from this node to the end */
  signed char next;   /* index of the next trellis position on the path */
  signed char token;  /* token chosen at this node */
  short       qc;     /* quantized coefficient value at this node */
};
// TODO: experiments to find optimal multiple numbers
#define Y1_RD_MULT 4
#define UV_RD_MULT 2
#define Y2_RD_MULT 16

/* Per-plane multiplier, indexed by the `type` argument of optimize_b(),
 * which scales mb->rdmult with it. Entry order is Y1, Y2, UV, Y1 --
 * presumably matching the plane-type enumeration; confirm against its
 * definition.
 */
static const int plane_rd_mult[4]=
{
    Y1_RD_MULT,
    Y2_RD_MULT,
    UV_RD_MULT,
    Y1_RD_MULT
};
259 static void optimize_b(MACROBLOCK *mb, int ib, int type,
260 ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
261 const VP8_ENCODER_RTCD *rtcd)
263 BLOCK *b;
264 BLOCKD *d;
265 vp8_token_state tokens[17][2];
266 unsigned best_mask[2];
267 const short *dequant_ptr;
268 const short *coeff_ptr;
269 short *qcoeff_ptr;
270 short *dqcoeff_ptr;
271 int eob;
272 int i0;
273 int rc;
274 int x;
275 int sz;
276 int next;
277 int rdmult;
278 int rddiv;
279 int final_eob;
280 int rd_cost0;
281 int rd_cost1;
282 int rate0;
283 int rate1;
284 int error0;
285 int error1;
286 int t0;
287 int t1;
288 int best;
289 int band;
290 int pt;
291 int i;
292 int err_mult = plane_rd_mult[type];
294 b = &mb->block[ib];
295 d = &mb->e_mbd.block[ib];
297 /* Enable this to test the effect of RDO as a replacement for the dynamic
298 * zero bin instead of an augmentation of it.
300 #if 0
301 vp8_strict_quantize_b(b, d);
302 #endif
304 dequant_ptr = d->dequant;
305 coeff_ptr = b->coeff;
306 qcoeff_ptr = d->qcoeff;
307 dqcoeff_ptr = d->dqcoeff;
308 i0 = !type;
309 eob = d->eob;
311 /* Now set up a Viterbi trellis to evaluate alternative roundings. */
312 rdmult = mb->rdmult * err_mult;
313 if(mb->e_mbd.mode_info_context->mbmi.ref_frame==INTRA_FRAME)
314 rdmult = (rdmult * 9)>>4;
316 rddiv = mb->rddiv;
317 best_mask[0] = best_mask[1] = 0;
318 /* Initialize the sentinel node of the trellis. */
319 tokens[eob][0].rate = 0;
320 tokens[eob][0].error = 0;
321 tokens[eob][0].next = 16;
322 tokens[eob][0].token = DCT_EOB_TOKEN;
323 tokens[eob][0].qc = 0;
324 *(tokens[eob] + 1) = *(tokens[eob] + 0);
325 next = eob;
326 for (i = eob; i-- > i0;)
328 int base_bits;
329 int d2;
330 int dx;
332 rc = vp8_default_zig_zag1d[i];
333 x = qcoeff_ptr[rc];
334 /* Only add a trellis state for non-zero coefficients. */
335 if (x)
337 int shortcut=0;
338 error0 = tokens[next][0].error;
339 error1 = tokens[next][1].error;
340 /* Evaluate the first possibility for this state. */
341 rate0 = tokens[next][0].rate;
342 rate1 = tokens[next][1].rate;
343 t0 = (vp8_dct_value_tokens_ptr + x)->Token;
344 /* Consider both possible successor states. */
345 if (next < 16)
347 band = vp8_coef_bands[i + 1];
348 pt = vp8_prev_token_class[t0];
349 rate0 +=
350 mb->token_costs[type][band][pt][tokens[next][0].token];
351 rate1 +=
352 mb->token_costs[type][band][pt][tokens[next][1].token];
354 rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
355 rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
356 if (rd_cost0 == rd_cost1)
358 rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
359 rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
361 /* And pick the best. */
362 best = rd_cost1 < rd_cost0;
363 base_bits = *(vp8_dct_value_cost_ptr + x);
364 dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
365 d2 = dx*dx;
366 tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
367 tokens[i][0].error = d2 + (best ? error1 : error0);
368 tokens[i][0].next = next;
369 tokens[i][0].token = t0;
370 tokens[i][0].qc = x;
371 best_mask[0] |= best << i;
372 /* Evaluate the second possibility for this state. */
373 rate0 = tokens[next][0].rate;
374 rate1 = tokens[next][1].rate;
376 if((abs(x)*dequant_ptr[rc]>abs(coeff_ptr[rc])) &&
377 (abs(x)*dequant_ptr[rc]<abs(coeff_ptr[rc])+dequant_ptr[rc]))
378 shortcut = 1;
379 else
380 shortcut = 0;
382 if(shortcut)
384 sz = -(x < 0);
385 x -= 2*sz + 1;
388 /* Consider both possible successor states. */
389 if (!x)
391 /* If we reduced this coefficient to zero, check to see if
392 * we need to move the EOB back here.
394 t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
395 DCT_EOB_TOKEN : ZERO_TOKEN;
396 t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
397 DCT_EOB_TOKEN : ZERO_TOKEN;
399 else
401 t0=t1 = (vp8_dct_value_tokens_ptr + x)->Token;
403 if (next < 16)
405 band = vp8_coef_bands[i + 1];
406 if(t0!=DCT_EOB_TOKEN)
408 pt = vp8_prev_token_class[t0];
409 rate0 += mb->token_costs[type][band][pt][
410 tokens[next][0].token];
412 if(t1!=DCT_EOB_TOKEN)
414 pt = vp8_prev_token_class[t1];
415 rate1 += mb->token_costs[type][band][pt][
416 tokens[next][1].token];
420 rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
421 rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
422 if (rd_cost0 == rd_cost1)
424 rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
425 rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
427 /* And pick the best. */
428 best = rd_cost1 < rd_cost0;
429 base_bits = *(vp8_dct_value_cost_ptr + x);
431 if(shortcut)
433 dx -= (dequant_ptr[rc] + sz) ^ sz;
434 d2 = dx*dx;
436 tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
437 tokens[i][1].error = d2 + (best ? error1 : error0);
438 tokens[i][1].next = next;
439 tokens[i][1].token =best?t1:t0;
440 tokens[i][1].qc = x;
441 best_mask[1] |= best << i;
442 /* Finally, make this the new head of the trellis. */
443 next = i;
445 /* There's no choice to make for a zero coefficient, so we don't
446 * add a new trellis node, but we do need to update the costs.
448 else
450 band = vp8_coef_bands[i + 1];
451 t0 = tokens[next][0].token;
452 t1 = tokens[next][1].token;
453 /* Update the cost of each path if we're past the EOB token. */
454 if (t0 != DCT_EOB_TOKEN)
456 tokens[next][0].rate += mb->token_costs[type][band][0][t0];
457 tokens[next][0].token = ZERO_TOKEN;
459 if (t1 != DCT_EOB_TOKEN)
461 tokens[next][1].rate += mb->token_costs[type][band][0][t1];
462 tokens[next][1].token = ZERO_TOKEN;
464 /* Don't update next, because we didn't add a new node. */
468 /* Now pick the best path through the whole trellis. */
469 band = vp8_coef_bands[i + 1];
470 VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
471 rate0 = tokens[next][0].rate;
472 rate1 = tokens[next][1].rate;
473 error0 = tokens[next][0].error;
474 error1 = tokens[next][1].error;
475 t0 = tokens[next][0].token;
476 t1 = tokens[next][1].token;
477 rate0 += mb->token_costs[type][band][pt][t0];
478 rate1 += mb->token_costs[type][band][pt][t1];
479 rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
480 rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
481 if (rd_cost0 == rd_cost1)
483 rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
484 rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
486 best = rd_cost1 < rd_cost0;
487 final_eob = i0 - 1;
488 for (i = next; i < eob; i = next)
490 x = tokens[i][best].qc;
491 if (x)
492 final_eob = i;
493 rc = vp8_default_zig_zag1d[i];
494 qcoeff_ptr[rc] = x;
495 dqcoeff_ptr[rc] = x * dequant_ptr[rc];
496 next = tokens[i][best].next;
497 best = (best_mask[best] >> i) & 1;
499 final_eob++;
501 d->eob = final_eob;
502 *a = *l = (d->eob != !type);
505 static void optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
507 int b;
508 int type;
509 int has_2nd_order;
510 ENTROPY_CONTEXT_PLANES t_above, t_left;
511 ENTROPY_CONTEXT *ta;
512 ENTROPY_CONTEXT *tl;
514 vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
515 vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
517 ta = (ENTROPY_CONTEXT *)&t_above;
518 tl = (ENTROPY_CONTEXT *)&t_left;
520 has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
521 && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
522 type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
524 for (b = 0; b < 16; b++)
526 optimize_b(x, b, type,
527 ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
530 for (b = 16; b < 24; b++)
532 optimize_b(x, b, PLANE_TYPE_UV,
533 ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
536 if (has_2nd_order)
538 b=24;
539 optimize_b(x, b, PLANE_TYPE_Y2,
540 ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
545 void vp8_optimize_mby(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
547 int b;
548 int type;
549 int has_2nd_order;
551 ENTROPY_CONTEXT_PLANES t_above, t_left;
552 ENTROPY_CONTEXT *ta;
553 ENTROPY_CONTEXT *tl;
555 if (!x->e_mbd.above_context)
556 return;
558 if (!x->e_mbd.left_context)
559 return;
561 vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
562 vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
564 ta = (ENTROPY_CONTEXT *)&t_above;
565 tl = (ENTROPY_CONTEXT *)&t_left;
567 has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
568 && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
569 type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
571 for (b = 0; b < 16; b++)
573 optimize_b(x, b, type,
574 ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
578 if (has_2nd_order)
580 b=24;
581 optimize_b(x, b, PLANE_TYPE_Y2,
582 ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
586 void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
588 int b;
589 ENTROPY_CONTEXT_PLANES t_above, t_left;
590 ENTROPY_CONTEXT *ta;
591 ENTROPY_CONTEXT *tl;
593 if (!x->e_mbd.above_context)
594 return;
596 if (!x->e_mbd.left_context)
597 return;
599 vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
600 vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
602 ta = (ENTROPY_CONTEXT *)&t_above;
603 tl = (ENTROPY_CONTEXT *)&t_left;
605 for (b = 16; b < 24; b++)
607 optimize_b(x, b, PLANE_TYPE_UV,
608 ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
611 #endif
613 void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
615 vp8_build_inter_predictors_mb(&x->e_mbd);
617 vp8_subtract_mb(rtcd, x);
619 transform_mb(x);
621 vp8_quantize_mb(x);
623 #if !(CONFIG_REALTIME_ONLY)
624 if (x->optimize)
625 optimize_mb(x, rtcd);
626 #endif
628 vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
630 RECON_INVOKE(&rtcd->common->recon, recon_mb)
631 (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
635 /* this funciton is used by first pass only */
636 void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
638 vp8_build_inter_predictors_mby(&x->e_mbd);
640 ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
642 transform_mby(x);
644 vp8_quantize_mby(x);
646 vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
648 RECON_INVOKE(&rtcd->common->recon, recon_mby)
649 (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
653 void vp8_encode_inter16x16uvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
655 vp8_build_inter_predictors_mbuv(&x->e_mbd);
656 ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
658 vp8_transform_mbuv(x);
660 vp8_quantize_mbuv(x);