// Copyright 2010 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Frame-reconstruction function. Memory allocation.
//
// Author: Skal (pascal.massimino@gmail.com)

#include <stdlib.h>
#include "./vp8i.h"
#include "../utils/utils.h"

18 #define ALIGN_MASK (32 - 1)
20 static void ReconstructRow(const VP8Decoder
* const dec
,
21 const VP8ThreadContext
* ctx
); // TODO(skal): remove
//------------------------------------------------------------------------------
// Filtering

// kFilterExtraRows[] = How many extra lines are needed on the MB boundary
// for caching, given a filtering level. The table is indexed by the decoder's
// filter type (0 = no filtering, 1 = simple, 2 = complex).
// Simple filter:  up to 2 luma samples are read and 1 is written.
// Complex filter: up to 4 luma samples are read and 3 are written. Same for
//                 U/V, so it's 8 samples total (because of the 2x upsampling).
static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };

33 static void DoFilter(const VP8Decoder
* const dec
, int mb_x
, int mb_y
) {
34 const VP8ThreadContext
* const ctx
= &dec
->thread_ctx_
;
35 const int cache_id
= ctx
->id_
;
36 const int y_bps
= dec
->cache_y_stride_
;
37 const VP8FInfo
* const f_info
= ctx
->f_info_
+ mb_x
;
38 uint8_t* const y_dst
= dec
->cache_y_
+ cache_id
* 16 * y_bps
+ mb_x
* 16;
39 const int ilevel
= f_info
->f_ilevel_
;
40 const int limit
= f_info
->f_limit_
;
45 if (dec
->filter_type_
== 1) { // simple
47 VP8SimpleHFilter16(y_dst
, y_bps
, limit
+ 4);
49 if (f_info
->f_inner_
) {
50 VP8SimpleHFilter16i(y_dst
, y_bps
, limit
);
53 VP8SimpleVFilter16(y_dst
, y_bps
, limit
+ 4);
55 if (f_info
->f_inner_
) {
56 VP8SimpleVFilter16i(y_dst
, y_bps
, limit
);
59 const int uv_bps
= dec
->cache_uv_stride_
;
60 uint8_t* const u_dst
= dec
->cache_u_
+ cache_id
* 8 * uv_bps
+ mb_x
* 8;
61 uint8_t* const v_dst
= dec
->cache_v_
+ cache_id
* 8 * uv_bps
+ mb_x
* 8;
62 const int hev_thresh
= f_info
->hev_thresh_
;
64 VP8HFilter16(y_dst
, y_bps
, limit
+ 4, ilevel
, hev_thresh
);
65 VP8HFilter8(u_dst
, v_dst
, uv_bps
, limit
+ 4, ilevel
, hev_thresh
);
67 if (f_info
->f_inner_
) {
68 VP8HFilter16i(y_dst
, y_bps
, limit
, ilevel
, hev_thresh
);
69 VP8HFilter8i(u_dst
, v_dst
, uv_bps
, limit
, ilevel
, hev_thresh
);
72 VP8VFilter16(y_dst
, y_bps
, limit
+ 4, ilevel
, hev_thresh
);
73 VP8VFilter8(u_dst
, v_dst
, uv_bps
, limit
+ 4, ilevel
, hev_thresh
);
75 if (f_info
->f_inner_
) {
76 VP8VFilter16i(y_dst
, y_bps
, limit
, ilevel
, hev_thresh
);
77 VP8VFilter8i(u_dst
, v_dst
, uv_bps
, limit
, ilevel
, hev_thresh
);
82 // Filter the decoded macroblock row (if needed)
83 static void FilterRow(const VP8Decoder
* const dec
) {
85 const int mb_y
= dec
->thread_ctx_
.mb_y_
;
86 assert(dec
->thread_ctx_
.filter_row_
);
87 for (mb_x
= dec
->tl_mb_x_
; mb_x
< dec
->br_mb_x_
; ++mb_x
) {
88 DoFilter(dec
, mb_x
, mb_y
);
92 //------------------------------------------------------------------------------
93 // Precompute the filtering strength for each segment and each i4x4/i16x16 mode.
95 static void PrecomputeFilterStrengths(VP8Decoder
* const dec
) {
96 if (dec
->filter_type_
> 0) {
98 const VP8FilterHeader
* const hdr
= &dec
->filter_hdr_
;
99 for (s
= 0; s
< NUM_MB_SEGMENTS
; ++s
) {
101 // First, compute the initial level
103 if (dec
->segment_hdr_
.use_segment_
) {
104 base_level
= dec
->segment_hdr_
.filter_strength_
[s
];
105 if (!dec
->segment_hdr_
.absolute_delta_
) {
106 base_level
+= hdr
->level_
;
109 base_level
= hdr
->level_
;
111 for (i4x4
= 0; i4x4
<= 1; ++i4x4
) {
112 VP8FInfo
* const info
= &dec
->fstrengths_
[s
][i4x4
];
113 int level
= base_level
;
114 if (hdr
->use_lf_delta_
) {
115 // TODO(skal): only CURRENT is handled for now.
116 level
+= hdr
->ref_lf_delta_
[0];
118 level
+= hdr
->mode_lf_delta_
[0];
121 level
= (level
< 0) ? 0 : (level
> 63) ? 63 : level
;
124 if (hdr
->sharpness_
> 0) {
125 if (hdr
->sharpness_
> 4) {
130 if (ilevel
> 9 - hdr
->sharpness_
) {
131 ilevel
= 9 - hdr
->sharpness_
;
134 if (ilevel
< 1) ilevel
= 1;
135 info
->f_ilevel_
= ilevel
;
136 info
->f_limit_
= 2 * level
+ ilevel
;
137 info
->hev_thresh_
= (level
>= 40) ? 2 : (level
>= 15) ? 1 : 0;
139 info
->f_limit_
= 0; // no filtering
141 info
->f_inner_
= i4x4
;
//------------------------------------------------------------------------------
// Dithering

// Number of entries in kQuantToDitherAmp[]; quantizer values at or above this
// size have no table entry.
#define DITHER_AMP_TAB_SIZE 12
// Dithering amplitude, indexed by the segment's UV quantizer value.
static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
  // roughly, it's dqm->uv_mat_[1]
  8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
};

156 void VP8InitDithering(const WebPDecoderOptions
* const options
,
157 VP8Decoder
* const dec
) {
159 if (options
!= NULL
) {
160 const int d
= options
->dithering_strength
;
161 const int max_amp
= (1 << VP8_RANDOM_DITHER_FIX
) - 1;
162 const int f
= (d
< 0) ? 0 : (d
> 100) ? max_amp
: (d
* max_amp
/ 100);
166 for (s
= 0; s
< NUM_MB_SEGMENTS
; ++s
) {
167 VP8QuantMatrix
* const dqm
= &dec
->dqm_
[s
];
168 if (dqm
->uv_quant_
< DITHER_AMP_TAB_SIZE
) {
169 // TODO(skal): should we specially dither more for uv_quant_ < 0?
170 const int idx
= (dqm
->uv_quant_
< 0) ? 0 : dqm
->uv_quant_
;
171 dqm
->dither_
= (f
* kQuantToDitherAmp
[idx
]) >> 3;
173 all_amp
|= dqm
->dither_
;
176 VP8InitRandom(&dec
->dithering_rg_
, 1.0f
);
180 #if WEBP_DECODER_ABI_VERSION > 0x0204
181 // potentially allow alpha dithering
182 dec
->alpha_dithering_
= options
->alpha_dithering_strength
;
183 if (dec
->alpha_dithering_
> 100) {
184 dec
->alpha_dithering_
= 100;
185 } else if (dec
->alpha_dithering_
< 0) {
186 dec
->alpha_dithering_
= 0;
192 // minimal amp that will provide a non-zero dithering effect
193 #define MIN_DITHER_AMP 4
194 #define DITHER_DESCALE 4
195 #define DITHER_DESCALE_ROUNDER (1 << (DITHER_DESCALE - 1))
196 #define DITHER_AMP_BITS 8
197 #define DITHER_AMP_CENTER (1 << DITHER_AMP_BITS)
199 static void Dither8x8(VP8Random
* const rg
, uint8_t* dst
, int bps
, int amp
) {
201 for (j
= 0; j
< 8; ++j
) {
202 for (i
= 0; i
< 8; ++i
) {
203 // TODO: could be made faster with SSE2
205 VP8RandomBits2(rg
, DITHER_AMP_BITS
+ 1, amp
) - DITHER_AMP_CENTER
;
206 // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
207 const int delta
= (bits
+ DITHER_DESCALE_ROUNDER
) >> DITHER_DESCALE
;
208 const int v
= (int)dst
[i
] + delta
;
209 dst
[i
] = (v
< 0) ? 0 : (v
> 255) ? 255u : (uint8_t)v
;
215 static void DitherRow(VP8Decoder
* const dec
) {
217 assert(dec
->dither_
);
218 for (mb_x
= dec
->tl_mb_x_
; mb_x
< dec
->br_mb_x_
; ++mb_x
) {
219 const VP8ThreadContext
* const ctx
= &dec
->thread_ctx_
;
220 const VP8MBData
* const data
= ctx
->mb_data_
+ mb_x
;
221 const int cache_id
= ctx
->id_
;
222 const int uv_bps
= dec
->cache_uv_stride_
;
223 if (data
->dither_
>= MIN_DITHER_AMP
) {
224 uint8_t* const u_dst
= dec
->cache_u_
+ cache_id
* 8 * uv_bps
+ mb_x
* 8;
225 uint8_t* const v_dst
= dec
->cache_v_
+ cache_id
* 8 * uv_bps
+ mb_x
* 8;
226 Dither8x8(&dec
->dithering_rg_
, u_dst
, uv_bps
, data
->dither_
);
227 Dither8x8(&dec
->dithering_rg_
, v_dst
, uv_bps
, data
->dither_
);
232 //------------------------------------------------------------------------------
233 // This function is called after a row of macroblocks is finished decoding.
234 // It also takes into account the following restrictions:
235 // * In case of in-loop filtering, we must hold off sending some of the bottom
236 // pixels as they are yet unfiltered. They will be when the next macroblock
237 // row is decoded. Meanwhile, we must preserve them by rotating them in the
238 // cache area. This doesn't hold for the very bottom row of the uncropped
239 // picture of course.
240 // * we must clip the remaining pixels against the cropping area. The VP8Io
241 // struct must have the following fields set correctly before calling put():
243 #define MACROBLOCK_VPOS(mb_y) ((mb_y) * 16) // vertical position of a MB
245 // Finalize and transmit a complete row. Return false in case of user-abort.
246 static int FinishRow(VP8Decoder
* const dec
, VP8Io
* const io
) {
248 const VP8ThreadContext
* const ctx
= &dec
->thread_ctx_
;
249 const int cache_id
= ctx
->id_
;
250 const int extra_y_rows
= kFilterExtraRows
[dec
->filter_type_
];
251 const int ysize
= extra_y_rows
* dec
->cache_y_stride_
;
252 const int uvsize
= (extra_y_rows
/ 2) * dec
->cache_uv_stride_
;
253 const int y_offset
= cache_id
* 16 * dec
->cache_y_stride_
;
254 const int uv_offset
= cache_id
* 8 * dec
->cache_uv_stride_
;
255 uint8_t* const ydst
= dec
->cache_y_
- ysize
+ y_offset
;
256 uint8_t* const udst
= dec
->cache_u_
- uvsize
+ uv_offset
;
257 uint8_t* const vdst
= dec
->cache_v_
- uvsize
+ uv_offset
;
258 const int mb_y
= ctx
->mb_y_
;
259 const int is_first_row
= (mb_y
== 0);
260 const int is_last_row
= (mb_y
>= dec
->br_mb_y_
- 1);
262 if (dec
->mt_method_
== 2) {
263 ReconstructRow(dec
, ctx
);
266 if (ctx
->filter_row_
) {
274 if (io
->put
!= NULL
) {
275 int y_start
= MACROBLOCK_VPOS(mb_y
);
276 int y_end
= MACROBLOCK_VPOS(mb_y
+ 1);
278 y_start
-= extra_y_rows
;
283 io
->y
= dec
->cache_y_
+ y_offset
;
284 io
->u
= dec
->cache_u_
+ uv_offset
;
285 io
->v
= dec
->cache_v_
+ uv_offset
;
289 y_end
-= extra_y_rows
;
291 if (y_end
> io
->crop_bottom
) {
292 y_end
= io
->crop_bottom
; // make sure we don't overflow on last row.
295 if (dec
->alpha_data_
!= NULL
&& y_start
< y_end
) {
296 // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a
298 io
->a
= VP8DecompressAlphaRows(dec
, y_start
, y_end
- y_start
);
300 return VP8SetError(dec
, VP8_STATUS_BITSTREAM_ERROR
,
301 "Could not decode alpha data.");
304 if (y_start
< io
->crop_top
) {
305 const int delta_y
= io
->crop_top
- y_start
;
306 y_start
= io
->crop_top
;
307 assert(!(delta_y
& 1));
308 io
->y
+= dec
->cache_y_stride_
* delta_y
;
309 io
->u
+= dec
->cache_uv_stride_
* (delta_y
>> 1);
310 io
->v
+= dec
->cache_uv_stride_
* (delta_y
>> 1);
312 io
->a
+= io
->width
* delta_y
;
315 if (y_start
< y_end
) {
316 io
->y
+= io
->crop_left
;
317 io
->u
+= io
->crop_left
>> 1;
318 io
->v
+= io
->crop_left
>> 1;
320 io
->a
+= io
->crop_left
;
322 io
->mb_y
= y_start
- io
->crop_top
;
323 io
->mb_w
= io
->crop_right
- io
->crop_left
;
324 io
->mb_h
= y_end
- y_start
;
328 // rotate top samples if needed
329 if (cache_id
+ 1 == dec
->num_caches_
) {
331 memcpy(dec
->cache_y_
- ysize
, ydst
+ 16 * dec
->cache_y_stride_
, ysize
);
332 memcpy(dec
->cache_u_
- uvsize
, udst
+ 8 * dec
->cache_uv_stride_
, uvsize
);
333 memcpy(dec
->cache_v_
- uvsize
, vdst
+ 8 * dec
->cache_uv_stride_
, uvsize
);
340 #undef MACROBLOCK_VPOS
342 //------------------------------------------------------------------------------
344 int VP8ProcessRow(VP8Decoder
* const dec
, VP8Io
* const io
) {
346 VP8ThreadContext
* const ctx
= &dec
->thread_ctx_
;
347 const int filter_row
=
348 (dec
->filter_type_
> 0) &&
349 (dec
->mb_y_
>= dec
->tl_mb_y_
) && (dec
->mb_y_
<= dec
->br_mb_y_
);
350 if (dec
->mt_method_
== 0) {
351 // ctx->id_ and ctx->f_info_ are already set
352 ctx
->mb_y_
= dec
->mb_y_
;
353 ctx
->filter_row_
= filter_row
;
354 ReconstructRow(dec
, ctx
);
355 ok
= FinishRow(dec
, io
);
357 WebPWorker
* const worker
= &dec
->worker_
;
358 // Finish previous job *before* updating context
359 ok
&= WebPGetWorkerInterface()->Sync(worker
);
360 assert(worker
->status_
== OK
);
361 if (ok
) { // spawn a new deblocking/output job
363 ctx
->id_
= dec
->cache_id_
;
364 ctx
->mb_y_
= dec
->mb_y_
;
365 ctx
->filter_row_
= filter_row
;
366 if (dec
->mt_method_
== 2) { // swap macroblock data
367 VP8MBData
* const tmp
= ctx
->mb_data_
;
368 ctx
->mb_data_
= dec
->mb_data_
;
371 // perform reconstruction directly in main thread
372 ReconstructRow(dec
, ctx
);
374 if (filter_row
) { // swap filter info
375 VP8FInfo
* const tmp
= ctx
->f_info_
;
376 ctx
->f_info_
= dec
->f_info_
;
379 // (reconstruct)+filter in parallel
380 WebPGetWorkerInterface()->Launch(worker
);
381 if (++dec
->cache_id_
== dec
->num_caches_
) {
389 //------------------------------------------------------------------------------
390 // Finish setting up the decoding parameter once user's setup() is called.
392 VP8StatusCode
VP8EnterCritical(VP8Decoder
* const dec
, VP8Io
* const io
) {
393 // Call setup() first. This may trigger additional decoding features on 'io'.
394 // Note: Afterward, we must call teardown() no matter what.
395 if (io
->setup
!= NULL
&& !io
->setup(io
)) {
396 VP8SetError(dec
, VP8_STATUS_USER_ABORT
, "Frame setup failed");
400 // Disable filtering per user request
401 if (io
->bypass_filtering
) {
402 dec
->filter_type_
= 0;
404 // TODO(skal): filter type / strength / sharpness forcing
406 // Define the area where we can skip in-loop filtering, in case of cropping.
408 // 'Simple' filter reads two luma samples outside of the macroblock
409 // and filters one. It doesn't filter the chroma samples. Hence, we can
410 // avoid doing the in-loop filtering before crop_top/crop_left position.
411 // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
412 // Means: there's a dependency chain that goes all the way up to the
413 // top-left corner of the picture (MB #0). We must filter all the previous
415 // TODO(skal): add an 'approximate_decoding' option, that won't produce
416 // a 1:1 bit-exactness for complex filtering?
418 const int extra_pixels
= kFilterExtraRows
[dec
->filter_type_
];
419 if (dec
->filter_type_
== 2) {
420 // For complex filter, we need to preserve the dependency chain.
424 // For simple filter, we can filter only the cropped region.
425 // We include 'extra_pixels' on the other side of the boundary, since
426 // vertical or horizontal filtering of the previous macroblock can
427 // modify some abutting pixels.
428 dec
->tl_mb_x_
= (io
->crop_left
- extra_pixels
) >> 4;
429 dec
->tl_mb_y_
= (io
->crop_top
- extra_pixels
) >> 4;
430 if (dec
->tl_mb_x_
< 0) dec
->tl_mb_x_
= 0;
431 if (dec
->tl_mb_y_
< 0) dec
->tl_mb_y_
= 0;
433 // We need some 'extra' pixels on the right/bottom.
434 dec
->br_mb_y_
= (io
->crop_bottom
+ 15 + extra_pixels
) >> 4;
435 dec
->br_mb_x_
= (io
->crop_right
+ 15 + extra_pixels
) >> 4;
436 if (dec
->br_mb_x_
> dec
->mb_w_
) {
437 dec
->br_mb_x_
= dec
->mb_w_
;
439 if (dec
->br_mb_y_
> dec
->mb_h_
) {
440 dec
->br_mb_y_
= dec
->mb_h_
;
443 PrecomputeFilterStrengths(dec
);
444 return VP8_STATUS_OK
;
447 int VP8ExitCritical(VP8Decoder
* const dec
, VP8Io
* const io
) {
449 if (dec
->mt_method_
> 0) {
450 ok
= WebPGetWorkerInterface()->Sync(&dec
->worker_
);
453 if (io
->teardown
!= NULL
) {
459 //------------------------------------------------------------------------------
460 // For multi-threaded decoding we need to use 3 rows of 16 pixels as delay line.
462 // Reason is: the deblocking filter cannot deblock the bottom horizontal edges
463 // immediately, and needs to wait for first few rows of the next macroblock to
464 // be decoded. Hence, deblocking is lagging behind by 4 or 8 pixels (depending
466 // With two threads, the vertical positions of the rows being decoded are:
467 // Decode: [ 0..15][16..31][32..47][48..63][64..79][...
468 // Deblock: [ 0..11][12..27][28..43][44..59][...
469 // If we use two threads and two caches of 16 pixels, the sequence would be:
470 // Decode: [ 0..15][16..31][ 0..15!!][16..31][ 0..15][...
471 // Deblock: [ 0..11][12..27!!][-4..11][12..27][...
472 // The problem occurs during row [12..15!!] that both the decoding and
473 // deblocking threads are writing simultaneously.
474 // With 3 cache lines, one get a safe write pattern:
475 // Decode: [ 0..15][16..31][32..47][ 0..15][16..31][32..47][0..
476 // Deblock: [ 0..11][12..27][28..43][-4..11][12..27][28...
477 // Note that multi-threaded output _without_ deblocking can make use of two
478 // cache lines of 16 pixels only, since there's no lagging behind. The decoding
479 // and output process have non-concurrent writing:
480 // Decode: [ 0..15][16..31][ 0..15][16..31][...
481 // io->put: [ 0..15][16..31][ 0..15][...
483 #define MT_CACHE_LINES 3
484 #define ST_CACHE_LINES 1 // 1 cache row only for single-threaded case
486 // Initialize multi/single-thread worker
487 static int InitThreadContext(VP8Decoder
* const dec
) {
489 if (dec
->mt_method_
> 0) {
490 WebPWorker
* const worker
= &dec
->worker_
;
491 if (!WebPGetWorkerInterface()->Reset(worker
)) {
492 return VP8SetError(dec
, VP8_STATUS_OUT_OF_MEMORY
,
493 "thread initialization failed.");
496 worker
->data2
= (void*)&dec
->thread_ctx_
.io_
;
497 worker
->hook
= (WebPWorkerHook
)FinishRow
;
499 (dec
->filter_type_
> 0) ? MT_CACHE_LINES
: MT_CACHE_LINES
- 1;
501 dec
->num_caches_
= ST_CACHE_LINES
;
506 int VP8GetThreadMethod(const WebPDecoderOptions
* const options
,
507 const WebPHeaderStructure
* const headers
,
508 int width
, int height
) {
509 if (options
== NULL
|| options
->use_threads
== 0) {
515 assert(headers
== NULL
|| !headers
->is_lossless
);
516 #if defined(WEBP_USE_THREAD)
517 if (width
< MIN_WIDTH_FOR_THREADS
) return 0;
518 // TODO(skal): tune the heuristic further
520 if (height
< 2 * width
) return 2;
523 #else // !WEBP_USE_THREAD
528 #undef MT_CACHE_LINES
529 #undef ST_CACHE_LINES
531 //------------------------------------------------------------------------------
534 static int AllocateMemory(VP8Decoder
* const dec
) {
535 const int num_caches
= dec
->num_caches_
;
536 const int mb_w
= dec
->mb_w_
;
537 // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
538 const size_t intra_pred_mode_size
= 4 * mb_w
* sizeof(uint8_t);
539 const size_t top_size
= sizeof(VP8TopSamples
) * mb_w
;
540 const size_t mb_info_size
= (mb_w
+ 1) * sizeof(VP8MB
);
541 const size_t f_info_size
=
542 (dec
->filter_type_
> 0) ?
543 mb_w
* (dec
->mt_method_
> 0 ? 2 : 1) * sizeof(VP8FInfo
)
545 const size_t yuv_size
= YUV_SIZE
* sizeof(*dec
->yuv_b_
);
546 const size_t mb_data_size
=
547 (dec
->mt_method_
== 2 ? 2 : 1) * mb_w
* sizeof(*dec
->mb_data_
);
548 const size_t cache_height
= (16 * num_caches
549 + kFilterExtraRows
[dec
->filter_type_
]) * 3 / 2;
550 const size_t cache_size
= top_size
* cache_height
;
551 // alpha_size is the only one that scales as width x height.
552 const uint64_t alpha_size
= (dec
->alpha_data_
!= NULL
) ?
553 (uint64_t)dec
->pic_hdr_
.width_
* dec
->pic_hdr_
.height_
: 0ULL;
554 const uint64_t needed
= (uint64_t)intra_pred_mode_size
555 + top_size
+ mb_info_size
+ f_info_size
556 + yuv_size
+ mb_data_size
557 + cache_size
+ alpha_size
+ ALIGN_MASK
;
560 if (needed
!= (size_t)needed
) return 0; // check for overflow
561 if (needed
> dec
->mem_size_
) {
562 WebPSafeFree(dec
->mem_
);
564 dec
->mem_
= WebPSafeMalloc(needed
, sizeof(uint8_t));
565 if (dec
->mem_
== NULL
) {
566 return VP8SetError(dec
, VP8_STATUS_OUT_OF_MEMORY
,
567 "no memory during frame initialization.");
569 // down-cast is ok, thanks to WebPSafeAlloc() above.
570 dec
->mem_size_
= (size_t)needed
;
573 mem
= (uint8_t*)dec
->mem_
;
574 dec
->intra_t_
= (uint8_t*)mem
;
575 mem
+= intra_pred_mode_size
;
577 dec
->yuv_t_
= (VP8TopSamples
*)mem
;
580 dec
->mb_info_
= ((VP8MB
*)mem
) + 1;
583 dec
->f_info_
= f_info_size
? (VP8FInfo
*)mem
: NULL
;
585 dec
->thread_ctx_
.id_
= 0;
586 dec
->thread_ctx_
.f_info_
= dec
->f_info_
;
587 if (dec
->mt_method_
> 0) {
588 // secondary cache line. The deblocking process need to make use of the
589 // filtering strength from previous macroblock row, while the new ones
590 // are being decoded in parallel. We'll just swap the pointers.
591 dec
->thread_ctx_
.f_info_
+= mb_w
;
594 mem
= (uint8_t*)((uintptr_t)(mem
+ ALIGN_MASK
) & ~ALIGN_MASK
);
595 assert((yuv_size
& ALIGN_MASK
) == 0);
596 dec
->yuv_b_
= (uint8_t*)mem
;
599 dec
->mb_data_
= (VP8MBData
*)mem
;
600 dec
->thread_ctx_
.mb_data_
= (VP8MBData
*)mem
;
601 if (dec
->mt_method_
== 2) {
602 dec
->thread_ctx_
.mb_data_
+= mb_w
;
606 dec
->cache_y_stride_
= 16 * mb_w
;
607 dec
->cache_uv_stride_
= 8 * mb_w
;
609 const int extra_rows
= kFilterExtraRows
[dec
->filter_type_
];
610 const int extra_y
= extra_rows
* dec
->cache_y_stride_
;
611 const int extra_uv
= (extra_rows
/ 2) * dec
->cache_uv_stride_
;
612 dec
->cache_y_
= ((uint8_t*)mem
) + extra_y
;
613 dec
->cache_u_
= dec
->cache_y_
614 + 16 * num_caches
* dec
->cache_y_stride_
+ extra_uv
;
615 dec
->cache_v_
= dec
->cache_u_
616 + 8 * num_caches
* dec
->cache_uv_stride_
+ extra_uv
;
622 dec
->alpha_plane_
= alpha_size
? (uint8_t*)mem
: NULL
;
624 assert(mem
<= (uint8_t*)dec
->mem_
+ dec
->mem_size_
);
626 // note: left/top-info is initialized once for all.
627 memset(dec
->mb_info_
- 1, 0, mb_info_size
);
628 VP8InitScanline(dec
); // initialize left too.
631 memset(dec
->intra_t_
, B_DC_PRED
, intra_pred_mode_size
);
636 static void InitIo(VP8Decoder
* const dec
, VP8Io
* io
) {
639 io
->y
= dec
->cache_y_
;
640 io
->u
= dec
->cache_u_
;
641 io
->v
= dec
->cache_v_
;
642 io
->y_stride
= dec
->cache_y_stride_
;
643 io
->uv_stride
= dec
->cache_uv_stride_
;
647 int VP8InitFrame(VP8Decoder
* const dec
, VP8Io
* io
) {
648 if (!InitThreadContext(dec
)) return 0; // call first. Sets dec->num_caches_.
649 if (!AllocateMemory(dec
)) return 0;
651 VP8DspInit(); // Init critical function pointers and look-up tables.
655 //------------------------------------------------------------------------------
656 // Main reconstruction function.
658 static const int kScan
[16] = {
659 0 + 0 * BPS
, 4 + 0 * BPS
, 8 + 0 * BPS
, 12 + 0 * BPS
,
660 0 + 4 * BPS
, 4 + 4 * BPS
, 8 + 4 * BPS
, 12 + 4 * BPS
,
661 0 + 8 * BPS
, 4 + 8 * BPS
, 8 + 8 * BPS
, 12 + 8 * BPS
,
662 0 + 12 * BPS
, 4 + 12 * BPS
, 8 + 12 * BPS
, 12 + 12 * BPS
665 static int CheckMode(int mb_x
, int mb_y
, int mode
) {
666 if (mode
== B_DC_PRED
) {
668 return (mb_y
== 0) ? B_DC_PRED_NOTOPLEFT
: B_DC_PRED_NOLEFT
;
670 return (mb_y
== 0) ? B_DC_PRED_NOTOP
: B_DC_PRED
;
// Copies 4 bytes from 'src' to 'dst'. The two areas must not overlap.
// memcpy() is used rather than a 32-bit pointer cast, so the copy does not
// depend on the alignment of either pointer.
static void Copy32b(uint8_t* dst, uint8_t* src) {
  memcpy(dst, src, 4);
}

680 static WEBP_INLINE
void DoTransform(uint32_t bits
, const int16_t* const src
,
681 uint8_t* const dst
) {
682 switch (bits
>> 30) {
684 VP8Transform(src
, dst
, 0);
687 VP8TransformAC3(src
, dst
);
690 VP8TransformDC(src
, dst
);
// Adds the inverse transform of one chroma plane's coefficients to 'dst'.
// Only the low 8 bits of 'bits' are examined; nothing is done when they are
// all zero.
static void DoUVTransform(uint32_t bits, const int16_t* const src,
                          uint8_t* const dst) {
  if (bits & 0xff) {    // any non-zero coeff at all?
    if (bits & 0xaa) {  // any non-zero AC coefficient?
      VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
    } else {
      VP8TransformDCUV(src, dst);
    }
  }
}

708 static void ReconstructRow(const VP8Decoder
* const dec
,
709 const VP8ThreadContext
* ctx
) {
712 const int mb_y
= ctx
->mb_y_
;
713 const int cache_id
= ctx
->id_
;
714 uint8_t* const y_dst
= dec
->yuv_b_
+ Y_OFF
;
715 uint8_t* const u_dst
= dec
->yuv_b_
+ U_OFF
;
716 uint8_t* const v_dst
= dec
->yuv_b_
+ V_OFF
;
717 for (mb_x
= 0; mb_x
< dec
->mb_w_
; ++mb_x
) {
718 const VP8MBData
* const block
= ctx
->mb_data_
+ mb_x
;
720 // Rotate in the left samples from previously decoded block. We move four
721 // pixels at a time for alignment reason, and because of in-loop filter.
723 for (j
= -1; j
< 16; ++j
) {
724 Copy32b(&y_dst
[j
* BPS
- 4], &y_dst
[j
* BPS
+ 12]);
726 for (j
= -1; j
< 8; ++j
) {
727 Copy32b(&u_dst
[j
* BPS
- 4], &u_dst
[j
* BPS
+ 4]);
728 Copy32b(&v_dst
[j
* BPS
- 4], &v_dst
[j
* BPS
+ 4]);
731 for (j
= 0; j
< 16; ++j
) {
732 y_dst
[j
* BPS
- 1] = 129;
734 for (j
= 0; j
< 8; ++j
) {
735 u_dst
[j
* BPS
- 1] = 129;
736 v_dst
[j
* BPS
- 1] = 129;
738 // Init top-left sample on left column too
740 y_dst
[-1 - BPS
] = u_dst
[-1 - BPS
] = v_dst
[-1 - BPS
] = 129;
744 // bring top samples into the cache
745 VP8TopSamples
* const top_yuv
= dec
->yuv_t_
+ mb_x
;
746 const int16_t* const coeffs
= block
->coeffs_
;
747 uint32_t bits
= block
->non_zero_y_
;
751 memcpy(y_dst
- BPS
, top_yuv
[0].y
, 16);
752 memcpy(u_dst
- BPS
, top_yuv
[0].u
, 8);
753 memcpy(v_dst
- BPS
, top_yuv
[0].v
, 8);
754 } else if (mb_x
== 0) {
755 // we only need to do this init once at block (0,0).
756 // Afterward, it remains valid for the whole topmost row.
757 memset(y_dst
- BPS
- 1, 127, 16 + 4 + 1);
758 memset(u_dst
- BPS
- 1, 127, 8 + 1);
759 memset(v_dst
- BPS
- 1, 127, 8 + 1);
762 // predict and add residuals
763 if (block
->is_i4x4_
) { // 4x4
764 uint32_t* const top_right
= (uint32_t*)(y_dst
- BPS
+ 16);
767 if (mb_x
>= dec
->mb_w_
- 1) { // on rightmost border
768 memset(top_right
, top_yuv
[0].y
[15], sizeof(*top_right
));
770 memcpy(top_right
, top_yuv
[1].y
, sizeof(*top_right
));
773 // replicate the top-right pixels below
774 top_right
[BPS
] = top_right
[2 * BPS
] = top_right
[3 * BPS
] = top_right
[0];
776 // predict and add residuals for all 4x4 blocks in turn.
777 for (n
= 0; n
< 16; ++n
, bits
<<= 2) {
778 uint8_t* const dst
= y_dst
+ kScan
[n
];
779 VP8PredLuma4
[block
->imodes_
[n
]](dst
);
780 DoTransform(bits
, coeffs
+ n
* 16, dst
);
783 const int pred_func
= CheckMode(mb_x
, mb_y
,
785 VP8PredLuma16
[pred_func
](y_dst
);
787 for (n
= 0; n
< 16; ++n
, bits
<<= 2) {
788 DoTransform(bits
, coeffs
+ n
* 16, y_dst
+ kScan
[n
]);
794 const uint32_t bits_uv
= block
->non_zero_uv_
;
795 const int pred_func
= CheckMode(mb_x
, mb_y
, block
->uvmode_
);
796 VP8PredChroma8
[pred_func
](u_dst
);
797 VP8PredChroma8
[pred_func
](v_dst
);
798 DoUVTransform(bits_uv
>> 0, coeffs
+ 16 * 16, u_dst
);
799 DoUVTransform(bits_uv
>> 8, coeffs
+ 20 * 16, v_dst
);
802 // stash away top samples for next block
803 if (mb_y
< dec
->mb_h_
- 1) {
804 memcpy(top_yuv
[0].y
, y_dst
+ 15 * BPS
, 16);
805 memcpy(top_yuv
[0].u
, u_dst
+ 7 * BPS
, 8);
806 memcpy(top_yuv
[0].v
, v_dst
+ 7 * BPS
, 8);
809 // Transfer reconstructed samples from yuv_b_ cache to final destination.
811 const int y_offset
= cache_id
* 16 * dec
->cache_y_stride_
;
812 const int uv_offset
= cache_id
* 8 * dec
->cache_uv_stride_
;
813 uint8_t* const y_out
= dec
->cache_y_
+ mb_x
* 16 + y_offset
;
814 uint8_t* const u_out
= dec
->cache_u_
+ mb_x
* 8 + uv_offset
;
815 uint8_t* const v_out
= dec
->cache_v_
+ mb_x
* 8 + uv_offset
;
816 for (j
= 0; j
< 16; ++j
) {
817 memcpy(y_out
+ j
* dec
->cache_y_stride_
, y_dst
+ j
* BPS
, 16);
819 for (j
= 0; j
< 8; ++j
) {
820 memcpy(u_out
+ j
* dec
->cache_uv_stride_
, u_dst
+ j
* BPS
, 8);
821 memcpy(v_out
+ j
* dec
->cache_uv_stride_
, v_dst
+ j
* BPS
, 8);
827 //------------------------------------------------------------------------------