1 // Copyright 2010 Google Inc. All Rights Reserved.
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
10 // Frame-reconstruction function. Memory allocation.
12 // Author: Skal (pascal.massimino@gmail.com)
16 #include "../utils/utils.h"
18 #define ALIGN_MASK (32 - 1)
20 static void ReconstructRow(const VP8Decoder
* const dec
,
21 const VP8ThreadContext
* ctx
); // TODO(skal): remove
23 //------------------------------------------------------------------------------
26 // kFilterExtraRows[] = How many extra lines are needed on the MB boundary
27 // for caching, given a filtering level.
28 // Simple filter: up to 2 luma samples are read and 1 is written.
29 // Complex filter: up to 4 luma samples are read and 3 are written. Same for
30 // U/V, so it's 8 samples total (because of the 2x upsampling).
// Number of extra cached rows per filtering level: none / simple / complex.
static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };
33 static void DoFilter(const VP8Decoder
* const dec
, int mb_x
, int mb_y
) {
34 const VP8ThreadContext
* const ctx
= &dec
->thread_ctx_
;
35 const int cache_id
= ctx
->id_
;
36 const int y_bps
= dec
->cache_y_stride_
;
37 const VP8FInfo
* const f_info
= ctx
->f_info_
+ mb_x
;
38 uint8_t* const y_dst
= dec
->cache_y_
+ cache_id
* 16 * y_bps
+ mb_x
* 16;
39 const int ilevel
= f_info
->f_ilevel_
;
40 const int limit
= f_info
->f_limit_
;
45 if (dec
->filter_type_
== 1) { // simple
47 VP8SimpleHFilter16(y_dst
, y_bps
, limit
+ 4);
49 if (f_info
->f_inner_
) {
50 VP8SimpleHFilter16i(y_dst
, y_bps
, limit
);
53 VP8SimpleVFilter16(y_dst
, y_bps
, limit
+ 4);
55 if (f_info
->f_inner_
) {
56 VP8SimpleVFilter16i(y_dst
, y_bps
, limit
);
59 const int uv_bps
= dec
->cache_uv_stride_
;
60 uint8_t* const u_dst
= dec
->cache_u_
+ cache_id
* 8 * uv_bps
+ mb_x
* 8;
61 uint8_t* const v_dst
= dec
->cache_v_
+ cache_id
* 8 * uv_bps
+ mb_x
* 8;
62 const int hev_thresh
= f_info
->hev_thresh_
;
64 VP8HFilter16(y_dst
, y_bps
, limit
+ 4, ilevel
, hev_thresh
);
65 VP8HFilter8(u_dst
, v_dst
, uv_bps
, limit
+ 4, ilevel
, hev_thresh
);
67 if (f_info
->f_inner_
) {
68 VP8HFilter16i(y_dst
, y_bps
, limit
, ilevel
, hev_thresh
);
69 VP8HFilter8i(u_dst
, v_dst
, uv_bps
, limit
, ilevel
, hev_thresh
);
72 VP8VFilter16(y_dst
, y_bps
, limit
+ 4, ilevel
, hev_thresh
);
73 VP8VFilter8(u_dst
, v_dst
, uv_bps
, limit
+ 4, ilevel
, hev_thresh
);
75 if (f_info
->f_inner_
) {
76 VP8VFilter16i(y_dst
, y_bps
, limit
, ilevel
, hev_thresh
);
77 VP8VFilter8i(u_dst
, v_dst
, uv_bps
, limit
, ilevel
, hev_thresh
);
82 // Filter the decoded macroblock row (if needed)
83 static void FilterRow(const VP8Decoder
* const dec
) {
85 const int mb_y
= dec
->thread_ctx_
.mb_y_
;
86 assert(dec
->thread_ctx_
.filter_row_
);
87 for (mb_x
= dec
->tl_mb_x_
; mb_x
< dec
->br_mb_x_
; ++mb_x
) {
88 DoFilter(dec
, mb_x
, mb_y
);
92 //------------------------------------------------------------------------------
93 // Precompute the filtering strength for each segment and each i4x4/i16x16 mode.
95 static void PrecomputeFilterStrengths(VP8Decoder
* const dec
) {
96 if (dec
->filter_type_
> 0) {
98 const VP8FilterHeader
* const hdr
= &dec
->filter_hdr_
;
99 for (s
= 0; s
< NUM_MB_SEGMENTS
; ++s
) {
101 // First, compute the initial level
103 if (dec
->segment_hdr_
.use_segment_
) {
104 base_level
= dec
->segment_hdr_
.filter_strength_
[s
];
105 if (!dec
->segment_hdr_
.absolute_delta_
) {
106 base_level
+= hdr
->level_
;
109 base_level
= hdr
->level_
;
111 for (i4x4
= 0; i4x4
<= 1; ++i4x4
) {
112 VP8FInfo
* const info
= &dec
->fstrengths_
[s
][i4x4
];
113 int level
= base_level
;
114 if (hdr
->use_lf_delta_
) {
115 // TODO(skal): only CURRENT is handled for now.
116 level
+= hdr
->ref_lf_delta_
[0];
118 level
+= hdr
->mode_lf_delta_
[0];
121 level
= (level
< 0) ? 0 : (level
> 63) ? 63 : level
;
124 if (hdr
->sharpness_
> 0) {
125 if (hdr
->sharpness_
> 4) {
130 if (ilevel
> 9 - hdr
->sharpness_
) {
131 ilevel
= 9 - hdr
->sharpness_
;
134 if (ilevel
< 1) ilevel
= 1;
135 info
->f_ilevel_
= ilevel
;
136 info
->f_limit_
= 2 * level
+ ilevel
;
137 info
->hev_thresh_
= (level
>= 40) ? 2 : (level
>= 15) ? 1 : 0;
139 info
->f_limit_
= 0; // no filtering
141 info
->f_inner_
= i4x4
;
147 //------------------------------------------------------------------------------
#define DITHER_AMP_TAB_SIZE 12
// Dithering amplitude as a function of the (clamped) U/V quantizer index.
static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
  // roughly, it's dqm->uv_mat_[1]
  8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
};
156 void VP8InitDithering(const WebPDecoderOptions
* const options
,
157 VP8Decoder
* const dec
) {
159 if (options
!= NULL
) {
160 const int d
= options
->dithering_strength
;
161 const int max_amp
= (1 << VP8_RANDOM_DITHER_FIX
) - 1;
162 const int f
= (d
< 0) ? 0 : (d
> 100) ? max_amp
: (d
* max_amp
/ 100);
166 for (s
= 0; s
< NUM_MB_SEGMENTS
; ++s
) {
167 VP8QuantMatrix
* const dqm
= &dec
->dqm_
[s
];
168 if (dqm
->uv_quant_
< DITHER_AMP_TAB_SIZE
) {
169 // TODO(skal): should we specially dither more for uv_quant_ < 0?
170 const int idx
= (dqm
->uv_quant_
< 0) ? 0 : dqm
->uv_quant_
;
171 dqm
->dither_
= (f
* kQuantToDitherAmp
[idx
]) >> 3;
173 all_amp
|= dqm
->dither_
;
176 VP8InitRandom(&dec
->dithering_rg_
, 1.0f
);
183 // minimal amp that will provide a non-zero dithering effect
184 #define MIN_DITHER_AMP 4
185 #define DITHER_DESCALE 4
186 #define DITHER_DESCALE_ROUNDER (1 << (DITHER_DESCALE - 1))
187 #define DITHER_AMP_BITS 8
188 #define DITHER_AMP_CENTER (1 << DITHER_AMP_BITS)
190 static void Dither8x8(VP8Random
* const rg
, uint8_t* dst
, int bps
, int amp
) {
192 for (j
= 0; j
< 8; ++j
) {
193 for (i
= 0; i
< 8; ++i
) {
194 // TODO: could be made faster with SSE2
196 VP8RandomBits2(rg
, DITHER_AMP_BITS
+ 1, amp
) - DITHER_AMP_CENTER
;
197 // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
198 const int delta
= (bits
+ DITHER_DESCALE_ROUNDER
) >> DITHER_DESCALE
;
199 const int v
= (int)dst
[i
] + delta
;
200 dst
[i
] = (v
< 0) ? 0 : (v
> 255) ? 255u : (uint8_t)v
;
206 static void DitherRow(VP8Decoder
* const dec
) {
208 assert(dec
->dither_
);
209 for (mb_x
= dec
->tl_mb_x_
; mb_x
< dec
->br_mb_x_
; ++mb_x
) {
210 const VP8ThreadContext
* const ctx
= &dec
->thread_ctx_
;
211 const VP8MBData
* const data
= ctx
->mb_data_
+ mb_x
;
212 const int cache_id
= ctx
->id_
;
213 const int uv_bps
= dec
->cache_uv_stride_
;
214 if (data
->dither_
>= MIN_DITHER_AMP
) {
215 uint8_t* const u_dst
= dec
->cache_u_
+ cache_id
* 8 * uv_bps
+ mb_x
* 8;
216 uint8_t* const v_dst
= dec
->cache_v_
+ cache_id
* 8 * uv_bps
+ mb_x
* 8;
217 Dither8x8(&dec
->dithering_rg_
, u_dst
, uv_bps
, data
->dither_
);
218 Dither8x8(&dec
->dithering_rg_
, v_dst
, uv_bps
, data
->dither_
);
223 //------------------------------------------------------------------------------
224 // This function is called after a row of macroblocks is finished decoding.
225 // It also takes into account the following restrictions:
226 // * In case of in-loop filtering, we must hold off sending some of the bottom
227 // pixels as they are yet unfiltered. They will be when the next macroblock
228 // row is decoded. Meanwhile, we must preserve them by rotating them in the
229 // cache area. This doesn't hold for the very bottom row of the uncropped
230 // picture of course.
231 // * we must clip the remaining pixels against the cropping area. The VP8Io
232 // struct must have the following fields set correctly before calling put():
234 #define MACROBLOCK_VPOS(mb_y) ((mb_y) * 16) // vertical position of a MB
236 // Finalize and transmit a complete row. Return false in case of user-abort.
237 static int FinishRow(VP8Decoder
* const dec
, VP8Io
* const io
) {
239 const VP8ThreadContext
* const ctx
= &dec
->thread_ctx_
;
240 const int cache_id
= ctx
->id_
;
241 const int extra_y_rows
= kFilterExtraRows
[dec
->filter_type_
];
242 const int ysize
= extra_y_rows
* dec
->cache_y_stride_
;
243 const int uvsize
= (extra_y_rows
/ 2) * dec
->cache_uv_stride_
;
244 const int y_offset
= cache_id
* 16 * dec
->cache_y_stride_
;
245 const int uv_offset
= cache_id
* 8 * dec
->cache_uv_stride_
;
246 uint8_t* const ydst
= dec
->cache_y_
- ysize
+ y_offset
;
247 uint8_t* const udst
= dec
->cache_u_
- uvsize
+ uv_offset
;
248 uint8_t* const vdst
= dec
->cache_v_
- uvsize
+ uv_offset
;
249 const int mb_y
= ctx
->mb_y_
;
250 const int is_first_row
= (mb_y
== 0);
251 const int is_last_row
= (mb_y
>= dec
->br_mb_y_
- 1);
253 if (dec
->mt_method_
== 2) {
254 ReconstructRow(dec
, ctx
);
257 if (ctx
->filter_row_
) {
265 if (io
->put
!= NULL
) {
266 int y_start
= MACROBLOCK_VPOS(mb_y
);
267 int y_end
= MACROBLOCK_VPOS(mb_y
+ 1);
269 y_start
-= extra_y_rows
;
274 io
->y
= dec
->cache_y_
+ y_offset
;
275 io
->u
= dec
->cache_u_
+ uv_offset
;
276 io
->v
= dec
->cache_v_
+ uv_offset
;
280 y_end
-= extra_y_rows
;
282 if (y_end
> io
->crop_bottom
) {
283 y_end
= io
->crop_bottom
; // make sure we don't overflow on last row.
286 if (dec
->alpha_data_
!= NULL
&& y_start
< y_end
) {
287 // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a
289 io
->a
= VP8DecompressAlphaRows(dec
, y_start
, y_end
- y_start
);
291 return VP8SetError(dec
, VP8_STATUS_BITSTREAM_ERROR
,
292 "Could not decode alpha data.");
295 if (y_start
< io
->crop_top
) {
296 const int delta_y
= io
->crop_top
- y_start
;
297 y_start
= io
->crop_top
;
298 assert(!(delta_y
& 1));
299 io
->y
+= dec
->cache_y_stride_
* delta_y
;
300 io
->u
+= dec
->cache_uv_stride_
* (delta_y
>> 1);
301 io
->v
+= dec
->cache_uv_stride_
* (delta_y
>> 1);
303 io
->a
+= io
->width
* delta_y
;
306 if (y_start
< y_end
) {
307 io
->y
+= io
->crop_left
;
308 io
->u
+= io
->crop_left
>> 1;
309 io
->v
+= io
->crop_left
>> 1;
311 io
->a
+= io
->crop_left
;
313 io
->mb_y
= y_start
- io
->crop_top
;
314 io
->mb_w
= io
->crop_right
- io
->crop_left
;
315 io
->mb_h
= y_end
- y_start
;
319 // rotate top samples if needed
320 if (cache_id
+ 1 == dec
->num_caches_
) {
322 memcpy(dec
->cache_y_
- ysize
, ydst
+ 16 * dec
->cache_y_stride_
, ysize
);
323 memcpy(dec
->cache_u_
- uvsize
, udst
+ 8 * dec
->cache_uv_stride_
, uvsize
);
324 memcpy(dec
->cache_v_
- uvsize
, vdst
+ 8 * dec
->cache_uv_stride_
, uvsize
);
331 #undef MACROBLOCK_VPOS
333 //------------------------------------------------------------------------------
335 int VP8ProcessRow(VP8Decoder
* const dec
, VP8Io
* const io
) {
337 VP8ThreadContext
* const ctx
= &dec
->thread_ctx_
;
338 const int filter_row
=
339 (dec
->filter_type_
> 0) &&
340 (dec
->mb_y_
>= dec
->tl_mb_y_
) && (dec
->mb_y_
<= dec
->br_mb_y_
);
341 if (dec
->mt_method_
== 0) {
342 // ctx->id_ and ctx->f_info_ are already set
343 ctx
->mb_y_
= dec
->mb_y_
;
344 ctx
->filter_row_
= filter_row
;
345 ReconstructRow(dec
, ctx
);
346 ok
= FinishRow(dec
, io
);
348 WebPWorker
* const worker
= &dec
->worker_
;
349 // Finish previous job *before* updating context
350 ok
&= WebPWorkerSync(worker
);
351 assert(worker
->status_
== OK
);
352 if (ok
) { // spawn a new deblocking/output job
354 ctx
->id_
= dec
->cache_id_
;
355 ctx
->mb_y_
= dec
->mb_y_
;
356 ctx
->filter_row_
= filter_row
;
357 if (dec
->mt_method_
== 2) { // swap macroblock data
358 VP8MBData
* const tmp
= ctx
->mb_data_
;
359 ctx
->mb_data_
= dec
->mb_data_
;
362 // perform reconstruction directly in main thread
363 ReconstructRow(dec
, ctx
);
365 if (filter_row
) { // swap filter info
366 VP8FInfo
* const tmp
= ctx
->f_info_
;
367 ctx
->f_info_
= dec
->f_info_
;
370 WebPWorkerLaunch(worker
); // (reconstruct)+filter in parallel
371 if (++dec
->cache_id_
== dec
->num_caches_
) {
379 //------------------------------------------------------------------------------
380 // Finish setting up the decoding parameter once user's setup() is called.
382 VP8StatusCode
VP8EnterCritical(VP8Decoder
* const dec
, VP8Io
* const io
) {
383 // Call setup() first. This may trigger additional decoding features on 'io'.
384 // Note: Afterward, we must call teardown() no matter what.
385 if (io
->setup
!= NULL
&& !io
->setup(io
)) {
386 VP8SetError(dec
, VP8_STATUS_USER_ABORT
, "Frame setup failed");
390 // Disable filtering per user request
391 if (io
->bypass_filtering
) {
392 dec
->filter_type_
= 0;
394 // TODO(skal): filter type / strength / sharpness forcing
396 // Define the area where we can skip in-loop filtering, in case of cropping.
398 // 'Simple' filter reads two luma samples outside of the macroblock
399 // and filters one. It doesn't filter the chroma samples. Hence, we can
400 // avoid doing the in-loop filtering before crop_top/crop_left position.
401 // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
402 // Means: there's a dependency chain that goes all the way up to the
403 // top-left corner of the picture (MB #0). We must filter all the previous
405 // TODO(skal): add an 'approximate_decoding' option, that won't produce
406 // a 1:1 bit-exactness for complex filtering?
408 const int extra_pixels
= kFilterExtraRows
[dec
->filter_type_
];
409 if (dec
->filter_type_
== 2) {
410 // For complex filter, we need to preserve the dependency chain.
414 // For simple filter, we can filter only the cropped region.
415 // We include 'extra_pixels' on the other side of the boundary, since
416 // vertical or horizontal filtering of the previous macroblock can
417 // modify some abutting pixels.
418 dec
->tl_mb_x_
= (io
->crop_left
- extra_pixels
) >> 4;
419 dec
->tl_mb_y_
= (io
->crop_top
- extra_pixels
) >> 4;
420 if (dec
->tl_mb_x_
< 0) dec
->tl_mb_x_
= 0;
421 if (dec
->tl_mb_y_
< 0) dec
->tl_mb_y_
= 0;
423 // We need some 'extra' pixels on the right/bottom.
424 dec
->br_mb_y_
= (io
->crop_bottom
+ 15 + extra_pixels
) >> 4;
425 dec
->br_mb_x_
= (io
->crop_right
+ 15 + extra_pixels
) >> 4;
426 if (dec
->br_mb_x_
> dec
->mb_w_
) {
427 dec
->br_mb_x_
= dec
->mb_w_
;
429 if (dec
->br_mb_y_
> dec
->mb_h_
) {
430 dec
->br_mb_y_
= dec
->mb_h_
;
433 PrecomputeFilterStrengths(dec
);
434 return VP8_STATUS_OK
;
437 int VP8ExitCritical(VP8Decoder
* const dec
, VP8Io
* const io
) {
439 if (dec
->mt_method_
> 0) {
440 ok
= WebPWorkerSync(&dec
->worker_
);
443 if (io
->teardown
!= NULL
) {
449 //------------------------------------------------------------------------------
450 // For multi-threaded decoding we need to use 3 rows of 16 pixels as delay line.
452 // Reason is: the deblocking filter cannot deblock the bottom horizontal edges
453 // immediately, and needs to wait for first few rows of the next macroblock to
454 // be decoded. Hence, deblocking is lagging behind by 4 or 8 pixels (depending
456 // With two threads, the vertical positions of the rows being decoded are:
457 // Decode: [ 0..15][16..31][32..47][48..63][64..79][...
458 // Deblock: [ 0..11][12..27][28..43][44..59][...
459 // If we use two threads and two caches of 16 pixels, the sequence would be:
460 // Decode: [ 0..15][16..31][ 0..15!!][16..31][ 0..15][...
461 // Deblock: [ 0..11][12..27!!][-4..11][12..27][...
462 // The problem occurs during row [12..15!!] that both the decoding and
463 // deblocking threads are writing simultaneously.
464 // With 3 cache lines, one get a safe write pattern:
465 // Decode: [ 0..15][16..31][32..47][ 0..15][16..31][32..47][0..
466 // Deblock: [ 0..11][12..27][28..43][-4..11][12..27][28...
467 // Note that multi-threaded output _without_ deblocking can make use of two
468 // cache lines of 16 pixels only, since there's no lagging behind. The decoding
469 // and output process have non-concurrent writing:
470 // Decode: [ 0..15][16..31][ 0..15][16..31][...
471 // io->put: [ 0..15][16..31][ 0..15][...
473 #define MT_CACHE_LINES 3
474 #define ST_CACHE_LINES 1 // 1 cache row only for single-threaded case
476 // Initialize multi/single-thread worker
477 static int InitThreadContext(VP8Decoder
* const dec
) {
479 if (dec
->mt_method_
> 0) {
480 WebPWorker
* const worker
= &dec
->worker_
;
481 if (!WebPWorkerReset(worker
)) {
482 return VP8SetError(dec
, VP8_STATUS_OUT_OF_MEMORY
,
483 "thread initialization failed.");
486 worker
->data2
= (void*)&dec
->thread_ctx_
.io_
;
487 worker
->hook
= (WebPWorkerHook
)FinishRow
;
489 (dec
->filter_type_
> 0) ? MT_CACHE_LINES
: MT_CACHE_LINES
- 1;
491 dec
->num_caches_
= ST_CACHE_LINES
;
496 int VP8GetThreadMethod(const WebPDecoderOptions
* const options
,
497 const WebPHeaderStructure
* const headers
,
498 int width
, int height
) {
499 if (options
== NULL
|| options
->use_threads
== 0) {
505 assert(!headers
->is_lossless
);
506 #if defined(WEBP_USE_THREAD)
507 if (width
< MIN_WIDTH_FOR_THREADS
) return 0;
508 // TODO(skal): tune the heuristic further
510 if (height
< 2 * width
) return 2;
513 #else // !WEBP_USE_THREAD
518 #undef MT_CACHE_LINES
519 #undef ST_CACHE_LINES
521 //------------------------------------------------------------------------------
524 static int AllocateMemory(VP8Decoder
* const dec
) {
525 const int num_caches
= dec
->num_caches_
;
526 const int mb_w
= dec
->mb_w_
;
527 // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
528 const size_t intra_pred_mode_size
= 4 * mb_w
* sizeof(uint8_t);
529 const size_t top_size
= sizeof(VP8TopSamples
) * mb_w
;
530 const size_t mb_info_size
= (mb_w
+ 1) * sizeof(VP8MB
);
531 const size_t f_info_size
=
532 (dec
->filter_type_
> 0) ?
533 mb_w
* (dec
->mt_method_
> 0 ? 2 : 1) * sizeof(VP8FInfo
)
535 const size_t yuv_size
= YUV_SIZE
* sizeof(*dec
->yuv_b_
);
536 const size_t mb_data_size
=
537 (dec
->mt_method_
== 2 ? 2 : 1) * mb_w
* sizeof(*dec
->mb_data_
);
538 const size_t cache_height
= (16 * num_caches
539 + kFilterExtraRows
[dec
->filter_type_
]) * 3 / 2;
540 const size_t cache_size
= top_size
* cache_height
;
541 // alpha_size is the only one that scales as width x height.
542 const uint64_t alpha_size
= (dec
->alpha_data_
!= NULL
) ?
543 (uint64_t)dec
->pic_hdr_
.width_
* dec
->pic_hdr_
.height_
: 0ULL;
544 const uint64_t needed
= (uint64_t)intra_pred_mode_size
545 + top_size
+ mb_info_size
+ f_info_size
546 + yuv_size
+ mb_data_size
547 + cache_size
+ alpha_size
+ ALIGN_MASK
;
550 if (needed
!= (size_t)needed
) return 0; // check for overflow
551 if (needed
> dec
->mem_size_
) {
554 dec
->mem_
= WebPSafeMalloc(needed
, sizeof(uint8_t));
555 if (dec
->mem_
== NULL
) {
556 return VP8SetError(dec
, VP8_STATUS_OUT_OF_MEMORY
,
557 "no memory during frame initialization.");
559 // down-cast is ok, thanks to WebPSafeAlloc() above.
560 dec
->mem_size_
= (size_t)needed
;
563 mem
= (uint8_t*)dec
->mem_
;
564 dec
->intra_t_
= (uint8_t*)mem
;
565 mem
+= intra_pred_mode_size
;
567 dec
->yuv_t_
= (VP8TopSamples
*)mem
;
570 dec
->mb_info_
= ((VP8MB
*)mem
) + 1;
573 dec
->f_info_
= f_info_size
? (VP8FInfo
*)mem
: NULL
;
575 dec
->thread_ctx_
.id_
= 0;
576 dec
->thread_ctx_
.f_info_
= dec
->f_info_
;
577 if (dec
->mt_method_
> 0) {
578 // secondary cache line. The deblocking process need to make use of the
579 // filtering strength from previous macroblock row, while the new ones
580 // are being decoded in parallel. We'll just swap the pointers.
581 dec
->thread_ctx_
.f_info_
+= mb_w
;
584 mem
= (uint8_t*)((uintptr_t)(mem
+ ALIGN_MASK
) & ~ALIGN_MASK
);
585 assert((yuv_size
& ALIGN_MASK
) == 0);
586 dec
->yuv_b_
= (uint8_t*)mem
;
589 dec
->mb_data_
= (VP8MBData
*)mem
;
590 dec
->thread_ctx_
.mb_data_
= (VP8MBData
*)mem
;
591 if (dec
->mt_method_
== 2) {
592 dec
->thread_ctx_
.mb_data_
+= mb_w
;
596 dec
->cache_y_stride_
= 16 * mb_w
;
597 dec
->cache_uv_stride_
= 8 * mb_w
;
599 const int extra_rows
= kFilterExtraRows
[dec
->filter_type_
];
600 const int extra_y
= extra_rows
* dec
->cache_y_stride_
;
601 const int extra_uv
= (extra_rows
/ 2) * dec
->cache_uv_stride_
;
602 dec
->cache_y_
= ((uint8_t*)mem
) + extra_y
;
603 dec
->cache_u_
= dec
->cache_y_
604 + 16 * num_caches
* dec
->cache_y_stride_
+ extra_uv
;
605 dec
->cache_v_
= dec
->cache_u_
606 + 8 * num_caches
* dec
->cache_uv_stride_
+ extra_uv
;
612 dec
->alpha_plane_
= alpha_size
? (uint8_t*)mem
: NULL
;
614 assert(mem
<= (uint8_t*)dec
->mem_
+ dec
->mem_size_
);
616 // note: left/top-info is initialized once for all.
617 memset(dec
->mb_info_
- 1, 0, mb_info_size
);
618 VP8InitScanline(dec
); // initialize left too.
621 memset(dec
->intra_t_
, B_DC_PRED
, intra_pred_mode_size
);
626 static void InitIo(VP8Decoder
* const dec
, VP8Io
* io
) {
629 io
->y
= dec
->cache_y_
;
630 io
->u
= dec
->cache_u_
;
631 io
->v
= dec
->cache_v_
;
632 io
->y_stride
= dec
->cache_y_stride_
;
633 io
->uv_stride
= dec
->cache_uv_stride_
;
637 int VP8InitFrame(VP8Decoder
* const dec
, VP8Io
* io
) {
638 if (!InitThreadContext(dec
)) return 0; // call first. Sets dec->num_caches_.
639 if (!AllocateMemory(dec
)) return 0;
641 VP8DspInit(); // Init critical function pointers and look-up tables.
645 //------------------------------------------------------------------------------
646 // Main reconstruction function.
648 static const int kScan
[16] = {
649 0 + 0 * BPS
, 4 + 0 * BPS
, 8 + 0 * BPS
, 12 + 0 * BPS
,
650 0 + 4 * BPS
, 4 + 4 * BPS
, 8 + 4 * BPS
, 12 + 4 * BPS
,
651 0 + 8 * BPS
, 4 + 8 * BPS
, 8 + 8 * BPS
, 12 + 8 * BPS
,
652 0 + 12 * BPS
, 4 + 12 * BPS
, 8 + 12 * BPS
, 12 + 12 * BPS
655 static int CheckMode(int mb_x
, int mb_y
, int mode
) {
656 if (mode
== B_DC_PRED
) {
658 return (mb_y
== 0) ? B_DC_PRED_NOTOPLEFT
: B_DC_PRED_NOLEFT
;
660 return (mb_y
== 0) ? B_DC_PRED_NOTOP
: B_DC_PRED
;
// Copy 4 bytes (one packed 32-bit word) from 'src' to 'dst'.
static void Copy32b(uint8_t* dst, uint8_t* src) {
  memcpy(dst, src, 4);
}
670 static WEBP_INLINE
void DoTransform(uint32_t bits
, const int16_t* const src
,
671 uint8_t* const dst
) {
672 switch (bits
>> 30) {
674 VP8Transform(src
, dst
, 0);
677 VP8TransformAC3(src
, dst
);
680 VP8TransformDC(src
, dst
);
// Apply the inverse transform for the four 4x4 blocks of one chroma plane.
// The low byte of 'bits' flags non-zero coefficients; odd bits flag AC ones.
static void DoUVTransform(uint32_t bits, const int16_t* const src,
                          uint8_t* const dst) {
  if (bits & 0xff) {    // any non-zero coeff at all?
    if (bits & 0xaa) {  // any non-zero AC coefficient?
      VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
    } else {
      VP8TransformDCUV(src, dst);
    }
  }
}
698 static void ReconstructRow(const VP8Decoder
* const dec
,
699 const VP8ThreadContext
* ctx
) {
702 const int mb_y
= ctx
->mb_y_
;
703 const int cache_id
= ctx
->id_
;
704 uint8_t* const y_dst
= dec
->yuv_b_
+ Y_OFF
;
705 uint8_t* const u_dst
= dec
->yuv_b_
+ U_OFF
;
706 uint8_t* const v_dst
= dec
->yuv_b_
+ V_OFF
;
707 for (mb_x
= 0; mb_x
< dec
->mb_w_
; ++mb_x
) {
708 const VP8MBData
* const block
= ctx
->mb_data_
+ mb_x
;
710 // Rotate in the left samples from previously decoded block. We move four
711 // pixels at a time for alignment reason, and because of in-loop filter.
713 for (j
= -1; j
< 16; ++j
) {
714 Copy32b(&y_dst
[j
* BPS
- 4], &y_dst
[j
* BPS
+ 12]);
716 for (j
= -1; j
< 8; ++j
) {
717 Copy32b(&u_dst
[j
* BPS
- 4], &u_dst
[j
* BPS
+ 4]);
718 Copy32b(&v_dst
[j
* BPS
- 4], &v_dst
[j
* BPS
+ 4]);
721 for (j
= 0; j
< 16; ++j
) {
722 y_dst
[j
* BPS
- 1] = 129;
724 for (j
= 0; j
< 8; ++j
) {
725 u_dst
[j
* BPS
- 1] = 129;
726 v_dst
[j
* BPS
- 1] = 129;
728 // Init top-left sample on left column too
730 y_dst
[-1 - BPS
] = u_dst
[-1 - BPS
] = v_dst
[-1 - BPS
] = 129;
734 // bring top samples into the cache
735 VP8TopSamples
* const top_yuv
= dec
->yuv_t_
+ mb_x
;
736 const int16_t* const coeffs
= block
->coeffs_
;
737 uint32_t bits
= block
->non_zero_y_
;
741 memcpy(y_dst
- BPS
, top_yuv
[0].y
, 16);
742 memcpy(u_dst
- BPS
, top_yuv
[0].u
, 8);
743 memcpy(v_dst
- BPS
, top_yuv
[0].v
, 8);
744 } else if (mb_x
== 0) {
745 // we only need to do this init once at block (0,0).
746 // Afterward, it remains valid for the whole topmost row.
747 memset(y_dst
- BPS
- 1, 127, 16 + 4 + 1);
748 memset(u_dst
- BPS
- 1, 127, 8 + 1);
749 memset(v_dst
- BPS
- 1, 127, 8 + 1);
752 // predict and add residuals
753 if (block
->is_i4x4_
) { // 4x4
754 uint32_t* const top_right
= (uint32_t*)(y_dst
- BPS
+ 16);
757 if (mb_x
>= dec
->mb_w_
- 1) { // on rightmost border
758 memset(top_right
, top_yuv
[0].y
[15], sizeof(*top_right
));
760 memcpy(top_right
, top_yuv
[1].y
, sizeof(*top_right
));
763 // replicate the top-right pixels below
764 top_right
[BPS
] = top_right
[2 * BPS
] = top_right
[3 * BPS
] = top_right
[0];
766 // predict and add residuals for all 4x4 blocks in turn.
767 for (n
= 0; n
< 16; ++n
, bits
<<= 2) {
768 uint8_t* const dst
= y_dst
+ kScan
[n
];
769 VP8PredLuma4
[block
->imodes_
[n
]](dst
);
770 DoTransform(bits
, coeffs
+ n
* 16, dst
);
773 const int pred_func
= CheckMode(mb_x
, mb_y
,
775 VP8PredLuma16
[pred_func
](y_dst
);
777 for (n
= 0; n
< 16; ++n
, bits
<<= 2) {
778 DoTransform(bits
, coeffs
+ n
* 16, y_dst
+ kScan
[n
]);
784 const uint32_t bits_uv
= block
->non_zero_uv_
;
785 const int pred_func
= CheckMode(mb_x
, mb_y
, block
->uvmode_
);
786 VP8PredChroma8
[pred_func
](u_dst
);
787 VP8PredChroma8
[pred_func
](v_dst
);
788 DoUVTransform(bits_uv
>> 0, coeffs
+ 16 * 16, u_dst
);
789 DoUVTransform(bits_uv
>> 8, coeffs
+ 20 * 16, v_dst
);
792 // stash away top samples for next block
793 if (mb_y
< dec
->mb_h_
- 1) {
794 memcpy(top_yuv
[0].y
, y_dst
+ 15 * BPS
, 16);
795 memcpy(top_yuv
[0].u
, u_dst
+ 7 * BPS
, 8);
796 memcpy(top_yuv
[0].v
, v_dst
+ 7 * BPS
, 8);
799 // Transfer reconstructed samples from yuv_b_ cache to final destination.
801 const int y_offset
= cache_id
* 16 * dec
->cache_y_stride_
;
802 const int uv_offset
= cache_id
* 8 * dec
->cache_uv_stride_
;
803 uint8_t* const y_out
= dec
->cache_y_
+ mb_x
* 16 + y_offset
;
804 uint8_t* const u_out
= dec
->cache_u_
+ mb_x
* 8 + uv_offset
;
805 uint8_t* const v_out
= dec
->cache_v_
+ mb_x
* 8 + uv_offset
;
806 for (j
= 0; j
< 16; ++j
) {
807 memcpy(y_out
+ j
* dec
->cache_y_stride_
, y_dst
+ j
* BPS
, 16);
809 for (j
= 0; j
< 8; ++j
) {
810 memcpy(u_out
+ j
* dec
->cache_uv_stride_
, u_dst
+ j
* BPS
, 8);
811 memcpy(v_out
+ j
* dec
->cache_uv_stride_
, v_dst
+ j
* BPS
, 8);
817 //------------------------------------------------------------------------------