// SPDX-License-Identifier: LGPL-2.1+
/*
 * Copyright 2016 Tom aan de Wiel
 * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 *
 * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
 *
 * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
 * R.D. Brown, 1977
 */

#include <linux/string.h>
#include <linux/kernel.h>
#include "codec-fwht.h"

#define OVERFLOW_BIT BIT(14)
/*
 * Note: bit 0 of the header must always be 0. Otherwise it cannot
 * be guaranteed that the magic 8 byte sequence (see below) can
 * never occur in the rlc output.
 */
#define PFRAME_BIT BIT(15)
#define DUPS_MASK 0x1ffe

#define PBLOCK 0
#define IBLOCK 1

#define ALL_ZEROS 15
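/*
 * Layout of the 16-bit block header as used by rlc()/derlc() and
 * encode_plane() below (a summary of the code, not a separate spec):
 *
 *   bit 15      PFRAME_BIT: set for P-coded (delta) blocks
 *   bit 14      OVERFLOW_BIT: returned by derlc() on malformed input
 *   bits 12..1  DUPS_MASK: number of duplicate blocks that follow,
 *               stored pre-shifted by 1 (hence "hdr + 2" per duplicate)
 *   bit 0       always 0, so the magic byte sequence cannot appear
 *
 * Example: a P-block followed by 3 identical blocks carries the header
 * PFRAME_BIT | (3 << 1).
 */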
static const uint8_t zigzag[64] = {
	0,
	1, 8,
	2, 9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
static int rlc(const s16 *in, __be16 *output, int blocktype)
{
	s16 block[8 * 8];
	s16 *wp = block;
	int lastzero_run = 0;
	int to_encode;
	int i = 0;
	int x, y;
	int ret = 0;

	/* read in block from framebuffer */
	for (y = 0; y < 8; y++) {
		for (x = 0; x < 8; x++) {
			*wp = in[x + y * 8];
			wp++;
		}
	}

	/* keep track of amount of trailing zeros */
	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
		lastzero_run++;

	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
	ret++;

	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);

	i = 0;
	while (i < to_encode) {
		int cnt = 0;
		int tmp;

		/* count leading zeros */
		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
			cnt++;
			i++;
			if (i == to_encode) {
				cnt--;
				break;
			}
		}
		/* 4 bits for run, 12 for coefficient (quantization by 4) */
		*output++ = htons((cnt | tmp << 4));
		i++;
		ret++;
	}
	if (lastzero_run > 14) {
		*output = htons(ALL_ZEROS | 0);
		ret++;
	}

	return ret;
}
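/*
 * Worked example of the run-length coding above: after the header,
 * each 16-bit codeword holds a zero-run length in bits 0..3 and the
 * following coefficient in bits 4..15. Three zeros followed by the
 * coefficient 5 are emitted as htons(3 | (5 << 4)). A run length of
 * ALL_ZEROS (15) means "the remainder of the block is zero", which is
 * why rlc() drops a trailing zero run longer than 14 and derlc()
 * zero-fills the rest of the block when it sees it.
 */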
/*
 * This function will worst-case increase rlc_in by 65*2 bytes:
 * one s16 value for the header and 8 * 8 coefficients of type s16.
 */
static u16 derlc(const __be16 **rlc_in, s16 *dwht_out,
		 const __be16 *end_of_input)
{
	/* header */
	const __be16 *input = *rlc_in;
	u16 stat;
	int dec_count = 0;
	s16 block[8 * 8 + 16];
	s16 *wp = block;
	int i;

	if (input > end_of_input)
		return OVERFLOW_BIT;
	stat = ntohs(*input++);

	/*
	 * Now de-compress, it expands one byte to up to 15 bytes
	 * (or fills the remainder of the 64 bytes with zeroes if it
	 * is the last byte to expand).
	 *
	 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
	 * allow for overflow if the incoming data was malformed.
	 */
	while (dec_count < 8 * 8) {
		s16 in;
		int length;
		int coeff;

		if (input > end_of_input)
			return OVERFLOW_BIT;
		in = ntohs(*input++);
		length = in & 0xf;
		coeff = in >> 4;

		/* fill remainder with zeros */
		if (length == 15) {
			for (i = 0; i < 64 - dec_count; i++)
				*wp++ = 0;
			break;
		}

		for (i = 0; i < length; i++)
			*wp++ = 0;
		*wp++ = coeff;
		dec_count += length + 1;
	}

	wp = block;

	for (i = 0; i < 64; i++) {
		int pos = zigzag[i];
		int y = pos / 8;
		int x = pos % 8;

		dwht_out[x + y * 8] = *wp++;
	}
	*rlc_in = input;
	return stat;
}
static const int quant_table[] = {
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 3,
	2, 2, 2, 2, 2, 2, 3, 6,
	2, 2, 2, 2, 2, 3, 6, 6,
	2, 2, 2, 2, 3, 6, 6, 6,
	2, 2, 2, 3, 6, 6, 6, 6,
	2, 2, 3, 6, 6, 6, 6, 8,
};
static const int quant_table_p[] = {
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 6,
	3, 3, 3, 3, 3, 3, 6, 6,
	3, 3, 3, 3, 3, 6, 6, 9,
	3, 3, 3, 3, 6, 6, 9, 9,
	3, 3, 3, 6, 6, 9, 9, 10,
};
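/*
 * The tables above give the per-coefficient quantization shift:
 * quantize_*() shifts each transformed coefficient right by the table
 * value (and zeroes it when it lands inside the +/-qp dead zone), and
 * dequantize_*() shifts it back left. Example with a shift of 3: a
 * coefficient of 100 is stored as 100 >> 3 = 12 and reconstructed as
 * 12 << 3 = 96.
 */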
static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp)
{
	const int *quant = quant_table;
	int i, j;

	for (j = 0; j < 8; j++) {
		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
			*coeff >>= *quant;
			if (*coeff >= -qp && *coeff <= qp)
				*coeff = *de_coeff = 0;
			else
				*de_coeff = *coeff << *quant;
		}
	}
}
static void dequantize_intra(s16 *coeff)
{
	const int *quant = quant_table;
	int i, j;

	for (j = 0; j < 8; j++)
		for (i = 0; i < 8; i++, quant++, coeff++)
			*coeff <<= *quant;
}
static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp)
{
	const int *quant = quant_table_p;
	int i, j;

	for (j = 0; j < 8; j++) {
		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
			*coeff >>= *quant;
			if (*coeff >= -qp && *coeff <= qp)
				*coeff = *de_coeff = 0;
			else
				*de_coeff = *coeff << *quant;
		}
	}
}
static void dequantize_inter(s16 *coeff)
{
	const int *quant = quant_table_p;
	int i, j;

	for (j = 0; j < 8; j++)
		for (i = 0; i < 8; i++, quant++, coeff++)
			*coeff <<= *quant;
}
static void fwht(const u8 *block, s16 *output_block, unsigned int stride,
		 unsigned int input_step, bool intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const u8 *tmp = block;
	s16 *out = output_block;
	int add = intra ? 256 : 0;
	unsigned int i;
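	/*
	 * For intra blocks, each even workspace1 entry below sums two
	 * unsigned 8-bit samples, so subtracting add = 2 * 128 = 256
	 * centers the result around zero; the odd entries are
	 * differences and need no correction.
	 */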
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		switch (input_step) {
		case 1:
			workspace1[0] = tmp[0] + tmp[1] - add;
			workspace1[1] = tmp[0] - tmp[1];

			workspace1[2] = tmp[2] + tmp[3] - add;
			workspace1[3] = tmp[2] - tmp[3];

			workspace1[4] = tmp[4] + tmp[5] - add;
			workspace1[5] = tmp[4] - tmp[5];

			workspace1[6] = tmp[6] + tmp[7] - add;
			workspace1[7] = tmp[6] - tmp[7];
			break;
		case 2:
			workspace1[0] = tmp[0] + tmp[2] - add;
			workspace1[1] = tmp[0] - tmp[2];

			workspace1[2] = tmp[4] + tmp[6] - add;
			workspace1[3] = tmp[4] - tmp[6];

			workspace1[4] = tmp[8] + tmp[10] - add;
			workspace1[5] = tmp[8] - tmp[10];

			workspace1[6] = tmp[12] + tmp[14] - add;
			workspace1[7] = tmp[12] - tmp[14];
			break;
		case 3:
			workspace1[0] = tmp[0] + tmp[3] - add;
			workspace1[1] = tmp[0] - tmp[3];

			workspace1[2] = tmp[6] + tmp[9] - add;
			workspace1[3] = tmp[6] - tmp[9];

			workspace1[4] = tmp[12] + tmp[15] - add;
			workspace1[5] = tmp[12] - tmp[15];

			workspace1[6] = tmp[18] + tmp[21] - add;
			workspace1[7] = tmp[18] - tmp[21];
			break;
		default:
			workspace1[0] = tmp[0] + tmp[4] - add;
			workspace1[1] = tmp[0] - tmp[4];

			workspace1[2] = tmp[8] + tmp[12] - add;
			workspace1[3] = tmp[8] - tmp[12];

			workspace1[4] = tmp[16] + tmp[20] - add;
			workspace1[5] = tmp[16] - tmp[20];

			workspace1[6] = tmp[24] + tmp[28] - add;
			workspace1[7] = tmp[24] - tmp[28];
			break;
		}

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}
	out = output_block;
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0] = out[0] + out[1 * 8];
		workspace1[1] = out[0] - out[1 * 8];

		workspace1[2] = out[2 * 8] + out[3 * 8];
		workspace1[3] = out[2 * 8] - out[3 * 8];

		workspace1[4] = out[4 * 8] + out[5 * 8];
		workspace1[5] = out[4 * 8] - out[5 * 8];

		workspace1[6] = out[6 * 8] + out[7 * 8];
		workspace1[7] = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0 * 8] = workspace2[0] + workspace2[4];
		out[1 * 8] = workspace2[0] - workspace2[4];
		out[2 * 8] = workspace2[1] - workspace2[5];
		out[3 * 8] = workspace2[1] + workspace2[5];
		out[4 * 8] = workspace2[2] + workspace2[6];
		out[5 * 8] = workspace2[2] - workspace2[6];
		out[6 * 8] = workspace2[3] - workspace2[7];
		out[7 * 8] = workspace2[3] + workspace2[7];
	}
}
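/*
 * The transform above is separable: the first pass runs the 8-point
 * WHT butterflies over each row (input_step selects how far apart the
 * samples of one pixel component sit, for packed formats), the second
 * pass runs the same butterflies down each column in place.
 */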
/*
 * Not the nicest way of doing it, but P-blocks get twice the range of
 * the I-blocks. Therefore we need a type bigger than 8 bits.
 * Furthermore values can be negative... This is just a version that
 * works with 16-bit signed data.
 */
static void fwht16(const s16 *block, s16 *output_block, int stride, int intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		/* stage 1 */
		workspace1[0] = tmp[0] + tmp[1];
		workspace1[1] = tmp[0] - tmp[1];

		workspace1[2] = tmp[2] + tmp[3];
		workspace1[3] = tmp[2] - tmp[3];

		workspace1[4] = tmp[4] + tmp[5];
		workspace1[5] = tmp[4] - tmp[5];

		workspace1[6] = tmp[6] + tmp[7];
		workspace1[7] = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0] = out[0] + out[1 * 8];
		workspace1[1] = out[0] - out[1 * 8];

		workspace1[2] = out[2 * 8] + out[3 * 8];
		workspace1[3] = out[2 * 8] - out[3 * 8];

		workspace1[4] = out[4 * 8] + out[5 * 8];
		workspace1[5] = out[4 * 8] - out[5 * 8];

		workspace1[6] = out[6 * 8] + out[7 * 8];
		workspace1[7] = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0 * 8] = workspace2[0] + workspace2[4];
		out[1 * 8] = workspace2[0] - workspace2[4];
		out[2 * 8] = workspace2[1] - workspace2[5];
		out[3 * 8] = workspace2[1] + workspace2[5];
		out[4 * 8] = workspace2[2] + workspace2[6];
		out[5 * 8] = workspace2[2] - workspace2[6];
		out[6 * 8] = workspace2[3] - workspace2[7];
		out[7 * 8] = workspace2[3] + workspace2[7];
	}
}
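/*
 * fwht16() is the same butterfly network as fwht(), but it reads s16
 * residuals instead of u8 samples, so no DC bias ("add") has to be
 * subtracted; encode_plane() uses it for the P-block delta data.
 */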
static void ifwht(const s16 *block, s16 *output_block, int intra)
{
	/*
	 * we'll need more than 8 bits for the transformed coefficients
	 * use native unit of cpu
	 */
	int workspace1[8], workspace2[8];
	int inter = intra ? 0 : 1;
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;

	for (i = 0; i < 8; i++, tmp += 8, out += 8) {
		/* stage 1 */
		workspace1[0] = tmp[0] + tmp[1];
		workspace1[1] = tmp[0] - tmp[1];

		workspace1[2] = tmp[2] + tmp[3];
		workspace1[3] = tmp[2] - tmp[3];

		workspace1[4] = tmp[4] + tmp[5];
		workspace1[5] = tmp[4] - tmp[5];

		workspace1[6] = tmp[6] + tmp[7];
		workspace1[7] = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0] = out[0] + out[1 * 8];
		workspace1[1] = out[0] - out[1 * 8];

		workspace1[2] = out[2 * 8] + out[3 * 8];
		workspace1[3] = out[2 * 8] - out[3 * 8];

		workspace1[4] = out[4 * 8] + out[5 * 8];
		workspace1[5] = out[4 * 8] - out[5 * 8];

		workspace1[6] = out[6 * 8] + out[7 * 8];
		workspace1[7] = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		if (inter) {
			int d;

			out[0 * 8] = workspace2[0] + workspace2[4];
			out[1 * 8] = workspace2[0] - workspace2[4];
			out[2 * 8] = workspace2[1] - workspace2[5];
			out[3 * 8] = workspace2[1] + workspace2[5];
			out[4 * 8] = workspace2[2] + workspace2[6];
			out[5 * 8] = workspace2[2] - workspace2[6];
			out[6 * 8] = workspace2[3] - workspace2[7];
			out[7 * 8] = workspace2[3] + workspace2[7];

			for (d = 0; d < 8; d++)
				out[8 * d] >>= 6;
		} else {
			int d;

			out[0 * 8] = workspace2[0] + workspace2[4];
			out[1 * 8] = workspace2[0] - workspace2[4];
			out[2 * 8] = workspace2[1] - workspace2[5];
			out[3 * 8] = workspace2[1] + workspace2[5];
			out[4 * 8] = workspace2[2] + workspace2[6];
			out[5 * 8] = workspace2[2] - workspace2[6];
			out[6 * 8] = workspace2[3] - workspace2[7];
			out[7 * 8] = workspace2[3] + workspace2[7];

			for (d = 0; d < 8; d++) {
				out[8 * d] >>= 6;
				out[8 * d] += 128;
			}
		}
	}
}
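/*
 * The final >> 6 (divide by 64 = 8 * 8) normalizes the round trip
 * through fwht()/fwht16() and ifwht(), which are both unnormalized;
 * for intra blocks the 128 bias removed in fwht() is added back,
 * while inter blocks stay signed deltas for add_deltas().
 */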
static void fill_encoder_block(const u8 *input, s16 *dst,
			       unsigned int stride, unsigned int input_step)
{
	int i, j;

	for (i = 0; i < 8; i++) {
		for (j = 0; j < 8; j++, input += input_step)
			*dst++ = *input;
		input += stride - 8 * input_step;
	}
}
static int var_intra(const s16 *input)
{
	int mean = 0;
	int ret = 0;
	const s16 *tmp = input;
	int i;

	for (i = 0; i < 8 * 8; i++, tmp++)
		mean += *tmp;
	mean /= 64;
	tmp = input;
	for (i = 0; i < 8 * 8; i++, tmp++)
		ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
	return ret;
}
static int var_inter(const s16 *old, const s16 *new)
{
	int ret = 0;
	int i;

	for (i = 0; i < 8 * 8; i++, old++, new++)
		ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
	return ret;
}
static int decide_blocktype(const u8 *cur, const u8 *reference,
			    s16 *deltablock, unsigned int stride,
			    unsigned int input_step)
{
	s16 tmp[64];
	s16 old[64];
	s16 *work = tmp;
	unsigned int k, l;
	int vari;
	int vard;

	fill_encoder_block(cur, tmp, stride, input_step);
	fill_encoder_block(reference, old, 8, 1);
	vari = var_intra(tmp);

	for (k = 0; k < 8; k++) {
		for (l = 0; l < 8; l++) {
			*deltablock = *work - *reference;
			deltablock++;
			work++;
			reference++;
		}
	}
	vard = var_inter(old, tmp);
	return vari <= vard ? IBLOCK : PBLOCK;
}
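/*
 * The mode decision above is a simple sum-of-absolute-differences
 * test: var_intra() measures how far the block deviates from its own
 * mean, var_inter() how far it deviates from the reference block.
 * The cheaper of the two (ties go to intra) decides whether the block
 * is coded standalone or as a delta against the reference.
 */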
static void fill_decoder_block(u8 *dst, const s16 *input, int stride)
{
	int i, j;

	for (i = 0; i < 8; i++) {
		for (j = 0; j < 8; j++, input++, dst++) {
			if (*input < 0)
				*dst = 0;
			else if (*input > 255)
				*dst = 255;
			else
				*dst = *input;
		}
		dst += stride - 8;
	}
}
static void add_deltas(s16 *deltas, const u8 *ref, int stride)
{
	int k, l;

	for (k = 0; k < 8; k++) {
		for (l = 0; l < 8; l++) {
			*deltas += *ref++;
			/*
			 * Due to quantizing, it might be possible that the
			 * decoded coefficients are slightly out of range
			 */
			if (*deltas < 0)
				*deltas = 0;
			else if (*deltas > 255)
				*deltas = 255;
			deltas++;
		}
		ref += stride - 8;
	}
}
static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
			struct fwht_cframe *cf, u32 height, u32 width,
			u32 stride, unsigned int input_step,
			bool is_intra, bool next_is_intra)
{
	u8 *input_start = input;
	__be16 *rlco_start = *rlco;
	s16 deltablock[64];
	__be16 pframe_bit = htons(PFRAME_BIT);
	u32 encoding = 0;
	unsigned int last_size = 0;
	unsigned int i, j;

	width = round_up(width, 8);
	height = round_up(height, 8);

	for (j = 0; j < height / 8; j++) {
		input = input_start + j * 8 * stride;
		for (i = 0; i < width / 8; i++) {
			/* intra code, first frame is always intra coded. */
			int blocktype = IBLOCK;
			unsigned int size;

			if (!is_intra)
				blocktype = decide_blocktype(input, refp,
					deltablock, stride, input_step);
			if (blocktype == IBLOCK) {
				fwht(input, cf->coeffs, stride, input_step, 1);
				quantize_intra(cf->coeffs, cf->de_coeffs,
					       cf->i_frame_qp);
			} else {
				/* inter code */
				encoding |= FWHT_FRAME_PCODED;
				fwht16(deltablock, cf->coeffs, 8, 0);
				quantize_inter(cf->coeffs, cf->de_coeffs,
					       cf->p_frame_qp);
			}
			if (!next_is_intra) {
				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);

				if (blocktype == PBLOCK)
					add_deltas(cf->de_fwht, refp, 8);
				fill_decoder_block(refp, cf->de_fwht, 8);
			}

			input += 8 * input_step;
			refp += 8 * 8;

			size = rlc(cf->coeffs, *rlco, blocktype);
			if (last_size == size &&
			    !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
				__be16 *last_rlco = *rlco - size;
				s16 hdr = ntohs(*last_rlco);

				if (!((*last_rlco ^ **rlco) & pframe_bit) &&
				    (hdr & DUPS_MASK) < DUPS_MASK)
					*last_rlco = htons(hdr + 2);
				else
					*rlco += size;
			} else {
				*rlco += size;
			}
			if (*rlco >= rlco_max) {
				encoding |= FWHT_FRAME_UNENCODED;
				goto exit_loop;
			}
			last_size = size;
		}
	}

exit_loop:
	if (encoding & FWHT_FRAME_UNENCODED) {
		u8 *out = (u8 *)rlco_start;
		u8 *p;

		input = input_start;
		/*
		 * The compressed stream should never contain the magic
		 * header, so when we copy the YUV data we replace 0xff
		 * by 0xfe. Since YUV is limited range such values
		 * shouldn't appear anyway.
		 */
		for (j = 0; j < height; j++) {
			for (i = 0, p = input; i < width; i++, p += input_step)
				*out++ = (*p == 0xff) ? 0xfe : *p;
			input += stride;
		}
		*rlco = (__be16 *)out;
		encoding &= ~FWHT_FRAME_PCODED;
	}
	return encoding;
}
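/*
 * Worked example of the duplicate-run folding in encode_plane(): when
 * a block RLC-encodes to exactly the same codewords as the previous
 * block and both headers agree on PFRAME_BIT, the new copy is dropped
 * and the previous header's duplicate count (DUPS_MASK, bits 1..12)
 * is bumped via "hdr + 2", i.e. one pre-shifted duplicate, until the
 * field saturates; decode_plane() replays it with
 * "copies = (stat & DUPS_MASK) >> 1".
 */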
u32 fwht_encode_frame(struct fwht_raw_frame *frm,
		      struct fwht_raw_frame *ref_frm,
		      struct fwht_cframe *cf,
		      bool is_intra, bool next_is_intra,
		      unsigned int width, unsigned int height,
		      unsigned int stride, unsigned int chroma_stride)
{
	unsigned int size = height * width;
	__be16 *rlco = cf->rlc_data;
	__be16 *rlco_max;
	u32 encoding;

	rlco_max = rlco + size / 2 - 256;
	encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
				height, width, stride,
				frm->luma_alpha_step, is_intra, next_is_intra);
	if (encoding & FWHT_FRAME_UNENCODED)
		encoding |= FWHT_LUMA_UNENCODED;
	encoding &= ~FWHT_FRAME_UNENCODED;

	if (frm->components_num >= 3) {
		u32 chroma_h = height / frm->height_div;
		u32 chroma_w = width / frm->width_div;
		unsigned int chroma_size = chroma_h * chroma_w;

		rlco_max = rlco + chroma_size / 2 - 256;
		encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max,
					 cf, chroma_h, chroma_w,
					 chroma_stride, frm->chroma_step,
					 is_intra, next_is_intra);
		if (encoding & FWHT_FRAME_UNENCODED)
			encoding |= FWHT_CB_UNENCODED;
		encoding &= ~FWHT_FRAME_UNENCODED;
		rlco_max = rlco + chroma_size / 2 - 256;
		encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max,
					 cf, chroma_h, chroma_w,
					 chroma_stride, frm->chroma_step,
					 is_intra, next_is_intra);
		if (encoding & FWHT_FRAME_UNENCODED)
			encoding |= FWHT_CR_UNENCODED;
		encoding &= ~FWHT_FRAME_UNENCODED;
	}

	if (frm->components_num == 4) {
		rlco_max = rlco + size / 2 - 256;
		encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco,
					 rlco_max, cf, height, width,
					 stride, frm->luma_alpha_step,
					 is_intra, next_is_intra);
		if (encoding & FWHT_FRAME_UNENCODED)
			encoding |= FWHT_ALPHA_UNENCODED;
		encoding &= ~FWHT_FRAME_UNENCODED;
	}

	cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
	return encoding;
}
static bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco, u8 *ref,
			 u32 height, u32 width, u32 coded_width,
			 bool uncompressed, const __be16 *end_of_rlco_buf)
{
	unsigned int copies = 0;
	s16 copy[8 * 8];
	u16 stat;
	unsigned int i, j;

	width = round_up(width, 8);
	height = round_up(height, 8);

	if (uncompressed) {
		if (end_of_rlco_buf + 1 < *rlco + width * height / 2)
			return false;
		memcpy(ref, *rlco, width * height);
		*rlco += width * height / 2;
		return true;
	}

	/*
	 * When decoding each macroblock the rlco pointer will be increased
	 * by 65 * 2 bytes worst-case.
	 * To avoid overflow the buffer has to be 65/64th of the actual raw
	 * image size, just in case someone feeds it malicious data.
	 */
	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			u8 *refp = ref + j * 8 * coded_width + i * 8;

			if (copies) {
				memcpy(cf->de_fwht, copy, sizeof(copy));
				if (stat & PFRAME_BIT)
					add_deltas(cf->de_fwht, refp,
						   coded_width);
				fill_decoder_block(refp, cf->de_fwht,
						   coded_width);
				copies--;
				continue;
			}

			stat = derlc(rlco, cf->coeffs, end_of_rlco_buf);
			if (stat & OVERFLOW_BIT)
				return false;
			if (stat & PFRAME_BIT)
				dequantize_inter(cf->coeffs);
			else
				dequantize_intra(cf->coeffs);

			ifwht(cf->coeffs, cf->de_fwht,
			      (stat & PFRAME_BIT) ? 0 : 1);

			copies = (stat & DUPS_MASK) >> 1;
			if (copies)
				memcpy(copy, cf->de_fwht, sizeof(copy));
			if (stat & PFRAME_BIT)
				add_deltas(cf->de_fwht, refp, coded_width);
			fill_decoder_block(refp, cf->de_fwht, coded_width);
		}
	}
	return true;
}
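/*
 * Arithmetic behind the 65/64 rule above: an 8x8 block covers 64
 * pixels (64 bytes raw) but may decode from up to 65 16-bit words
 * (1 header + 64 coefficients), so a hostile stream can only be
 * walked safely if the compressed buffer is allocated at 65/64th of
 * the raw plane size.
 */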
bool fwht_decode_frame(struct fwht_cframe *cf, struct fwht_raw_frame *ref,
		       u32 hdr_flags, unsigned int components_num,
		       unsigned int width, unsigned int height,
		       unsigned int coded_width)
{
	const __be16 *rlco = cf->rlc_data;
	const __be16 *end_of_rlco_buf = cf->rlc_data +
			(cf->size / sizeof(*rlco)) - 1;

	if (!decode_plane(cf, &rlco, ref->luma, height, width, coded_width,
			  hdr_flags & FWHT_FL_LUMA_IS_UNCOMPRESSED,
			  end_of_rlco_buf))
		return false;

	if (components_num >= 3) {
		u32 h = height;
		u32 w = width;
		u32 c = coded_width;

		if (!(hdr_flags & FWHT_FL_CHROMA_FULL_HEIGHT))
			h /= 2;
		if (!(hdr_flags & FWHT_FL_CHROMA_FULL_WIDTH)) {
			w /= 2;
			c /= 2;
		}
		if (!decode_plane(cf, &rlco, ref->cb, h, w, c,
				  hdr_flags & FWHT_FL_CB_IS_UNCOMPRESSED,
				  end_of_rlco_buf))
			return false;
		if (!decode_plane(cf, &rlco, ref->cr, h, w, c,
				  hdr_flags & FWHT_FL_CR_IS_UNCOMPRESSED,
				  end_of_rlco_buf))
			return false;
	}

	if (components_num == 4)
		if (!decode_plane(cf, &rlco, ref->alpha, height, width,
				  coded_width,
				  hdr_flags & FWHT_FL_ALPHA_IS_UNCOMPRESSED,
				  end_of_rlco_buf))
			return false;
	return true;
}