/*
 * (extraction artifact — gitweb page header, kept for provenance)
 * blk-mq: always free hctx after request queue is freed
 * [linux/fpc-iii.git] / drivers / media / platform / vicodec / codec-fwht.c
 * blob d1d6085da9f1de33c7a2665bc45c3b2543bd79cc
 */
// SPDX-License-Identifier: LGPL-2.1+
/*
 * Copyright 2016 Tom aan de Wiel
 * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 *
 * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
 *
 * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
 * R.D. Brown, 1977
 */
12 #include <linux/string.h>
13 #include <linux/kernel.h>
14 #include "codec-fwht.h"
/* Set in the derlc() return value when the input buffer was exhausted */
#define OVERFLOW_BIT BIT(14)

/*
 * Note: bit 0 of the header must always be 0. Otherwise it cannot
 * be guaranteed that the magic 8 byte sequence (see below) can
 * never occur in the rlc output.
 */
#define PFRAME_BIT BIT(15)
/* Bits 1-12 of a block header count duplicates of the previous block */
#define DUPS_MASK 0x1ffe

#define PBLOCK 0
#define IBLOCK 1

/* Run-length value meaning "all remaining coefficients are zero" */
#define ALL_ZEROS 15
/*
 * Mapping from sequency scan position to the 8x8 block index
 * (row-major). Must be a permutation of 0..63; the leading 0 entry
 * was lost in extraction and is restored here.
 */
static const uint8_t zigzag[64] = {
	0,
	1, 8,
	2, 9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
50 static int rlc(const s16 *in, __be16 *output, int blocktype)
52 s16 block[8 * 8];
53 s16 *wp = block;
54 int i = 0;
55 int x, y;
56 int ret = 0;
58 /* read in block from framebuffer */
59 int lastzero_run = 0;
60 int to_encode;
62 for (y = 0; y < 8; y++) {
63 for (x = 0; x < 8; x++) {
64 *wp = in[x + y * 8];
65 wp++;
69 /* keep track of amount of trailing zeros */
70 for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
71 lastzero_run++;
73 *output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
74 ret++;
76 to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);
78 i = 0;
79 while (i < to_encode) {
80 int cnt = 0;
81 int tmp;
83 /* count leading zeros */
84 while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
85 cnt++;
86 i++;
87 if (i == to_encode) {
88 cnt--;
89 break;
92 /* 4 bits for run, 12 for coefficient (quantization by 4) */
93 *output++ = htons((cnt | tmp << 4));
94 i++;
95 ret++;
97 if (lastzero_run > 14) {
98 *output = htons(ALL_ZEROS | 0);
99 ret++;
102 return ret;
106 * This function will worst-case increase rlc_in by 65*2 bytes:
107 * one s16 value for the header and 8 * 8 coefficients of type s16.
109 static u16 derlc(const __be16 **rlc_in, s16 *dwht_out,
110 const __be16 *end_of_input)
112 /* header */
113 const __be16 *input = *rlc_in;
114 u16 stat;
115 int dec_count = 0;
116 s16 block[8 * 8 + 16];
117 s16 *wp = block;
118 int i;
120 if (input > end_of_input)
121 return OVERFLOW_BIT;
122 stat = ntohs(*input++);
125 * Now de-compress, it expands one byte to up to 15 bytes
126 * (or fills the remainder of the 64 bytes with zeroes if it
127 * is the last byte to expand).
129 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
130 * allow for overflow if the incoming data was malformed.
132 while (dec_count < 8 * 8) {
133 s16 in;
134 int length;
135 int coeff;
137 if (input > end_of_input)
138 return OVERFLOW_BIT;
139 in = ntohs(*input++);
140 length = in & 0xf;
141 coeff = in >> 4;
143 /* fill remainder with zeros */
144 if (length == 15) {
145 for (i = 0; i < 64 - dec_count; i++)
146 *wp++ = 0;
147 break;
150 for (i = 0; i < length; i++)
151 *wp++ = 0;
152 *wp++ = coeff;
153 dec_count += length + 1;
156 wp = block;
158 for (i = 0; i < 64; i++) {
159 int pos = zigzag[i];
160 int y = pos / 8;
161 int x = pos % 8;
163 dwht_out[x + y * 8] = *wp++;
165 *rlc_in = input;
166 return stat;
/* Per-coefficient quantization shifts for intra (I) blocks */
static const int quant_table[] = {
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  3,
	2, 2, 2, 2, 2, 2,  3,  6,
	2, 2, 2, 2, 2, 3,  6,  6,
	2, 2, 2, 2, 3, 6,  6,  6,
	2, 2, 2, 3, 6, 6,  6,  6,
	2, 2, 3, 6, 6, 6,  6,  8,
};
/* Per-coefficient quantization shifts for predicted (P) blocks */
static const int quant_table_p[] = {
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  6,
	3, 3, 3, 3, 3, 3,  6,  6,
	3, 3, 3, 3, 3, 6,  6,  9,
	3, 3, 3, 3, 6, 6,  9,  9,
	3, 3, 3, 6, 6, 9,  9, 10,
};
191 static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp)
193 const int *quant = quant_table;
194 int i, j;
196 for (j = 0; j < 8; j++) {
197 for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
198 *coeff >>= *quant;
199 if (*coeff >= -qp && *coeff <= qp)
200 *coeff = *de_coeff = 0;
201 else
202 *de_coeff = *coeff << *quant;
207 static void dequantize_intra(s16 *coeff)
209 const int *quant = quant_table;
210 int i, j;
212 for (j = 0; j < 8; j++)
213 for (i = 0; i < 8; i++, quant++, coeff++)
214 *coeff <<= *quant;
217 static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp)
219 const int *quant = quant_table_p;
220 int i, j;
222 for (j = 0; j < 8; j++) {
223 for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
224 *coeff >>= *quant;
225 if (*coeff >= -qp && *coeff <= qp)
226 *coeff = *de_coeff = 0;
227 else
228 *de_coeff = *coeff << *quant;
233 static void dequantize_inter(s16 *coeff)
235 const int *quant = quant_table_p;
236 int i, j;
238 for (j = 0; j < 8; j++)
239 for (i = 0; i < 8; i++, quant++, coeff++)
240 *coeff <<= *quant;
243 static void fwht(const u8 *block, s16 *output_block, unsigned int stride,
244 unsigned int input_step, bool intra)
246 /* we'll need more than 8 bits for the transformed coefficients */
247 s32 workspace1[8], workspace2[8];
248 const u8 *tmp = block;
249 s16 *out = output_block;
250 int add = intra ? 256 : 0;
251 unsigned int i;
253 /* stage 1 */
254 for (i = 0; i < 8; i++, tmp += stride, out += 8) {
255 switch (input_step) {
256 case 1:
257 workspace1[0] = tmp[0] + tmp[1] - add;
258 workspace1[1] = tmp[0] - tmp[1];
260 workspace1[2] = tmp[2] + tmp[3] - add;
261 workspace1[3] = tmp[2] - tmp[3];
263 workspace1[4] = tmp[4] + tmp[5] - add;
264 workspace1[5] = tmp[4] - tmp[5];
266 workspace1[6] = tmp[6] + tmp[7] - add;
267 workspace1[7] = tmp[6] - tmp[7];
268 break;
269 case 2:
270 workspace1[0] = tmp[0] + tmp[2] - add;
271 workspace1[1] = tmp[0] - tmp[2];
273 workspace1[2] = tmp[4] + tmp[6] - add;
274 workspace1[3] = tmp[4] - tmp[6];
276 workspace1[4] = tmp[8] + tmp[10] - add;
277 workspace1[5] = tmp[8] - tmp[10];
279 workspace1[6] = tmp[12] + tmp[14] - add;
280 workspace1[7] = tmp[12] - tmp[14];
281 break;
282 case 3:
283 workspace1[0] = tmp[0] + tmp[3] - add;
284 workspace1[1] = tmp[0] - tmp[3];
286 workspace1[2] = tmp[6] + tmp[9] - add;
287 workspace1[3] = tmp[6] - tmp[9];
289 workspace1[4] = tmp[12] + tmp[15] - add;
290 workspace1[5] = tmp[12] - tmp[15];
292 workspace1[6] = tmp[18] + tmp[21] - add;
293 workspace1[7] = tmp[18] - tmp[21];
294 break;
295 default:
296 workspace1[0] = tmp[0] + tmp[4] - add;
297 workspace1[1] = tmp[0] - tmp[4];
299 workspace1[2] = tmp[8] + tmp[12] - add;
300 workspace1[3] = tmp[8] - tmp[12];
302 workspace1[4] = tmp[16] + tmp[20] - add;
303 workspace1[5] = tmp[16] - tmp[20];
305 workspace1[6] = tmp[24] + tmp[28] - add;
306 workspace1[7] = tmp[24] - tmp[28];
307 break;
310 /* stage 2 */
311 workspace2[0] = workspace1[0] + workspace1[2];
312 workspace2[1] = workspace1[0] - workspace1[2];
313 workspace2[2] = workspace1[1] - workspace1[3];
314 workspace2[3] = workspace1[1] + workspace1[3];
316 workspace2[4] = workspace1[4] + workspace1[6];
317 workspace2[5] = workspace1[4] - workspace1[6];
318 workspace2[6] = workspace1[5] - workspace1[7];
319 workspace2[7] = workspace1[5] + workspace1[7];
321 /* stage 3 */
322 out[0] = workspace2[0] + workspace2[4];
323 out[1] = workspace2[0] - workspace2[4];
324 out[2] = workspace2[1] - workspace2[5];
325 out[3] = workspace2[1] + workspace2[5];
326 out[4] = workspace2[2] + workspace2[6];
327 out[5] = workspace2[2] - workspace2[6];
328 out[6] = workspace2[3] - workspace2[7];
329 out[7] = workspace2[3] + workspace2[7];
332 out = output_block;
334 for (i = 0; i < 8; i++, out++) {
335 /* stage 1 */
336 workspace1[0] = out[0] + out[1 * 8];
337 workspace1[1] = out[0] - out[1 * 8];
339 workspace1[2] = out[2 * 8] + out[3 * 8];
340 workspace1[3] = out[2 * 8] - out[3 * 8];
342 workspace1[4] = out[4 * 8] + out[5 * 8];
343 workspace1[5] = out[4 * 8] - out[5 * 8];
345 workspace1[6] = out[6 * 8] + out[7 * 8];
346 workspace1[7] = out[6 * 8] - out[7 * 8];
348 /* stage 2 */
349 workspace2[0] = workspace1[0] + workspace1[2];
350 workspace2[1] = workspace1[0] - workspace1[2];
351 workspace2[2] = workspace1[1] - workspace1[3];
352 workspace2[3] = workspace1[1] + workspace1[3];
354 workspace2[4] = workspace1[4] + workspace1[6];
355 workspace2[5] = workspace1[4] - workspace1[6];
356 workspace2[6] = workspace1[5] - workspace1[7];
357 workspace2[7] = workspace1[5] + workspace1[7];
358 /* stage 3 */
359 out[0 * 8] = workspace2[0] + workspace2[4];
360 out[1 * 8] = workspace2[0] - workspace2[4];
361 out[2 * 8] = workspace2[1] - workspace2[5];
362 out[3 * 8] = workspace2[1] + workspace2[5];
363 out[4 * 8] = workspace2[2] + workspace2[6];
364 out[5 * 8] = workspace2[2] - workspace2[6];
365 out[6 * 8] = workspace2[3] - workspace2[7];
366 out[7 * 8] = workspace2[3] + workspace2[7];
371 * Not the nicest way of doing it, but P-blocks get twice the range of
372 * that of the I-blocks. Therefore we need a type bigger than 8 bits.
373 * Furthermore values can be negative... This is just a version that
374 * works with 16 signed data
376 static void fwht16(const s16 *block, s16 *output_block, int stride, int intra)
378 /* we'll need more than 8 bits for the transformed coefficients */
379 s32 workspace1[8], workspace2[8];
380 const s16 *tmp = block;
381 s16 *out = output_block;
382 int i;
384 for (i = 0; i < 8; i++, tmp += stride, out += 8) {
385 /* stage 1 */
386 workspace1[0] = tmp[0] + tmp[1];
387 workspace1[1] = tmp[0] - tmp[1];
389 workspace1[2] = tmp[2] + tmp[3];
390 workspace1[3] = tmp[2] - tmp[3];
392 workspace1[4] = tmp[4] + tmp[5];
393 workspace1[5] = tmp[4] - tmp[5];
395 workspace1[6] = tmp[6] + tmp[7];
396 workspace1[7] = tmp[6] - tmp[7];
398 /* stage 2 */
399 workspace2[0] = workspace1[0] + workspace1[2];
400 workspace2[1] = workspace1[0] - workspace1[2];
401 workspace2[2] = workspace1[1] - workspace1[3];
402 workspace2[3] = workspace1[1] + workspace1[3];
404 workspace2[4] = workspace1[4] + workspace1[6];
405 workspace2[5] = workspace1[4] - workspace1[6];
406 workspace2[6] = workspace1[5] - workspace1[7];
407 workspace2[7] = workspace1[5] + workspace1[7];
409 /* stage 3 */
410 out[0] = workspace2[0] + workspace2[4];
411 out[1] = workspace2[0] - workspace2[4];
412 out[2] = workspace2[1] - workspace2[5];
413 out[3] = workspace2[1] + workspace2[5];
414 out[4] = workspace2[2] + workspace2[6];
415 out[5] = workspace2[2] - workspace2[6];
416 out[6] = workspace2[3] - workspace2[7];
417 out[7] = workspace2[3] + workspace2[7];
420 out = output_block;
422 for (i = 0; i < 8; i++, out++) {
423 /* stage 1 */
424 workspace1[0] = out[0] + out[1*8];
425 workspace1[1] = out[0] - out[1*8];
427 workspace1[2] = out[2*8] + out[3*8];
428 workspace1[3] = out[2*8] - out[3*8];
430 workspace1[4] = out[4*8] + out[5*8];
431 workspace1[5] = out[4*8] - out[5*8];
433 workspace1[6] = out[6*8] + out[7*8];
434 workspace1[7] = out[6*8] - out[7*8];
436 /* stage 2 */
437 workspace2[0] = workspace1[0] + workspace1[2];
438 workspace2[1] = workspace1[0] - workspace1[2];
439 workspace2[2] = workspace1[1] - workspace1[3];
440 workspace2[3] = workspace1[1] + workspace1[3];
442 workspace2[4] = workspace1[4] + workspace1[6];
443 workspace2[5] = workspace1[4] - workspace1[6];
444 workspace2[6] = workspace1[5] - workspace1[7];
445 workspace2[7] = workspace1[5] + workspace1[7];
447 /* stage 3 */
448 out[0*8] = workspace2[0] + workspace2[4];
449 out[1*8] = workspace2[0] - workspace2[4];
450 out[2*8] = workspace2[1] - workspace2[5];
451 out[3*8] = workspace2[1] + workspace2[5];
452 out[4*8] = workspace2[2] + workspace2[6];
453 out[5*8] = workspace2[2] - workspace2[6];
454 out[6*8] = workspace2[3] - workspace2[7];
455 out[7*8] = workspace2[3] + workspace2[7];
459 static void ifwht(const s16 *block, s16 *output_block, int intra)
462 * we'll need more than 8 bits for the transformed coefficients
463 * use native unit of cpu
465 int workspace1[8], workspace2[8];
466 int inter = intra ? 0 : 1;
467 const s16 *tmp = block;
468 s16 *out = output_block;
469 int i;
471 for (i = 0; i < 8; i++, tmp += 8, out += 8) {
472 /* stage 1 */
473 workspace1[0] = tmp[0] + tmp[1];
474 workspace1[1] = tmp[0] - tmp[1];
476 workspace1[2] = tmp[2] + tmp[3];
477 workspace1[3] = tmp[2] - tmp[3];
479 workspace1[4] = tmp[4] + tmp[5];
480 workspace1[5] = tmp[4] - tmp[5];
482 workspace1[6] = tmp[6] + tmp[7];
483 workspace1[7] = tmp[6] - tmp[7];
485 /* stage 2 */
486 workspace2[0] = workspace1[0] + workspace1[2];
487 workspace2[1] = workspace1[0] - workspace1[2];
488 workspace2[2] = workspace1[1] - workspace1[3];
489 workspace2[3] = workspace1[1] + workspace1[3];
491 workspace2[4] = workspace1[4] + workspace1[6];
492 workspace2[5] = workspace1[4] - workspace1[6];
493 workspace2[6] = workspace1[5] - workspace1[7];
494 workspace2[7] = workspace1[5] + workspace1[7];
496 /* stage 3 */
497 out[0] = workspace2[0] + workspace2[4];
498 out[1] = workspace2[0] - workspace2[4];
499 out[2] = workspace2[1] - workspace2[5];
500 out[3] = workspace2[1] + workspace2[5];
501 out[4] = workspace2[2] + workspace2[6];
502 out[5] = workspace2[2] - workspace2[6];
503 out[6] = workspace2[3] - workspace2[7];
504 out[7] = workspace2[3] + workspace2[7];
507 out = output_block;
509 for (i = 0; i < 8; i++, out++) {
510 /* stage 1 */
511 workspace1[0] = out[0] + out[1 * 8];
512 workspace1[1] = out[0] - out[1 * 8];
514 workspace1[2] = out[2 * 8] + out[3 * 8];
515 workspace1[3] = out[2 * 8] - out[3 * 8];
517 workspace1[4] = out[4 * 8] + out[5 * 8];
518 workspace1[5] = out[4 * 8] - out[5 * 8];
520 workspace1[6] = out[6 * 8] + out[7 * 8];
521 workspace1[7] = out[6 * 8] - out[7 * 8];
523 /* stage 2 */
524 workspace2[0] = workspace1[0] + workspace1[2];
525 workspace2[1] = workspace1[0] - workspace1[2];
526 workspace2[2] = workspace1[1] - workspace1[3];
527 workspace2[3] = workspace1[1] + workspace1[3];
529 workspace2[4] = workspace1[4] + workspace1[6];
530 workspace2[5] = workspace1[4] - workspace1[6];
531 workspace2[6] = workspace1[5] - workspace1[7];
532 workspace2[7] = workspace1[5] + workspace1[7];
534 /* stage 3 */
535 if (inter) {
536 int d;
538 out[0 * 8] = workspace2[0] + workspace2[4];
539 out[1 * 8] = workspace2[0] - workspace2[4];
540 out[2 * 8] = workspace2[1] - workspace2[5];
541 out[3 * 8] = workspace2[1] + workspace2[5];
542 out[4 * 8] = workspace2[2] + workspace2[6];
543 out[5 * 8] = workspace2[2] - workspace2[6];
544 out[6 * 8] = workspace2[3] - workspace2[7];
545 out[7 * 8] = workspace2[3] + workspace2[7];
547 for (d = 0; d < 8; d++)
548 out[8 * d] >>= 6;
549 } else {
550 int d;
552 out[0 * 8] = workspace2[0] + workspace2[4];
553 out[1 * 8] = workspace2[0] - workspace2[4];
554 out[2 * 8] = workspace2[1] - workspace2[5];
555 out[3 * 8] = workspace2[1] + workspace2[5];
556 out[4 * 8] = workspace2[2] + workspace2[6];
557 out[5 * 8] = workspace2[2] - workspace2[6];
558 out[6 * 8] = workspace2[3] - workspace2[7];
559 out[7 * 8] = workspace2[3] + workspace2[7];
561 for (d = 0; d < 8; d++) {
562 out[8 * d] >>= 6;
563 out[8 * d] += 128;
569 static void fill_encoder_block(const u8 *input, s16 *dst,
570 unsigned int stride, unsigned int input_step)
572 int i, j;
574 for (i = 0; i < 8; i++) {
575 for (j = 0; j < 8; j++, input += input_step)
576 *dst++ = *input;
577 input += stride - 8 * input_step;
581 static int var_intra(const s16 *input)
583 int32_t mean = 0;
584 int32_t ret = 0;
585 const s16 *tmp = input;
586 int i;
588 for (i = 0; i < 8 * 8; i++, tmp++)
589 mean += *tmp;
590 mean /= 64;
591 tmp = input;
592 for (i = 0; i < 8 * 8; i++, tmp++)
593 ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
594 return ret;
597 static int var_inter(const s16 *old, const s16 *new)
599 int32_t ret = 0;
600 int i;
602 for (i = 0; i < 8 * 8; i++, old++, new++)
603 ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
604 return ret;
607 static int decide_blocktype(const u8 *cur, const u8 *reference,
608 s16 *deltablock, unsigned int stride,
609 unsigned int input_step)
611 s16 tmp[64];
612 s16 old[64];
613 s16 *work = tmp;
614 unsigned int k, l;
615 int vari;
616 int vard;
618 fill_encoder_block(cur, tmp, stride, input_step);
619 fill_encoder_block(reference, old, 8, 1);
620 vari = var_intra(tmp);
622 for (k = 0; k < 8; k++) {
623 for (l = 0; l < 8; l++) {
624 *deltablock = *work - *reference;
625 deltablock++;
626 work++;
627 reference++;
630 deltablock -= 64;
631 vard = var_inter(old, tmp);
632 return vari <= vard ? IBLOCK : PBLOCK;
635 static void fill_decoder_block(u8 *dst, const s16 *input, int stride)
637 int i, j;
639 for (i = 0; i < 8; i++) {
640 for (j = 0; j < 8; j++, input++, dst++) {
641 if (*input < 0)
642 *dst = 0;
643 else if (*input > 255)
644 *dst = 255;
645 else
646 *dst = *input;
648 dst += stride - 8;
652 static void add_deltas(s16 *deltas, const u8 *ref, int stride)
654 int k, l;
656 for (k = 0; k < 8; k++) {
657 for (l = 0; l < 8; l++) {
658 *deltas += *ref++;
660 * Due to quantizing, it might possible that the
661 * decoded coefficients are slightly out of range
663 if (*deltas < 0)
664 *deltas = 0;
665 else if (*deltas > 255)
666 *deltas = 255;
667 deltas++;
669 ref += stride - 8;
673 static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
674 struct fwht_cframe *cf, u32 height, u32 width,
675 u32 stride, unsigned int input_step,
676 bool is_intra, bool next_is_intra)
678 u8 *input_start = input;
679 __be16 *rlco_start = *rlco;
680 s16 deltablock[64];
681 __be16 pframe_bit = htons(PFRAME_BIT);
682 u32 encoding = 0;
683 unsigned int last_size = 0;
684 unsigned int i, j;
686 width = round_up(width, 8);
687 height = round_up(height, 8);
689 for (j = 0; j < height / 8; j++) {
690 input = input_start + j * 8 * stride;
691 for (i = 0; i < width / 8; i++) {
692 /* intra code, first frame is always intra coded. */
693 int blocktype = IBLOCK;
694 unsigned int size;
696 if (!is_intra)
697 blocktype = decide_blocktype(input, refp,
698 deltablock, stride, input_step);
699 if (blocktype == IBLOCK) {
700 fwht(input, cf->coeffs, stride, input_step, 1);
701 quantize_intra(cf->coeffs, cf->de_coeffs,
702 cf->i_frame_qp);
703 } else {
704 /* inter code */
705 encoding |= FWHT_FRAME_PCODED;
706 fwht16(deltablock, cf->coeffs, 8, 0);
707 quantize_inter(cf->coeffs, cf->de_coeffs,
708 cf->p_frame_qp);
710 if (!next_is_intra) {
711 ifwht(cf->de_coeffs, cf->de_fwht, blocktype);
713 if (blocktype == PBLOCK)
714 add_deltas(cf->de_fwht, refp, 8);
715 fill_decoder_block(refp, cf->de_fwht, 8);
718 input += 8 * input_step;
719 refp += 8 * 8;
721 size = rlc(cf->coeffs, *rlco, blocktype);
722 if (last_size == size &&
723 !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
724 __be16 *last_rlco = *rlco - size;
725 s16 hdr = ntohs(*last_rlco);
727 if (!((*last_rlco ^ **rlco) & pframe_bit) &&
728 (hdr & DUPS_MASK) < DUPS_MASK)
729 *last_rlco = htons(hdr + 2);
730 else
731 *rlco += size;
732 } else {
733 *rlco += size;
735 if (*rlco >= rlco_max) {
736 encoding |= FWHT_FRAME_UNENCODED;
737 goto exit_loop;
739 last_size = size;
743 exit_loop:
744 if (encoding & FWHT_FRAME_UNENCODED) {
745 u8 *out = (u8 *)rlco_start;
746 u8 *p;
748 input = input_start;
750 * The compressed stream should never contain the magic
751 * header, so when we copy the YUV data we replace 0xff
752 * by 0xfe. Since YUV is limited range such values
753 * shouldn't appear anyway.
755 for (j = 0; j < height; j++) {
756 for (i = 0, p = input; i < width; i++, p += input_step)
757 *out++ = (*p == 0xff) ? 0xfe : *p;
758 input += stride;
760 *rlco = (__be16 *)out;
761 encoding &= ~FWHT_FRAME_PCODED;
763 return encoding;
766 u32 fwht_encode_frame(struct fwht_raw_frame *frm,
767 struct fwht_raw_frame *ref_frm,
768 struct fwht_cframe *cf,
769 bool is_intra, bool next_is_intra,
770 unsigned int width, unsigned int height,
771 unsigned int stride, unsigned int chroma_stride)
773 unsigned int size = height * width;
774 __be16 *rlco = cf->rlc_data;
775 __be16 *rlco_max;
776 u32 encoding;
778 rlco_max = rlco + size / 2 - 256;
779 encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
780 height, width, stride,
781 frm->luma_alpha_step, is_intra, next_is_intra);
782 if (encoding & FWHT_FRAME_UNENCODED)
783 encoding |= FWHT_LUMA_UNENCODED;
784 encoding &= ~FWHT_FRAME_UNENCODED;
786 if (frm->components_num >= 3) {
787 u32 chroma_h = height / frm->height_div;
788 u32 chroma_w = width / frm->width_div;
789 unsigned int chroma_size = chroma_h * chroma_w;
791 rlco_max = rlco + chroma_size / 2 - 256;
792 encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max,
793 cf, chroma_h, chroma_w,
794 chroma_stride, frm->chroma_step,
795 is_intra, next_is_intra);
796 if (encoding & FWHT_FRAME_UNENCODED)
797 encoding |= FWHT_CB_UNENCODED;
798 encoding &= ~FWHT_FRAME_UNENCODED;
799 rlco_max = rlco + chroma_size / 2 - 256;
800 encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max,
801 cf, chroma_h, chroma_w,
802 chroma_stride, frm->chroma_step,
803 is_intra, next_is_intra);
804 if (encoding & FWHT_FRAME_UNENCODED)
805 encoding |= FWHT_CR_UNENCODED;
806 encoding &= ~FWHT_FRAME_UNENCODED;
809 if (frm->components_num == 4) {
810 rlco_max = rlco + size / 2 - 256;
811 encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco,
812 rlco_max, cf, height, width,
813 stride, frm->luma_alpha_step,
814 is_intra, next_is_intra);
815 if (encoding & FWHT_FRAME_UNENCODED)
816 encoding |= FWHT_ALPHA_UNENCODED;
817 encoding &= ~FWHT_FRAME_UNENCODED;
820 cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
821 return encoding;
824 static bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco, u8 *ref,
825 u32 height, u32 width, u32 coded_width,
826 bool uncompressed, const __be16 *end_of_rlco_buf)
828 unsigned int copies = 0;
829 s16 copy[8 * 8];
830 u16 stat;
831 unsigned int i, j;
833 width = round_up(width, 8);
834 height = round_up(height, 8);
836 if (uncompressed) {
837 if (end_of_rlco_buf + 1 < *rlco + width * height / 2)
838 return false;
839 memcpy(ref, *rlco, width * height);
840 *rlco += width * height / 2;
841 return true;
845 * When decoding each macroblock the rlco pointer will be increased
846 * by 65 * 2 bytes worst-case.
847 * To avoid overflow the buffer has to be 65/64th of the actual raw
848 * image size, just in case someone feeds it malicious data.
850 for (j = 0; j < height / 8; j++) {
851 for (i = 0; i < width / 8; i++) {
852 u8 *refp = ref + j * 8 * coded_width + i * 8;
854 if (copies) {
855 memcpy(cf->de_fwht, copy, sizeof(copy));
856 if (stat & PFRAME_BIT)
857 add_deltas(cf->de_fwht, refp,
858 coded_width);
859 fill_decoder_block(refp, cf->de_fwht,
860 coded_width);
861 copies--;
862 continue;
865 stat = derlc(rlco, cf->coeffs, end_of_rlco_buf);
866 if (stat & OVERFLOW_BIT)
867 return false;
868 if (stat & PFRAME_BIT)
869 dequantize_inter(cf->coeffs);
870 else
871 dequantize_intra(cf->coeffs);
873 ifwht(cf->coeffs, cf->de_fwht,
874 (stat & PFRAME_BIT) ? 0 : 1);
876 copies = (stat & DUPS_MASK) >> 1;
877 if (copies)
878 memcpy(copy, cf->de_fwht, sizeof(copy));
879 if (stat & PFRAME_BIT)
880 add_deltas(cf->de_fwht, refp, coded_width);
881 fill_decoder_block(refp, cf->de_fwht, coded_width);
884 return true;
887 bool fwht_decode_frame(struct fwht_cframe *cf, struct fwht_raw_frame *ref,
888 u32 hdr_flags, unsigned int components_num,
889 unsigned int width, unsigned int height,
890 unsigned int coded_width)
892 const __be16 *rlco = cf->rlc_data;
893 const __be16 *end_of_rlco_buf = cf->rlc_data +
894 (cf->size / sizeof(*rlco)) - 1;
896 if (!decode_plane(cf, &rlco, ref->luma, height, width, coded_width,
897 hdr_flags & FWHT_FL_LUMA_IS_UNCOMPRESSED,
898 end_of_rlco_buf))
899 return false;
901 if (components_num >= 3) {
902 u32 h = height;
903 u32 w = width;
904 u32 c = coded_width;
906 if (!(hdr_flags & FWHT_FL_CHROMA_FULL_HEIGHT))
907 h /= 2;
908 if (!(hdr_flags & FWHT_FL_CHROMA_FULL_WIDTH)) {
909 w /= 2;
910 c /= 2;
912 if (!decode_plane(cf, &rlco, ref->cb, h, w, c,
913 hdr_flags & FWHT_FL_CB_IS_UNCOMPRESSED,
914 end_of_rlco_buf))
915 return false;
916 if (!decode_plane(cf, &rlco, ref->cr, h, w, c,
917 hdr_flags & FWHT_FL_CR_IS_UNCOMPRESSED,
918 end_of_rlco_buf))
919 return false;
922 if (components_num == 4)
923 if (!decode_plane(cf, &rlco, ref->alpha, height, width,
924 coded_width,
925 hdr_flags & FWHT_FL_ALPHA_IS_UNCOMPRESSED,
926 end_of_rlco_buf))
927 return false;
928 return true;