4 * Copyright (C) 1994-1998, Thomas G. Lane.
5 * Modified 2010-2015 by Guido Vollbeding.
6 * This file is part of the Independent JPEG Group's software.
7 * For conditions of distribution and use, see the accompanying README file.
9 * This file contains a floating-point implementation of the
10 * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
11 * must also perform dequantization of the input coefficients.
13 * This implementation should be more accurate than either of the integer
14 * IDCT implementations. However, it may not give the same results on all
15 * machines because of differences in roundoff behavior. Speed will depend
16 * on the hardware's floating point capacity.
18 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
19 * on each row (or vice versa, but it's more convenient to emit a row at
20 * a time). Direct algorithms are also available, but they are much more
21 * complex and seem not to be any faster when reduced to code.
23 * This implementation is based on Arai, Agui, and Nakajima's algorithm for
24 * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
25 * Japanese, but the algorithm is described in the Pennebaker & Mitchell
26 * JPEG textbook (see REFERENCES section in file README). The following code
27 * is based directly on figure 4-8 in P&M.
28 * While an 8-point DCT cannot be done in less than 11 multiplies, it is
29 * possible to arrange the computation so that many of the multiplies are
30 * simple scalings of the final outputs. These multiplies can then be
31 * folded into the multiplications or divisions by the JPEG quantization
32 * table entries. The AA&N method leaves only 5 multiplies and 29 adds
33 * to be done in the DCT itself.
34 * The primary disadvantage of this method is that with a fixed-point
35 * implementation, accuracy is lost due to imprecise representation of the
36 * scaled quantization values. However, that problem does not arise if
37 * we use floating point arithmetic.
40 #define JPEG_INTERNALS
43 #include "jdct.h" /* Private declarations for DCT subsystem */
45 #ifdef DCT_FLOAT_SUPPORTED
49 * This module is specialized to the case DCTSIZE = 8.
53 Sorry
, this code only copes with
8x8 DCTs
. /* deliberate syntax err */
57 /* Dequantize a coefficient by multiplying it by the multiplier-table
58 * entry; produce a float result.
61 #define DEQUANTIZE(coef,quantval) (((FAST_FLOAT) (coef)) * (quantval))
65 * Perform dequantization and inverse DCT on one block of coefficients.
67 * cK represents cos(K*pi/16).
71 jpeg_idct_float (j_decompress_ptr cinfo
, jpeg_component_info
* compptr
,
73 JSAMPARRAY output_buf
, JDIMENSION output_col
)
75 FAST_FLOAT tmp0
, tmp1
, tmp2
, tmp3
, tmp4
, tmp5
, tmp6
, tmp7
;
76 FAST_FLOAT tmp10
, tmp11
, tmp12
, tmp13
;
77 FAST_FLOAT z5
, z10
, z11
, z12
, z13
;
79 FLOAT_MULT_TYPE
* quantptr
;
82 JSAMPLE
*range_limit
= IDCT_range_limit(cinfo
);
84 FAST_FLOAT workspace
[DCTSIZE2
]; /* buffers data between passes */
86 /* Pass 1: process columns from input, store into work array. */
89 quantptr
= (FLOAT_MULT_TYPE
*) compptr
->dct_table
;
91 for (ctr
= DCTSIZE
; ctr
> 0; ctr
--) {
92 /* Due to quantization, we will usually find that many of the input
93 * coefficients are zero, especially the AC terms. We can exploit this
94 * by short-circuiting the IDCT calculation for any column in which all
95 * the AC terms are zero. In that case each output is equal to the
96 * DC coefficient (with scale factor as needed).
97 * With typical images and quantization tables, half or more of the
98 * column DCT calculations can be simplified this way.
101 if (inptr
[DCTSIZE
*1] == 0 && inptr
[DCTSIZE
*2] == 0 &&
102 inptr
[DCTSIZE
*3] == 0 && inptr
[DCTSIZE
*4] == 0 &&
103 inptr
[DCTSIZE
*5] == 0 && inptr
[DCTSIZE
*6] == 0 &&
104 inptr
[DCTSIZE
*7] == 0) {
105 /* AC terms all zero */
106 FAST_FLOAT dcval
= DEQUANTIZE(inptr
[DCTSIZE
*0], quantptr
[DCTSIZE
*0]);
108 wsptr
[DCTSIZE
*0] = dcval
;
109 wsptr
[DCTSIZE
*1] = dcval
;
110 wsptr
[DCTSIZE
*2] = dcval
;
111 wsptr
[DCTSIZE
*3] = dcval
;
112 wsptr
[DCTSIZE
*4] = dcval
;
113 wsptr
[DCTSIZE
*5] = dcval
;
114 wsptr
[DCTSIZE
*6] = dcval
;
115 wsptr
[DCTSIZE
*7] = dcval
;
117 inptr
++; /* advance pointers to next column */
125 tmp0
= DEQUANTIZE(inptr
[DCTSIZE
*0], quantptr
[DCTSIZE
*0]);
126 tmp1
= DEQUANTIZE(inptr
[DCTSIZE
*2], quantptr
[DCTSIZE
*2]);
127 tmp2
= DEQUANTIZE(inptr
[DCTSIZE
*4], quantptr
[DCTSIZE
*4]);
128 tmp3
= DEQUANTIZE(inptr
[DCTSIZE
*6], quantptr
[DCTSIZE
*6]);
130 tmp10
= tmp0
+ tmp2
; /* phase 3 */
133 tmp13
= tmp1
+ tmp3
; /* phases 5-3 */
134 tmp12
= (tmp1
- tmp3
) * ((FAST_FLOAT
) 1.414213562) - tmp13
; /* 2*c4 */
136 tmp0
= tmp10
+ tmp13
; /* phase 2 */
137 tmp3
= tmp10
- tmp13
;
138 tmp1
= tmp11
+ tmp12
;
139 tmp2
= tmp11
- tmp12
;
143 tmp4
= DEQUANTIZE(inptr
[DCTSIZE
*1], quantptr
[DCTSIZE
*1]);
144 tmp5
= DEQUANTIZE(inptr
[DCTSIZE
*3], quantptr
[DCTSIZE
*3]);
145 tmp6
= DEQUANTIZE(inptr
[DCTSIZE
*5], quantptr
[DCTSIZE
*5]);
146 tmp7
= DEQUANTIZE(inptr
[DCTSIZE
*7], quantptr
[DCTSIZE
*7]);
148 z13
= tmp6
+ tmp5
; /* phase 6 */
153 tmp7
= z11
+ z13
; /* phase 5 */
154 tmp11
= (z11
- z13
) * ((FAST_FLOAT
) 1.414213562); /* 2*c4 */
156 z5
= (z10
+ z12
) * ((FAST_FLOAT
) 1.847759065); /* 2*c2 */
157 tmp10
= z5
- z12
* ((FAST_FLOAT
) 1.082392200); /* 2*(c2-c6) */
158 tmp12
= z5
- z10
* ((FAST_FLOAT
) 2.613125930); /* 2*(c2+c6) */
160 tmp6
= tmp12
- tmp7
; /* phase 2 */
164 wsptr
[DCTSIZE
*0] = tmp0
+ tmp7
;
165 wsptr
[DCTSIZE
*7] = tmp0
- tmp7
;
166 wsptr
[DCTSIZE
*1] = tmp1
+ tmp6
;
167 wsptr
[DCTSIZE
*6] = tmp1
- tmp6
;
168 wsptr
[DCTSIZE
*2] = tmp2
+ tmp5
;
169 wsptr
[DCTSIZE
*5] = tmp2
- tmp5
;
170 wsptr
[DCTSIZE
*3] = tmp3
+ tmp4
;
171 wsptr
[DCTSIZE
*4] = tmp3
- tmp4
;
173 inptr
++; /* advance pointers to next column */
178 /* Pass 2: process rows from work array, store into output array. */
181 for (ctr
= 0; ctr
< DCTSIZE
; ctr
++) {
182 outptr
= output_buf
[ctr
] + output_col
;
183 /* Rows of zeroes can be exploited in the same way as we did with columns.
184 * However, the column calculation has created many nonzero AC terms, so
185 * the simplification applies less often (typically 5% to 10% of the time).
186 * And testing floats for zero is relatively expensive, so we don't bother.
191 /* Prepare range-limit and float->int conversion */
192 z5
= wsptr
[0] + (((FAST_FLOAT
) RANGE_CENTER
) + ((FAST_FLOAT
) 0.5));
193 tmp10
= z5
+ wsptr
[4];
194 tmp11
= z5
- wsptr
[4];
196 tmp13
= wsptr
[2] + wsptr
[6];
197 tmp12
= (wsptr
[2] - wsptr
[6]) *
198 ((FAST_FLOAT
) 1.414213562) - tmp13
; /* 2*c4 */
200 tmp0
= tmp10
+ tmp13
;
201 tmp3
= tmp10
- tmp13
;
202 tmp1
= tmp11
+ tmp12
;
203 tmp2
= tmp11
- tmp12
;
207 z13
= wsptr
[5] + wsptr
[3];
208 z10
= wsptr
[5] - wsptr
[3];
209 z11
= wsptr
[1] + wsptr
[7];
210 z12
= wsptr
[1] - wsptr
[7];
212 tmp7
= z11
+ z13
; /* phase 5 */
213 tmp11
= (z11
- z13
) * ((FAST_FLOAT
) 1.414213562); /* 2*c4 */
215 z5
= (z10
+ z12
) * ((FAST_FLOAT
) 1.847759065); /* 2*c2 */
216 tmp10
= z5
- z12
* ((FAST_FLOAT
) 1.082392200); /* 2*(c2-c6) */
217 tmp12
= z5
- z10
* ((FAST_FLOAT
) 2.613125930); /* 2*(c2+c6) */
219 tmp6
= tmp12
- tmp7
; /* phase 2 */
223 /* Final output stage: float->int conversion and range-limit */
225 outptr
[0] = range_limit
[(int) (tmp0
+ tmp7
) & RANGE_MASK
];
226 outptr
[7] = range_limit
[(int) (tmp0
- tmp7
) & RANGE_MASK
];
227 outptr
[1] = range_limit
[(int) (tmp1
+ tmp6
) & RANGE_MASK
];
228 outptr
[6] = range_limit
[(int) (tmp1
- tmp6
) & RANGE_MASK
];
229 outptr
[2] = range_limit
[(int) (tmp2
+ tmp5
) & RANGE_MASK
];
230 outptr
[5] = range_limit
[(int) (tmp2
- tmp5
) & RANGE_MASK
];
231 outptr
[3] = range_limit
[(int) (tmp3
+ tmp4
) & RANGE_MASK
];
232 outptr
[4] = range_limit
[(int) (tmp3
- tmp4
) & RANGE_MASK
];
234 wsptr
+= DCTSIZE
; /* advance pointer to next row */
238 #endif /* DCT_FLOAT_SUPPORTED */