8 * Copyright (C) 1994-1998, Thomas G. Lane.
9 * This file is part of the Independent JPEG Group's software.
10 * For conditions of distribution and use, see the accompanying README file.
12 * This file contains a floating-point implementation of the
13 * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
14 * must also perform dequantization of the input coefficients.
16 * This implementation should be more accurate than either of the integer
17 * IDCT implementations. However, it may not give the same results on all
18 * machines because of differences in roundoff behavior. Speed will depend
19 * on the hardware's floating point capacity.
21 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
22 * on each row (or vice versa, but it's more convenient to emit a row at
23 * a time). Direct algorithms are also available, but they are much more
24 * complex and seem not to be any faster when reduced to code.
26 * This implementation is based on Arai, Agui, and Nakajima's algorithm for
27 * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
28 * Japanese, but the algorithm is described in the Pennebaker & Mitchell
29 * JPEG textbook (see REFERENCES section in file README). The following code
30 * is based directly on figure 4-8 in P&M.
31 * While an 8-point DCT cannot be done in less than 11 multiplies, it is
32 * possible to arrange the computation so that many of the multiplies are
33 * simple scalings of the final outputs. These multiplies can then be
34 * folded into the multiplications or divisions by the JPEG quantization
35 * table entries. The AA&N method leaves only 5 multiplies and 29 adds
36 * to be done in the DCT itself.
37 * The primary disadvantage of this method is that with a fixed-point
38 * implementation, accuracy is lost due to imprecise representation of the
39 * scaled quantization values. However, that problem does not arise if
40 * we use floating point arithmetic.
43 #define JPEG_INTERNALS
46 #include "jdct.h" /* Private declarations for DCT subsystem */
48 #ifdef DCT_FLOAT_SUPPORTED
52 * This module is specialized to the case DCTSIZE = 8.
56 Sorry
, this code only copes with
8x8 DCTs
. /* deliberate syntax err */
60 /* Dequantize a coefficient by multiplying it by the multiplier-table
61 * entry; produce a float result.
64 #define DEQUANTIZE(coef,quantval) (((FAST_FLOAT) (coef)) * (quantval))
68 * Perform dequantization and inverse DCT on one block of coefficients.
72 jpeg_idct_float (j_decompress_ptr cinfo
, jpeg_component_info
* compptr
,
74 JSAMPARRAY output_buf
, JDIMENSION output_col
)
76 FAST_FLOAT tmp0
, tmp1
, tmp2
, tmp3
, tmp4
, tmp5
, tmp6
, tmp7
;
77 FAST_FLOAT tmp10
, tmp11
, tmp12
, tmp13
;
78 FAST_FLOAT z5
, z10
, z11
, z12
, z13
;
80 FLOAT_MULT_TYPE
* quantptr
;
83 JSAMPLE
*range_limit
= IDCT_range_limit(cinfo
);
85 FAST_FLOAT workspace
[DCTSIZE2
]; /* buffers data between passes */
88 /* Pass 1: process columns from input, store into work array. */
91 quantptr
= (FLOAT_MULT_TYPE
*) compptr
->dct_table
;
93 for (ctr
= DCTSIZE
; ctr
> 0; ctr
--) {
94 /* Due to quantization, we will usually find that many of the input
95 * coefficients are zero, especially the AC terms. We can exploit this
96 * by short-circuiting the IDCT calculation for any column in which all
97 * the AC terms are zero. In that case each output is equal to the
98 * DC coefficient (with scale factor as needed).
99 * With typical images and quantization tables, half or more of the
100 * column DCT calculations can be simplified this way.
103 if (inptr
[DCTSIZE
*1] == 0 && inptr
[DCTSIZE
*2] == 0 &&
104 inptr
[DCTSIZE
*3] == 0 && inptr
[DCTSIZE
*4] == 0 &&
105 inptr
[DCTSIZE
*5] == 0 && inptr
[DCTSIZE
*6] == 0 &&
106 inptr
[DCTSIZE
*7] == 0) {
107 /* AC terms all zero */
108 FAST_FLOAT dcval
= DEQUANTIZE(inptr
[DCTSIZE
*0], quantptr
[DCTSIZE
*0]);
110 wsptr
[DCTSIZE
*0] = dcval
;
111 wsptr
[DCTSIZE
*1] = dcval
;
112 wsptr
[DCTSIZE
*2] = dcval
;
113 wsptr
[DCTSIZE
*3] = dcval
;
114 wsptr
[DCTSIZE
*4] = dcval
;
115 wsptr
[DCTSIZE
*5] = dcval
;
116 wsptr
[DCTSIZE
*6] = dcval
;
117 wsptr
[DCTSIZE
*7] = dcval
;
119 inptr
++; /* advance pointers to next column */
127 tmp0
= DEQUANTIZE(inptr
[DCTSIZE
*0], quantptr
[DCTSIZE
*0]);
128 tmp1
= DEQUANTIZE(inptr
[DCTSIZE
*2], quantptr
[DCTSIZE
*2]);
129 tmp2
= DEQUANTIZE(inptr
[DCTSIZE
*4], quantptr
[DCTSIZE
*4]);
130 tmp3
= DEQUANTIZE(inptr
[DCTSIZE
*6], quantptr
[DCTSIZE
*6]);
132 tmp10
= tmp0
+ tmp2
; /* phase 3 */
135 tmp13
= tmp1
+ tmp3
; /* phases 5-3 */
136 tmp12
= (tmp1
- tmp3
) * ((FAST_FLOAT
) 1.414213562) - tmp13
; /* 2*c4 */
138 tmp0
= tmp10
+ tmp13
; /* phase 2 */
139 tmp3
= tmp10
- tmp13
;
140 tmp1
= tmp11
+ tmp12
;
141 tmp2
= tmp11
- tmp12
;
145 tmp4
= DEQUANTIZE(inptr
[DCTSIZE
*1], quantptr
[DCTSIZE
*1]);
146 tmp5
= DEQUANTIZE(inptr
[DCTSIZE
*3], quantptr
[DCTSIZE
*3]);
147 tmp6
= DEQUANTIZE(inptr
[DCTSIZE
*5], quantptr
[DCTSIZE
*5]);
148 tmp7
= DEQUANTIZE(inptr
[DCTSIZE
*7], quantptr
[DCTSIZE
*7]);
150 z13
= tmp6
+ tmp5
; /* phase 6 */
155 tmp7
= z11
+ z13
; /* phase 5 */
156 tmp11
= (z11
- z13
) * ((FAST_FLOAT
) 1.414213562); /* 2*c4 */
158 z5
= (z10
+ z12
) * ((FAST_FLOAT
) 1.847759065); /* 2*c2 */
159 tmp10
= ((FAST_FLOAT
) 1.082392200) * z12
- z5
; /* 2*(c2-c6) */
160 tmp12
= ((FAST_FLOAT
) -2.613125930) * z10
+ z5
; /* -2*(c2+c6) */
162 tmp6
= tmp12
- tmp7
; /* phase 2 */
166 wsptr
[DCTSIZE
*0] = tmp0
+ tmp7
;
167 wsptr
[DCTSIZE
*7] = tmp0
- tmp7
;
168 wsptr
[DCTSIZE
*1] = tmp1
+ tmp6
;
169 wsptr
[DCTSIZE
*6] = tmp1
- tmp6
;
170 wsptr
[DCTSIZE
*2] = tmp2
+ tmp5
;
171 wsptr
[DCTSIZE
*5] = tmp2
- tmp5
;
172 wsptr
[DCTSIZE
*4] = tmp3
+ tmp4
;
173 wsptr
[DCTSIZE
*3] = tmp3
- tmp4
;
175 inptr
++; /* advance pointers to next column */
180 /* Pass 2: process rows from work array, store into output array. */
181 /* Note that we must descale the results by a factor of 8 == 2**3. */
184 for (ctr
= 0; ctr
< DCTSIZE
; ctr
++) {
185 outptr
= output_buf
[ctr
] + output_col
;
186 /* Rows of zeroes can be exploited in the same way as we did with columns.
187 * However, the column calculation has created many nonzero AC terms, so
188 * the simplification applies less often (typically 5% to 10% of the time).
189 * And testing floats for zero is relatively expensive, so we don't bother.
194 tmp10
= wsptr
[0] + wsptr
[4];
195 tmp11
= wsptr
[0] - wsptr
[4];
197 tmp13
= wsptr
[2] + wsptr
[6];
198 tmp12
= (wsptr
[2] - wsptr
[6]) * ((FAST_FLOAT
) 1.414213562) - tmp13
;
200 tmp0
= tmp10
+ tmp13
;
201 tmp3
= tmp10
- tmp13
;
202 tmp1
= tmp11
+ tmp12
;
203 tmp2
= tmp11
- tmp12
;
207 z13
= wsptr
[5] + wsptr
[3];
208 z10
= wsptr
[5] - wsptr
[3];
209 z11
= wsptr
[1] + wsptr
[7];
210 z12
= wsptr
[1] - wsptr
[7];
213 tmp11
= (z11
- z13
) * ((FAST_FLOAT
) 1.414213562);
215 z5
= (z10
+ z12
) * ((FAST_FLOAT
) 1.847759065); /* 2*c2 */
216 tmp10
= ((FAST_FLOAT
) 1.082392200) * z12
- z5
; /* 2*(c2-c6) */
217 tmp12
= ((FAST_FLOAT
) -2.613125930) * z10
+ z5
; /* -2*(c2+c6) */
223 /* Final output stage: scale down by a factor of 8 and range-limit */
225 outptr
[0] = range_limit
[(int) DESCALE((INT32
) (tmp0
+ tmp7
), 3)
227 outptr
[7] = range_limit
[(int) DESCALE((INT32
) (tmp0
- tmp7
), 3)
229 outptr
[1] = range_limit
[(int) DESCALE((INT32
) (tmp1
+ tmp6
), 3)
231 outptr
[6] = range_limit
[(int) DESCALE((INT32
) (tmp1
- tmp6
), 3)
233 outptr
[2] = range_limit
[(int) DESCALE((INT32
) (tmp2
+ tmp5
), 3)
235 outptr
[5] = range_limit
[(int) DESCALE((INT32
) (tmp2
- tmp5
), 3)
237 outptr
[4] = range_limit
[(int) DESCALE((INT32
) (tmp3
+ tmp4
), 3)
239 outptr
[3] = range_limit
[(int) DESCALE((INT32
) (tmp3
- tmp4
), 3)
242 wsptr
+= DCTSIZE
; /* advance pointer to next row */
246 #endif /* DCT_FLOAT_SUPPORTED */