4 * Copyright (C) 1994-1998, Thomas G. Lane.
5 * Modified 2010 by Guido Vollbeding.
6 * This file is part of the Independent JPEG Group's software.
7 * For conditions of distribution and use, see the accompanying README file.
9 * This file contains a floating-point implementation of the
10 * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
11 * must also perform dequantization of the input coefficients.
13 * This implementation should be more accurate than either of the integer
14 * IDCT implementations. However, it may not give the same results on all
15 * machines because of differences in roundoff behavior. Speed will depend
16 * on the hardware's floating point capacity.
18 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
19 * on each row (or vice versa, but it's more convenient to emit a row at
20 * a time). Direct algorithms are also available, but they are much more
21 * complex and seem not to be any faster when reduced to code.
23 * This implementation is based on Arai, Agui, and Nakajima's algorithm for
24 * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
25 * Japanese, but the algorithm is described in the Pennebaker & Mitchell
26 * JPEG textbook (see REFERENCES section in file README). The following code
27 * is based directly on figure 4-8 in P&M.
28 * While an 8-point DCT cannot be done in less than 11 multiplies, it is
29 * possible to arrange the computation so that many of the multiplies are
30 * simple scalings of the final outputs. These multiplies can then be
31 * folded into the multiplications or divisions by the JPEG quantization
32 * table entries. The AA&N method leaves only 5 multiplies and 29 adds
33 * to be done in the DCT itself.
34 * The primary disadvantage of this method is that with a fixed-point
35 * implementation, accuracy is lost due to imprecise representation of the
36 * scaled quantization values. However, that problem does not arise if
37 * we use floating point arithmetic.
40 #define JPEG_INTERNALS
43 #include "jdct.h" /* Private declarations for DCT subsystem */
45 #ifdef DCT_FLOAT_SUPPORTED
49 * This module is specialized to the case DCTSIZE = 8.
53 Sorry
, this code only copes with
8x8 DCTs
. /* deliberate syntax err */
57 /* Dequantize a coefficient by multiplying it by the multiplier-table
58 * entry; produce a float result.
61 #define DEQUANTIZE(coef,quantval) (((FAST_FLOAT) (coef)) * (quantval))
65 * Perform dequantization and inverse DCT on one block of coefficients.
69 jpeg_idct_float (j_decompress_ptr cinfo
, jpeg_component_info
* compptr
,
71 JSAMPARRAY output_buf
, JDIMENSION output_col
)
73 FAST_FLOAT tmp0
, tmp1
, tmp2
, tmp3
, tmp4
, tmp5
, tmp6
, tmp7
;
74 FAST_FLOAT tmp10
, tmp11
, tmp12
, tmp13
;
75 FAST_FLOAT z5
, z10
, z11
, z12
, z13
;
77 FLOAT_MULT_TYPE
* quantptr
;
80 JSAMPLE
*range_limit
= cinfo
->sample_range_limit
;
82 FAST_FLOAT workspace
[DCTSIZE2
]; /* buffers data between passes */
84 /* Pass 1: process columns from input, store into work array. */
87 quantptr
= (FLOAT_MULT_TYPE
*) compptr
->dct_table
;
89 for (ctr
= DCTSIZE
; ctr
> 0; ctr
--) {
90 /* Due to quantization, we will usually find that many of the input
91 * coefficients are zero, especially the AC terms. We can exploit this
92 * by short-circuiting the IDCT calculation for any column in which all
93 * the AC terms are zero. In that case each output is equal to the
94 * DC coefficient (with scale factor as needed).
95 * With typical images and quantization tables, half or more of the
96 * column DCT calculations can be simplified this way.
99 if (inptr
[DCTSIZE
*1] == 0 && inptr
[DCTSIZE
*2] == 0 &&
100 inptr
[DCTSIZE
*3] == 0 && inptr
[DCTSIZE
*4] == 0 &&
101 inptr
[DCTSIZE
*5] == 0 && inptr
[DCTSIZE
*6] == 0 &&
102 inptr
[DCTSIZE
*7] == 0) {
103 /* AC terms all zero */
104 FAST_FLOAT dcval
= DEQUANTIZE(inptr
[DCTSIZE
*0], quantptr
[DCTSIZE
*0]);
106 wsptr
[DCTSIZE
*0] = dcval
;
107 wsptr
[DCTSIZE
*1] = dcval
;
108 wsptr
[DCTSIZE
*2] = dcval
;
109 wsptr
[DCTSIZE
*3] = dcval
;
110 wsptr
[DCTSIZE
*4] = dcval
;
111 wsptr
[DCTSIZE
*5] = dcval
;
112 wsptr
[DCTSIZE
*6] = dcval
;
113 wsptr
[DCTSIZE
*7] = dcval
;
115 inptr
++; /* advance pointers to next column */
123 tmp0
= DEQUANTIZE(inptr
[DCTSIZE
*0], quantptr
[DCTSIZE
*0]);
124 tmp1
= DEQUANTIZE(inptr
[DCTSIZE
*2], quantptr
[DCTSIZE
*2]);
125 tmp2
= DEQUANTIZE(inptr
[DCTSIZE
*4], quantptr
[DCTSIZE
*4]);
126 tmp3
= DEQUANTIZE(inptr
[DCTSIZE
*6], quantptr
[DCTSIZE
*6]);
128 tmp10
= tmp0
+ tmp2
; /* phase 3 */
131 tmp13
= tmp1
+ tmp3
; /* phases 5-3 */
132 tmp12
= (tmp1
- tmp3
) * ((FAST_FLOAT
) 1.414213562) - tmp13
; /* 2*c4 */
134 tmp0
= tmp10
+ tmp13
; /* phase 2 */
135 tmp3
= tmp10
- tmp13
;
136 tmp1
= tmp11
+ tmp12
;
137 tmp2
= tmp11
- tmp12
;
141 tmp4
= DEQUANTIZE(inptr
[DCTSIZE
*1], quantptr
[DCTSIZE
*1]);
142 tmp5
= DEQUANTIZE(inptr
[DCTSIZE
*3], quantptr
[DCTSIZE
*3]);
143 tmp6
= DEQUANTIZE(inptr
[DCTSIZE
*5], quantptr
[DCTSIZE
*5]);
144 tmp7
= DEQUANTIZE(inptr
[DCTSIZE
*7], quantptr
[DCTSIZE
*7]);
146 z13
= tmp6
+ tmp5
; /* phase 6 */
151 tmp7
= z11
+ z13
; /* phase 5 */
152 tmp11
= (z11
- z13
) * ((FAST_FLOAT
) 1.414213562); /* 2*c4 */
154 z5
= (z10
+ z12
) * ((FAST_FLOAT
) 1.847759065); /* 2*c2 */
155 tmp10
= z5
- z12
* ((FAST_FLOAT
) 1.082392200); /* 2*(c2-c6) */
156 tmp12
= z5
- z10
* ((FAST_FLOAT
) 2.613125930); /* 2*(c2+c6) */
158 tmp6
= tmp12
- tmp7
; /* phase 2 */
162 wsptr
[DCTSIZE
*0] = tmp0
+ tmp7
;
163 wsptr
[DCTSIZE
*7] = tmp0
- tmp7
;
164 wsptr
[DCTSIZE
*1] = tmp1
+ tmp6
;
165 wsptr
[DCTSIZE
*6] = tmp1
- tmp6
;
166 wsptr
[DCTSIZE
*2] = tmp2
+ tmp5
;
167 wsptr
[DCTSIZE
*5] = tmp2
- tmp5
;
168 wsptr
[DCTSIZE
*3] = tmp3
+ tmp4
;
169 wsptr
[DCTSIZE
*4] = tmp3
- tmp4
;
171 inptr
++; /* advance pointers to next column */
176 /* Pass 2: process rows from work array, store into output array. */
179 for (ctr
= 0; ctr
< DCTSIZE
; ctr
++) {
180 outptr
= output_buf
[ctr
] + output_col
;
181 /* Rows of zeroes can be exploited in the same way as we did with columns.
182 * However, the column calculation has created many nonzero AC terms, so
183 * the simplification applies less often (typically 5% to 10% of the time).
184 * And testing floats for zero is relatively expensive, so we don't bother.
189 /* Apply signed->unsigned and prepare float->int conversion */
190 z5
= wsptr
[0] + ((FAST_FLOAT
) CENTERJSAMPLE
+ (FAST_FLOAT
) 0.5);
191 tmp10
= z5
+ wsptr
[4];
192 tmp11
= z5
- wsptr
[4];
194 tmp13
= wsptr
[2] + wsptr
[6];
195 tmp12
= (wsptr
[2] - wsptr
[6]) * ((FAST_FLOAT
) 1.414213562) - tmp13
;
197 tmp0
= tmp10
+ tmp13
;
198 tmp3
= tmp10
- tmp13
;
199 tmp1
= tmp11
+ tmp12
;
200 tmp2
= tmp11
- tmp12
;
204 z13
= wsptr
[5] + wsptr
[3];
205 z10
= wsptr
[5] - wsptr
[3];
206 z11
= wsptr
[1] + wsptr
[7];
207 z12
= wsptr
[1] - wsptr
[7];
210 tmp11
= (z11
- z13
) * ((FAST_FLOAT
) 1.414213562);
212 z5
= (z10
+ z12
) * ((FAST_FLOAT
) 1.847759065); /* 2*c2 */
213 tmp10
= z5
- z12
* ((FAST_FLOAT
) 1.082392200); /* 2*(c2-c6) */
214 tmp12
= z5
- z10
* ((FAST_FLOAT
) 2.613125930); /* 2*(c2+c6) */
220 /* Final output stage: float->int conversion and range-limit */
222 outptr
[0] = range_limit
[((int) (tmp0
+ tmp7
)) & RANGE_MASK
];
223 outptr
[7] = range_limit
[((int) (tmp0
- tmp7
)) & RANGE_MASK
];
224 outptr
[1] = range_limit
[((int) (tmp1
+ tmp6
)) & RANGE_MASK
];
225 outptr
[6] = range_limit
[((int) (tmp1
- tmp6
)) & RANGE_MASK
];
226 outptr
[2] = range_limit
[((int) (tmp2
+ tmp5
)) & RANGE_MASK
];
227 outptr
[5] = range_limit
[((int) (tmp2
- tmp5
)) & RANGE_MASK
];
228 outptr
[3] = range_limit
[((int) (tmp3
+ tmp4
)) & RANGE_MASK
];
229 outptr
[4] = range_limit
[((int) (tmp3
- tmp4
)) & RANGE_MASK
];
231 wsptr
+= DCTSIZE
; /* advance pointer to next row */
235 #endif /* DCT_FLOAT_SUPPORTED */