4 * Copyright (C) 1994-1998, Thomas G. Lane.
5 * This file is part of the Independent JPEG Group's software.
6 * For conditions of distribution and use, see the accompanying README file.
8 * This file contains a floating-point implementation of the
9 * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
10 * must also perform dequantization of the input coefficients.
12 * This implementation should be more accurate than either of the integer
13 * IDCT implementations. However, it may not give the same results on all
14 * machines because of differences in roundoff behavior. Speed will depend
15 * on the hardware's floating point capacity.
17 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
18 * on each row (or vice versa, but it's more convenient to emit a row at
19 * a time). Direct algorithms are also available, but they are much more
20 * complex and seem not to be any faster when reduced to code.
22 * This implementation is based on Arai, Agui, and Nakajima's algorithm for
23 * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
24 * Japanese, but the algorithm is described in the Pennebaker & Mitchell
25 * JPEG textbook (see REFERENCES section in file README). The following code
26 * is based directly on figure 4-8 in P&M.
27 * While an 8-point DCT cannot be done in less than 11 multiplies, it is
28 * possible to arrange the computation so that many of the multiplies are
29 * simple scalings of the final outputs. These multiplies can then be
30 * folded into the multiplications or divisions by the JPEG quantization
31 * table entries. The AA&N method leaves only 5 multiplies and 29 adds
32 * to be done in the DCT itself.
33 * The primary disadvantage of this method is that with a fixed-point
34 * implementation, accuracy is lost due to imprecise representation of the
35 * scaled quantization values. However, that problem does not arise if
36 * we use floating point arithmetic.
39 #define JPEG_INTERNALS
42 #include "jdct.h" /* Private declarations for DCT subsystem */
44 #ifdef DCT_FLOAT_SUPPORTED
48 * This module is specialized to the case DCTSIZE = 8.
52 Sorry
, this code only copes with
8x8 DCTs
. /* deliberate syntax err */
56 /* Dequantize a coefficient by multiplying it by the multiplier-table
57 * entry; produce a float result.
60 #define DEQUANTIZE(coef,quantval) (((FAST_FLOAT) (coef)) * (quantval))
64 * Perform dequantization and inverse DCT on one block of coefficients.
68 jpeg_idct_float (j_decompress_ptr cinfo
, jpeg_component_info
* compptr
,
70 JSAMPARRAY output_buf
, JDIMENSION output_col
)
72 FAST_FLOAT tmp0
, tmp1
, tmp2
, tmp3
, tmp4
, tmp5
, tmp6
, tmp7
;
73 FAST_FLOAT tmp10
, tmp11
, tmp12
, tmp13
;
74 FAST_FLOAT z5
, z10
, z11
, z12
, z13
;
76 FLOAT_MULT_TYPE
* quantptr
;
79 JSAMPLE
*range_limit
= IDCT_range_limit(cinfo
);
81 FAST_FLOAT workspace
[DCTSIZE2
]; /* buffers data between passes */
84 /* Pass 1: process columns from input, store into work array. */
87 quantptr
= (FLOAT_MULT_TYPE
*) compptr
->dct_table
;
89 for (ctr
= DCTSIZE
; ctr
> 0; ctr
--) {
90 /* Due to quantization, we will usually find that many of the input
91 * coefficients are zero, especially the AC terms. We can exploit this
92 * by short-circuiting the IDCT calculation for any column in which all
93 * the AC terms are zero. In that case each output is equal to the
94 * DC coefficient (with scale factor as needed).
95 * With typical images and quantization tables, half or more of the
96 * column DCT calculations can be simplified this way.
99 if (inptr
[DCTSIZE
*1] == 0 && inptr
[DCTSIZE
*2] == 0 &&
100 inptr
[DCTSIZE
*3] == 0 && inptr
[DCTSIZE
*4] == 0 &&
101 inptr
[DCTSIZE
*5] == 0 && inptr
[DCTSIZE
*6] == 0 &&
102 inptr
[DCTSIZE
*7] == 0) {
103 /* AC terms all zero */
104 FAST_FLOAT dcval
= DEQUANTIZE(inptr
[DCTSIZE
*0], quantptr
[DCTSIZE
*0]);
106 wsptr
[DCTSIZE
*0] = dcval
;
107 wsptr
[DCTSIZE
*1] = dcval
;
108 wsptr
[DCTSIZE
*2] = dcval
;
109 wsptr
[DCTSIZE
*3] = dcval
;
110 wsptr
[DCTSIZE
*4] = dcval
;
111 wsptr
[DCTSIZE
*5] = dcval
;
112 wsptr
[DCTSIZE
*6] = dcval
;
113 wsptr
[DCTSIZE
*7] = dcval
;
115 inptr
++; /* advance pointers to next column */
123 tmp0
= DEQUANTIZE(inptr
[DCTSIZE
*0], quantptr
[DCTSIZE
*0]);
124 tmp1
= DEQUANTIZE(inptr
[DCTSIZE
*2], quantptr
[DCTSIZE
*2]);
125 tmp2
= DEQUANTIZE(inptr
[DCTSIZE
*4], quantptr
[DCTSIZE
*4]);
126 tmp3
= DEQUANTIZE(inptr
[DCTSIZE
*6], quantptr
[DCTSIZE
*6]);
128 tmp10
= tmp0
+ tmp2
; /* phase 3 */
131 tmp13
= tmp1
+ tmp3
; /* phases 5-3 */
132 tmp12
= (tmp1
- tmp3
) * ((FAST_FLOAT
) 1.414213562) - tmp13
; /* 2*c4 */
134 tmp0
= tmp10
+ tmp13
; /* phase 2 */
135 tmp3
= tmp10
- tmp13
;
136 tmp1
= tmp11
+ tmp12
;
137 tmp2
= tmp11
- tmp12
;
141 tmp4
= DEQUANTIZE(inptr
[DCTSIZE
*1], quantptr
[DCTSIZE
*1]);
142 tmp5
= DEQUANTIZE(inptr
[DCTSIZE
*3], quantptr
[DCTSIZE
*3]);
143 tmp6
= DEQUANTIZE(inptr
[DCTSIZE
*5], quantptr
[DCTSIZE
*5]);
144 tmp7
= DEQUANTIZE(inptr
[DCTSIZE
*7], quantptr
[DCTSIZE
*7]);
146 z13
= tmp6
+ tmp5
; /* phase 6 */
151 tmp7
= z11
+ z13
; /* phase 5 */
152 tmp11
= (z11
- z13
) * ((FAST_FLOAT
) 1.414213562); /* 2*c4 */
154 z5
= (z10
+ z12
) * ((FAST_FLOAT
) 1.847759065); /* 2*c2 */
155 tmp10
= ((FAST_FLOAT
) 1.082392200) * z12
- z5
; /* 2*(c2-c6) */
156 tmp12
= ((FAST_FLOAT
) -2.613125930) * z10
+ z5
; /* -2*(c2+c6) */
158 tmp6
= tmp12
- tmp7
; /* phase 2 */
162 wsptr
[DCTSIZE
*0] = tmp0
+ tmp7
;
163 wsptr
[DCTSIZE
*7] = tmp0
- tmp7
;
164 wsptr
[DCTSIZE
*1] = tmp1
+ tmp6
;
165 wsptr
[DCTSIZE
*6] = tmp1
- tmp6
;
166 wsptr
[DCTSIZE
*2] = tmp2
+ tmp5
;
167 wsptr
[DCTSIZE
*5] = tmp2
- tmp5
;
168 wsptr
[DCTSIZE
*4] = tmp3
+ tmp4
;
169 wsptr
[DCTSIZE
*3] = tmp3
- tmp4
;
171 inptr
++; /* advance pointers to next column */
176 /* Pass 2: process rows from work array, store into output array. */
177 /* Note that we must descale the results by a factor of 8 == 2**3. */
180 for (ctr
= 0; ctr
< DCTSIZE
; ctr
++) {
181 outptr
= output_buf
[ctr
] + output_col
;
182 /* Rows of zeroes can be exploited in the same way as we did with columns.
183 * However, the column calculation has created many nonzero AC terms, so
184 * the simplification applies less often (typically 5% to 10% of the time).
185 * And testing floats for zero is relatively expensive, so we don't bother.
190 tmp10
= wsptr
[0] + wsptr
[4];
191 tmp11
= wsptr
[0] - wsptr
[4];
193 tmp13
= wsptr
[2] + wsptr
[6];
194 tmp12
= (wsptr
[2] - wsptr
[6]) * ((FAST_FLOAT
) 1.414213562) - tmp13
;
196 tmp0
= tmp10
+ tmp13
;
197 tmp3
= tmp10
- tmp13
;
198 tmp1
= tmp11
+ tmp12
;
199 tmp2
= tmp11
- tmp12
;
203 z13
= wsptr
[5] + wsptr
[3];
204 z10
= wsptr
[5] - wsptr
[3];
205 z11
= wsptr
[1] + wsptr
[7];
206 z12
= wsptr
[1] - wsptr
[7];
209 tmp11
= (z11
- z13
) * ((FAST_FLOAT
) 1.414213562);
211 z5
= (z10
+ z12
) * ((FAST_FLOAT
) 1.847759065); /* 2*c2 */
212 tmp10
= ((FAST_FLOAT
) 1.082392200) * z12
- z5
; /* 2*(c2-c6) */
213 tmp12
= ((FAST_FLOAT
) -2.613125930) * z10
+ z5
; /* -2*(c2+c6) */
219 /* Final output stage: scale down by a factor of 8 and range-limit */
221 outptr
[0] = range_limit
[(int) DESCALE((JPEG_INT32
) (tmp0
+ tmp7
), 3)
223 outptr
[7] = range_limit
[(int) DESCALE((JPEG_INT32
) (tmp0
- tmp7
), 3)
225 outptr
[1] = range_limit
[(int) DESCALE((JPEG_INT32
) (tmp1
+ tmp6
), 3)
227 outptr
[6] = range_limit
[(int) DESCALE((JPEG_INT32
) (tmp1
- tmp6
), 3)
229 outptr
[2] = range_limit
[(int) DESCALE((JPEG_INT32
) (tmp2
+ tmp5
), 3)
231 outptr
[5] = range_limit
[(int) DESCALE((JPEG_INT32
) (tmp2
- tmp5
), 3)
233 outptr
[4] = range_limit
[(int) DESCALE((JPEG_INT32
) (tmp3
+ tmp4
), 3)
235 outptr
[3] = range_limit
[(int) DESCALE((JPEG_INT32
) (tmp3
- tmp4
), 3)
238 wsptr
+= DCTSIZE
; /* advance pointer to next row */
242 #endif /* DCT_FLOAT_SUPPORTED */