]>
Commit | Line | Data |
---|---|---|
00337e45 A |
1 | /*- |
2 | * Copyright (c) 2009 Xin LI <delphij@FreeBSD.org> | |
3 | * All rights reserved. | |
4 | * | |
5 | * Redistribution and use in source and binary forms, with or without | |
6 | * modification, are permitted provided that the following conditions | |
7 | * are met: | |
8 | * 1. Redistributions of source code must retain the above copyright | |
9 | * notice, this list of conditions and the following disclaimer. | |
10 | * 2. Redistributions in binary form must reproduce the above copyright | |
11 | * notice, this list of conditions and the following disclaimer in the | |
12 | * documentation and/or other materials provided with the distribution. | |
13 | * | |
14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
17 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
18 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
19 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
20 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
21 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
22 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
23 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
24 | * SUCH DAMAGE. | |
25 | * | |
26 | * $FreeBSD: src/usr.bin/gzip/unpack.c,v 1.3 2010/10/16 15:24:04 bcr Exp $ | |
27 | */ | |
28 | ||
29 | /* This file is #included by gzip.c */ | |
30 | ||
31 | /* | |
32 | * pack(1) file format: | |
33 | * | |
34 | * The first 7 bytes is the header: | |
35 | * 00, 01 - Signature (US, RS), we already validated it earlier. | |
36 | * 02..05 - Uncompressed size | |
37 | * 06 - Level for the huffman tree (<=24) | |
38 | * | |
39 | * pack(1) will then store symbols (leaf) nodes count in each huffman | |
40 | * tree levels, each level would consume 1 byte (See [1]). | |
41 | * | |
42 | * After the symbol count table, there is the symbol table, storing | |
43 | * symbols represented by corresponding leaf node. EOB is not being | |
44 | * explicitly transmitted (not necessary anyway) in the symbol table. | |
45 | * | |
46 | * Compressed data goes after the symbol table. | |
47 | * | |
48 | * NOTES | |
49 | * | |
50 | * [1] If we count EOB into the symbols, that would mean that we will | |
51 | * have at most 256 symbols in the huffman tree. pack(1) rejects empty | |
52 | * file and files that just repeats one character, which means that we | |
53 | * will have at least 2 symbols. Therefore, pack(1) would reduce the | |
54 | * last level symbol count by 2 which makes it a number in | |
55 | * range [0..254], so all levels' symbol count would fit into 1 byte. | |
56 | */ | |
57 | ||
58 | #define PACK_HEADER_LENGTH 7 | |
59 | #define HTREE_MAXLEVEL 24 | |
60 | ||
61 | /* | |
62 | * unpack descriptor | |
63 | * | |
64 | * Represent the huffman tree in a similar way that pack(1) would | |
65 | * store in a packed file. We store all symbols in a linear table, | |
66 | * and store pointers to each level's first symbol. In addition to | |
67 | * that, maintain two counts for each level: inner nodes count and | |
68 | * leaf nodes count. | |
69 | */ | |
70 | typedef struct { | |
71 | int symbol_size; /* Size of the symbol table */ | |
72 | int treelevels; /* Levels for the huffman tree */ | |
73 | ||
74 | int *symbolsin; /* Table of leaf symbols count in | |
75 | each level */ | |
76 | int *inodesin; /* Table of internal nodes count in | |
77 | each level */ | |
78 | ||
79 | char *symbol; /* The symbol table */ | |
80 | char *symbol_eob; /* Pointer to the EOB symbol */ | |
81 | char **tree; /* Decoding huffman tree (pointers to | |
82 | first symbol of each tree level */ | |
83 | ||
84 | off_t uncompressed_size; /* Uncompressed size */ | |
85 | FILE *fpIn; /* Input stream */ | |
86 | FILE *fpOut; /* Output stream */ | |
87 | } unpack_descriptor_t; | |
88 | ||
89 | /* | |
90 | * Release resource allocated to an unpack descriptor. | |
91 | * | |
92 | * Caller is responsible to make sure that all of these pointers are | |
93 | * initialized (in our case, they all point to valid memory block). | |
94 | * We don't zero out pointers here because nobody else would ever | |
95 | * reference the memory block without scrubbing them. | |
96 | */ | |
97 | static void | |
98 | unpack_descriptor_fini(unpack_descriptor_t *unpackd) | |
99 | { | |
100 | ||
101 | free(unpackd->symbolsin); | |
102 | free(unpackd->inodesin); | |
103 | free(unpackd->symbol); | |
104 | free(unpackd->tree); | |
105 | ||
106 | fclose(unpackd->fpIn); | |
107 | fclose(unpackd->fpOut); | |
108 | } | |
109 | ||
110 | /* | |
111 | * Recursively fill the internal node count table | |
112 | */ | |
113 | static void | |
114 | unpackd_fill_inodesin(const unpack_descriptor_t *unpackd, int level) | |
115 | { | |
116 | ||
117 | /* | |
118 | * The internal nodes would be 1/2 of total internal nodes and | |
119 | * leaf nodes in the next level. For the last level there | |
120 | * would be no internal node by definition. | |
121 | */ | |
122 | if (level < unpackd->treelevels) { | |
123 | unpackd_fill_inodesin(unpackd, level + 1); | |
124 | unpackd->inodesin[level] = (unpackd->inodesin[level + 1] + | |
125 | unpackd->symbolsin[level + 1]) / 2; | |
126 | } else | |
127 | unpackd->inodesin[level] = 0; | |
128 | } | |
129 | ||
130 | /* | |
131 | * Update counter for accepted bytes | |
132 | */ | |
133 | static void | |
134 | accepted_bytes(off_t *bytes_in, off_t newbytes) | |
135 | { | |
136 | ||
137 | if (bytes_in != NULL) | |
138 | (*bytes_in) += newbytes; | |
139 | } | |
140 | ||
141 | /* | |
142 | * Read file header and construct the tree. Also, prepare the buffered I/O | |
143 | * for decode routine. | |
144 | * | |
145 | * Return value is uncompressed size. | |
146 | */ | |
147 | static void | |
148 | unpack_parse_header(int in, int out, char *pre, size_t prelen, off_t *bytes_in, | |
149 | unpack_descriptor_t *unpackd) | |
150 | { | |
151 | unsigned char hdr[PACK_HEADER_LENGTH]; /* buffer for header */ | |
152 | ssize_t bytesread; /* Bytes read from the file */ | |
153 | int i, j, thisbyte; | |
154 | ||
155 | /* Prepend the header buffer if we already read some data */ | |
156 | if (prelen != 0) | |
157 | memcpy(hdr, pre, prelen); | |
158 | ||
159 | /* Read in and fill the rest bytes of header */ | |
160 | bytesread = read(in, hdr + prelen, PACK_HEADER_LENGTH - prelen); | |
161 | if (bytesread < 0) | |
162 | maybe_err("Error reading pack header"); | |
163 | ||
164 | accepted_bytes(bytes_in, PACK_HEADER_LENGTH); | |
165 | ||
166 | /* Obtain uncompressed length (bytes 2,3,4,5)*/ | |
167 | unpackd->uncompressed_size = 0; | |
168 | for (i = 2; i <= 5; i++) { | |
169 | unpackd->uncompressed_size <<= 8; | |
170 | unpackd->uncompressed_size |= hdr[i]; | |
171 | } | |
172 | ||
173 | /* Get the levels of the tree */ | |
174 | unpackd->treelevels = hdr[6]; | |
175 | if (unpackd->treelevels > HTREE_MAXLEVEL || unpackd->treelevels < 1) | |
176 | maybe_errx("Huffman tree has insane levels"); | |
177 | ||
178 | /* Let libc take care for buffering from now on */ | |
179 | if ((unpackd->fpIn = fdopen(in, "r")) == NULL) | |
180 | maybe_err("Can not fdopen() input stream"); | |
181 | if ((unpackd->fpOut = fdopen(out, "w")) == NULL) | |
182 | maybe_err("Can not fdopen() output stream"); | |
183 | ||
184 | /* Allocate for the tables of bounds and the tree itself */ | |
185 | unpackd->inodesin = | |
186 | calloc(unpackd->treelevels, sizeof(*(unpackd->inodesin))); | |
187 | unpackd->symbolsin = | |
188 | calloc(unpackd->treelevels, sizeof(*(unpackd->symbolsin))); | |
189 | unpackd->tree = | |
190 | calloc(unpackd->treelevels, (sizeof (*(unpackd->tree)))); | |
191 | if (unpackd->inodesin == NULL || unpackd->symbolsin == NULL || | |
192 | unpackd->tree == NULL) | |
193 | maybe_err("calloc"); | |
194 | ||
195 | /* We count from 0 so adjust to match array upper bound */ | |
196 | unpackd->treelevels--; | |
197 | ||
198 | /* Read the levels symbol count table and calculate total */ | |
199 | unpackd->symbol_size = 1; /* EOB */ | |
200 | for (i = 0; i <= unpackd->treelevels; i++) { | |
201 | if ((thisbyte = fgetc(unpackd->fpIn)) == EOF) | |
202 | maybe_err("File appears to be truncated"); | |
203 | unpackd->symbolsin[i] = (unsigned char)thisbyte; | |
204 | unpackd->symbol_size += unpackd->symbolsin[i]; | |
205 | } | |
206 | accepted_bytes(bytes_in, unpackd->treelevels); | |
207 | if (unpackd->symbol_size > 256) | |
208 | maybe_errx("Bad symbol table"); | |
209 | ||
210 | /* Allocate for the symbol table, point symbol_eob at the beginning */ | |
211 | unpackd->symbol_eob = unpackd->symbol = calloc(1, unpackd->symbol_size); | |
212 | if (unpackd->symbol == NULL) | |
213 | maybe_err("calloc"); | |
214 | ||
215 | /* | |
216 | * Read in the symbol table, which contain [2, 256] symbols. | |
217 | * In order to fit the count in one byte, pack(1) would offset | |
218 | * it by reducing 2 from the actual number from the last level. | |
219 | * | |
220 | * We adjust the last level's symbol count by 1 here, because | |
221 | * the EOB symbol is not being transmitted explicitly. Another | |
222 | * adjustment would be done later afterward. | |
223 | */ | |
224 | unpackd->symbolsin[unpackd->treelevels]++; | |
225 | for (i = 0; i <= unpackd->treelevels; i++) { | |
226 | unpackd->tree[i] = unpackd->symbol_eob; | |
227 | for (j = 0; j < unpackd->symbolsin[i]; j++) { | |
228 | if ((thisbyte = fgetc(unpackd->fpIn)) == EOF) | |
229 | maybe_errx("Symbol table truncated"); | |
230 | *unpackd->symbol_eob++ = (char)thisbyte; | |
231 | } | |
232 | accepted_bytes(bytes_in, unpackd->symbolsin[i]); | |
233 | } | |
234 | ||
235 | /* Now, take account for the EOB symbol as well */ | |
236 | unpackd->symbolsin[unpackd->treelevels]++; | |
237 | ||
238 | /* | |
239 | * The symbolsin table has been constructed now. | |
240 | * Calculate the internal nodes count table based on it. | |
241 | */ | |
242 | unpackd_fill_inodesin(unpackd, 0); | |
243 | } | |
244 | ||
245 | /* | |
246 | * Decode huffman stream, based on the huffman tree. | |
247 | */ | |
248 | static void | |
249 | unpack_decode(const unpack_descriptor_t *unpackd, off_t *bytes_in) | |
250 | { | |
251 | int thislevel, thiscode, thisbyte, inlevelindex; | |
252 | int i; | |
253 | off_t bytes_out = 0; | |
254 | const char *thissymbol; /* The symbol pointer decoded from stream */ | |
255 | ||
256 | /* | |
257 | * Decode huffman. Fetch every bytes from the file, get it | |
258 | * into 'thiscode' bit-by-bit, then output the symbol we got | |
259 | * when one has been found. | |
260 | * | |
261 | * Assumption: sizeof(int) > ((max tree levels + 1) / 8). | |
262 | * bad things could happen if not. | |
263 | */ | |
264 | thislevel = 0; | |
265 | thiscode = thisbyte = 0; | |
266 | ||
267 | while ((thisbyte = fgetc(unpackd->fpIn)) != EOF) { | |
268 | accepted_bytes(bytes_in, 1); | |
269 | ||
270 | /* | |
271 | * Split one bit from thisbyte, from highest to lowest, | |
272 | * feed the bit into thiscode, until we got a symbol from | |
273 | * the tree. | |
274 | */ | |
275 | for (i = 7; i >= 0; i--) { | |
276 | thiscode = (thiscode << 1) | ((thisbyte >> i) & 1); | |
277 | ||
278 | /* Did we got a symbol? (referencing leaf node) */ | |
279 | if (thiscode >= unpackd->inodesin[thislevel]) { | |
280 | inlevelindex = | |
281 | thiscode - unpackd->inodesin[thislevel]; | |
282 | if (inlevelindex > unpackd->symbolsin[thislevel]) | |
283 | maybe_errx("File corrupt"); | |
284 | ||
285 | thissymbol = | |
286 | &(unpackd->tree[thislevel][inlevelindex]); | |
287 | if ((thissymbol == unpackd->symbol_eob) && | |
288 | (bytes_out == unpackd->uncompressed_size)) | |
289 | goto finished; | |
290 | ||
291 | fputc((*thissymbol), unpackd->fpOut); | |
292 | bytes_out++; | |
293 | ||
294 | /* Prepare for next input */ | |
295 | thislevel = 0; thiscode = 0; | |
296 | } else { | |
297 | thislevel++; | |
298 | if (thislevel > unpackd->treelevels) | |
299 | maybe_errx("File corrupt"); | |
300 | } | |
301 | } | |
302 | } | |
303 | ||
304 | finished: | |
305 | if (bytes_out != unpackd->uncompressed_size) | |
306 | maybe_errx("Premature EOF"); | |
307 | } | |
308 | ||
309 | /* Handler for pack(1)'ed file */ | |
310 | static off_t | |
311 | unpack(int in, int out, char *pre, size_t prelen, off_t *bytes_in) | |
312 | { | |
313 | unpack_descriptor_t unpackd; | |
314 | ||
315 | in = dup(in); | |
316 | if (in == -1) | |
317 | maybe_err("dup"); | |
318 | out = dup(out); | |
319 | if (out == -1) | |
320 | maybe_err("dup"); | |
321 | ||
322 | unpack_parse_header(in, out, pre, prelen, bytes_in, &unpackd); | |
323 | unpack_decode(&unpackd, bytes_in); | |
324 | unpack_descriptor_fini(&unpackd); | |
325 | ||
326 | /* If we reached here, the unpack was successful */ | |
327 | return (unpackd.uncompressed_size); | |
328 | } | |
329 |