| 1 | /*--------------------------------------------------------------------------- |
| 2 | |
| 3 | unshrink.c version 1.21 23 Nov 95 |
| 4 | |
| 5 | |
| 6 | NOTE: This code may or may not infringe on the so-called "Welch |
| 7 | patent" owned by Unisys. (From reading the patent, it appears |
| 8 | that a pure LZW decompressor is *not* covered, but this claim has |
| 9 | not been tested in court, and Unisys is reported to believe other- |
| 10 | wise.) It is therefore the responsibility of the user to acquire |
| 11 | whatever license(s) may be required for legal use of this code. |
| 12 | |
| 13 | THE INFO-ZIP GROUP DISCLAIMS ALL LIABILITY FOR USE OF THIS CODE |
| 14 | IN VIOLATION OF APPLICABLE PATENT LAW. |
| 15 | |
| 16 | |
| 17 | Shrinking is basically a dynamic LZW algorithm with allowed code sizes of |
| 18 | up to 13 bits; in addition, there is provision for partial clearing of |
| 19 | leaf nodes. PKWARE uses the special code 256 (decimal) to indicate a |
| 20 | change in code size or a partial clear of the code tree: 256,1 for the |
| 21 | former and 256,2 for the latter. [Note that partial clearing can "orphan" |
| 22 | nodes: the parent-to-be can be cleared before its new child is added, |
| 23 | but the child is added anyway (as an orphan, as though the parent still |
| 24 | existed). When the tree fills up to the point where the parent node is |
| 25 | reused, the orphan is effectively "adopted." Versions prior to 1.05 were |
| 26 | affected more due to greater use of pointers (to children and siblings |
| 27 | as well as parents).] |
| 28 | |
| 29 | This replacement version of unshrink.c was written from scratch. It is |
| 30 | based only on the algorithms described in Mark Nelson's _The Data Compres- |
| 31 | sion Book_ and in Terry Welch's original paper in the June 1984 issue of |
| 32 | IEEE _Computer_; no existing source code, including any in Nelson's book, |
| 33 | was used. |
| 34 | |
| 35 | Memory requirements have been reduced in this version and are now no more |
| 36 | than the original Sam Smith code. This is still larger than any of the |
| 37 | other algorithms: at a minimum, 8K+8K+16K (stack+values+parents) assuming |
| 38 | 16-bit short ints, and this does not even include the output buffer (the |
| 39 | other algorithms leave the uncompressed data in the work area, typically |
| 40 | called slide[]). For machines with a 64KB data space this is a problem, |
| 41 | particularly when text conversion is required and line endings have more |
| 42 | than one character. UnZip's solution is to use two roughly equal halves |
| 43 | of outbuf for the ASCII conversion in such a case; the "unshrink" argument |
| 44 | to flush() signals that this is the case. |
| 45 | |
| 46 | For large-memory machines, a second outbuf is allocated for translations, |
| 47 | but only if unshrinking and only if translations are required. |
| 48 | |
| 49 | | binary mode | text mode |
| 50 | --------------------------------------------------- |
| 51 | big mem | big outbuf | big outbuf + big outbuf2 <- malloc'd here |
| 52 | small mem | small outbuf | half + half small outbuf |
| 53 | |
| 54 | Copyright 1994, 1995 Greg Roelofs. See the accompanying file "COPYING" |
| 55 | in UnZip 5.20 (or later) source or binary distributions. |
| 56 | |
| 57 | ---------------------------------------------------------------------------*/ |
| 58 | |
| 59 | |
| 60 | #define UNZIP_INTERNAL |
| 61 | #include "unzip.h" /* defines LZW_CLEAN by default */ |
| 62 | |
| 63 | |
| 64 | #ifndef LZW_CLEAN |
| 65 | |
| 66 | static void partial_clear OF((__GPRO)); |
| 67 | |
| 68 | #ifdef DEBUG |
| 69 | # define OUTDBG(c) \ |
| 70 | if ((c)<32 || (c)>=127) pipeit("\\x%02x",(c)); else { } |
| 71 | #else |
| 72 | # define OUTDBG(c) |
| 73 | #endif |
| 74 | |
| 75 | /* HSIZE is defined as 2^13 (8192) in unzip.h */ |
| 76 | #define BOGUSCODE 256 |
| 77 | #define FLAG_BITS parent /* upper bits of parent[] used as flag bits */ |
| 78 | #define CODE_MASK (HSIZE - 1) /* 0x1fff (lower bits are parent's index) */ |
| 79 | #define FREE_CODE HSIZE /* 0x2000 (code is unused or was cleared) */ |
| 80 | #define HAS_CHILD (HSIZE << 1) /* 0x4000 (code has a child--do not clear) */ |
| 81 | |
| 82 | #define parent G.area.shrink.Parent |
| 83 | #define Value G.area.shrink.value /* "value" conflicts with Pyramid ioctl.h */ |
| 84 | #define stack G.area.shrink.Stack |
| 85 | |
| 86 | |
| 87 | /***********************/ |
| 88 | /* Function unshrink() */ |
| 89 | /***********************/ |
| 90 | |
| 91 | int unshrink(__G) |
| 92 | __GDEF |
| 93 | { |
| 94 | int offset = (HSIZE - 1); |
| 95 | uch *stacktop = stack + offset; |
| 96 | register uch *newstr; |
| 97 | int codesize=9, len, KwKwK, error; |
| 98 | shrint code, oldcode, freecode, curcode; |
| 99 | shrint lastfreecode; |
| 100 | unsigned int outbufsiz; |
| 101 | #if (defined(DLL) && !defined(NO_SLIDE_REDIR)) |
| 102 | /* Normally realbuf and outbuf will be the same. However, if the data |
| 103 | * are redirected to a large memory buffer, realbuf will point to the |
| 104 | * new location while outbuf will remain pointing to the malloc'd |
| 105 | * memory buffer. */ |
| 106 | uch *realbuf = G.outbuf; |
| 107 | #else |
| 108 | # define realbuf G.outbuf |
| 109 | #endif |
| 110 | |
| 111 | |
| 112 | /*--------------------------------------------------------------------------- |
| 113 | Initialize various variables. |
| 114 | ---------------------------------------------------------------------------*/ |
| 115 | |
| 116 | lastfreecode = BOGUSCODE; |
| 117 | |
| 118 | #ifndef VMS /* VMS uses its own buffer scheme for textmode flush(). */ |
| 119 | #ifndef SMALL_MEM |
| 120 | /* non-memory-limited machines: allocate second (large) buffer for |
| 121 | * textmode conversion in flush(), but only if needed */ |
| 122 | if (G.pInfo->textmode && !G.outbuf2 && |
| 123 | (G.outbuf2 = (uch *)malloc(TRANSBUFSIZ)) == (uch *)NULL) |
| 124 | return PK_MEM3; |
| 125 | #endif |
| 126 | #endif /* !VMS */ |
| 127 | |
| 128 | for (code = 0; code < BOGUSCODE; ++code) { |
| 129 | Value[code] = (uch)code; |
| 130 | parent[code] = BOGUSCODE; |
| 131 | } |
| 132 | for (code = BOGUSCODE+1; code < HSIZE; ++code) |
| 133 | parent[code] = FREE_CODE; |
| 134 | |
| 135 | #if (defined(DLL) && !defined(NO_SLIDE_REDIR)) |
| 136 | if (G.redirect_slide) { /* use normal outbuf unless we're a DLL routine */ |
| 137 | realbuf = G.redirect_buffer; |
| 138 | outbufsiz = G.redirect_size; |
| 139 | } else |
| 140 | #endif |
| 141 | #ifdef DLL |
| 142 | if (G.pInfo->textmode && !G.redirect_data) |
| 143 | #else |
| 144 | if (G.pInfo->textmode) |
| 145 | #endif |
| 146 | outbufsiz = RAWBUFSIZ; |
| 147 | else |
| 148 | outbufsiz = OUTBUFSIZ; |
| 149 | G.outptr = realbuf; |
| 150 | G.outcnt = 0L; |
| 151 | |
| 152 | /*--------------------------------------------------------------------------- |
| 153 | Get and output first code, then loop over remaining ones. |
| 154 | ---------------------------------------------------------------------------*/ |
| 155 | |
| 156 | READBITS(codesize, oldcode) |
| 157 | if (!G.zipeof) { |
| 158 | *G.outptr++ = (uch)oldcode; |
| 159 | OUTDBG((uch)oldcode) |
| 160 | ++G.outcnt; |
| 161 | } |
| 162 | |
| 163 | do { |
| 164 | READBITS(codesize, code) |
| 165 | if (G.zipeof) |
| 166 | break; |
| 167 | if (code == BOGUSCODE) { /* possible to have consecutive escapes? */ |
| 168 | READBITS(codesize, code) |
| 169 | if (code == 1) { |
| 170 | ++codesize; |
| 171 | Trace((stderr, " (codesize now %d bits)\n", codesize)); |
| 172 | } else if (code == 2) { |
| 173 | Trace((stderr, " (partial clear code)\n")); |
| 174 | partial_clear(__G); /* clear leafs (nodes with no children) */ |
| 175 | Trace((stderr, " (done with partial clear)\n")); |
| 176 | lastfreecode = BOGUSCODE; /* reset start of free-node search */ |
| 177 | } |
| 178 | continue; |
| 179 | } |
| 180 | |
| 181 | /*----------------------------------------------------------------------- |
| 182 | Translate code: traverse tree from leaf back to root. |
| 183 | -----------------------------------------------------------------------*/ |
| 184 | |
| 185 | newstr = stacktop; |
| 186 | curcode = code; |
| 187 | |
| 188 | if (parent[curcode] == FREE_CODE) { |
| 189 | /* or (FLAG_BITS[curcode] & FREE_CODE)? */ |
| 190 | KwKwK = TRUE; |
| 191 | Trace((stderr, " (found a KwKwK code %d; oldcode = %d)\n", code, |
| 192 | oldcode)); |
| 193 | --newstr; /* last character will be same as first character */ |
| 194 | curcode = oldcode; |
| 195 | } else |
| 196 | KwKwK = FALSE; |
| 197 | |
| 198 | do { |
| 199 | *newstr-- = Value[curcode]; |
| 200 | curcode = (shrint)(parent[curcode] & CODE_MASK); |
| 201 | } while (curcode != BOGUSCODE); |
| 202 | |
| 203 | len = (int)(stacktop - newstr++); |
| 204 | if (KwKwK) |
| 205 | *stacktop = *newstr; |
| 206 | |
| 207 | /*----------------------------------------------------------------------- |
| 208 | Write expanded string in reverse order to output buffer. |
| 209 | -----------------------------------------------------------------------*/ |
| 210 | |
| 211 | Trace((stderr, "code %4d; oldcode %4d; char %3d (%c); string [", code, |
| 212 | oldcode, (int)(*newstr), (*newstr<32 || *newstr>=127)? ' ':*newstr)); |
| 213 | |
| 214 | { |
| 215 | register uch *p; |
| 216 | |
| 217 | for (p = newstr; p < newstr+len; ++p) { |
| 218 | *G.outptr++ = *p; |
| 219 | OUTDBG(*p) |
| 220 | if (++G.outcnt == outbufsiz) { |
| 221 | Trace((stderr, "doing flush(), outcnt = %lu\n", G.outcnt)); |
| 222 | if ((error = flush(__G__ realbuf, G.outcnt, TRUE)) != 0) |
| 223 | pipeit("unshrink: flush() error (%d)\n", |
| 224 | error); |
| 225 | Trace((stderr, "done with flush()\n")); |
| 226 | G.outptr = realbuf; |
| 227 | G.outcnt = 0L; |
| 228 | } |
| 229 | } |
| 230 | } |
| 231 | |
| 232 | /*----------------------------------------------------------------------- |
| 233 | Add new leaf (first character of newstr) to tree as child of oldcode. |
| 234 | -----------------------------------------------------------------------*/ |
| 235 | |
| 236 | /* search for freecode */ |
| 237 | freecode = (shrint)(lastfreecode + 1); |
| 238 | /* add if-test before loop for speed? */ |
| 239 | while (parent[freecode] != FREE_CODE) |
| 240 | ++freecode; |
| 241 | lastfreecode = freecode; |
| 242 | Trace((stderr, "]; newcode %d\n", freecode)); |
| 243 | |
| 244 | Value[freecode] = *newstr; |
| 245 | parent[freecode] = oldcode; |
| 246 | oldcode = code; |
| 247 | |
| 248 | } while (!G.zipeof); |
| 249 | |
| 250 | /*--------------------------------------------------------------------------- |
| 251 | Flush any remaining data and return to sender... |
| 252 | ---------------------------------------------------------------------------*/ |
| 253 | |
| 254 | if (G.outcnt > 0L) { |
| 255 | Trace((stderr, "doing final flush(), outcnt = %lu\n", G.outcnt)); |
| 256 | if ((error = flush(__G__ realbuf, G.outcnt, TRUE)) != 0) |
| 257 | pipeit("unshrink: flush() error (%d)\n", error); |
| 258 | Trace((stderr, "done with flush()\n")); |
| 259 | } |
| 260 | |
| 261 | return PK_OK; |
| 262 | |
| 263 | } /* end function unshrink() */ |
| 264 | |
| 265 | |
| 266 | |
| 267 | |
| 268 | |
| 269 | /****************************/ |
| 270 | /* Function partial_clear() */ /* no longer recursive... */ |
| 271 | /****************************/ |
| 272 | |
| 273 | static void partial_clear(__G) |
| 274 | __GDEF |
| 275 | { |
| 276 | register shrint code; |
| 277 | |
| 278 | /* clear all nodes which have no children (i.e., leaf nodes only) */ |
| 279 | |
| 280 | /* first loop: mark each parent as such */ |
| 281 | for (code = BOGUSCODE+1; code < HSIZE; ++code) { |
| 282 | register shrint cparent = (shrint)(parent[code] & CODE_MASK); |
| 283 | |
| 284 | if (cparent > BOGUSCODE && cparent != FREE_CODE) |
| 285 | FLAG_BITS[cparent] |= HAS_CHILD; /* set parent's child-bit */ |
| 286 | } |
| 287 | |
| 288 | /* second loop: clear all nodes *not* marked as parents; reset flag bits */ |
| 289 | for (code = BOGUSCODE+1; code < HSIZE; ++code) { |
| 290 | if (FLAG_BITS[code] & HAS_CHILD) /* just clear child-bit */ |
| 291 | FLAG_BITS[code] &= ~HAS_CHILD; |
| 292 | else { /* leaf: lose it */ |
| 293 | Trace((stderr, "%d\n", code)); |
| 294 | parent[code] = FREE_CODE; |
| 295 | } |
| 296 | } |
| 297 | |
| 298 | return; |
| 299 | } |
| 300 | |
| 301 | #endif /* !LZW_CLEAN */ |