]>
Commit | Line | Data |
---|---|---|
44bd5ea7 | 1 | /* |
e407a76b | 2 | * Copyright (c) 1995 Wolfram Schneider <wosch@FreeBSD.org>. Berlin. |
44bd5ea7 A |
3 | * Copyright (c) 1989, 1993 |
4 | * The Regents of the University of California. All rights reserved. | |
5 | * | |
6 | * This code is derived from software contributed to Berkeley by | |
7 | * James A. Woods. | |
8 | * | |
9 | * Redistribution and use in source and binary forms, with or without | |
10 | * modification, are permitted provided that the following conditions | |
11 | * are met: | |
12 | * 1. Redistributions of source code must retain the above copyright | |
13 | * notice, this list of conditions and the following disclaimer. | |
14 | * 2. Redistributions in binary form must reproduce the above copyright | |
15 | * notice, this list of conditions and the following disclaimer in the | |
16 | * documentation and/or other materials provided with the distribution. | |
17 | * 3. All advertising materials mentioning features or use of this software | |
18 | * must display the following acknowledgement: | |
19 | * This product includes software developed by the University of | |
20 | * California, Berkeley and its contributors. | |
21 | * 4. Neither the name of the University nor the names of its contributors | |
22 | * may be used to endorse or promote products derived from this software | |
23 | * without specific prior written permission. | |
24 | * | |
25 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
26 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
27 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
28 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
29 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
30 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
31 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
32 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
33 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
34 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
35 | * SUCH DAMAGE. | |
e407a76b A |
36 | * |
37 | * $FreeBSD: src/usr.bin/locate/code/locate.code.c,v 1.13 2002/03/22 01:22:47 imp Exp $ | |
44bd5ea7 A |
38 | */ |
39 | ||
44bd5ea7 | 40 | #ifndef lint |
e407a76b A |
41 | static char copyright[] = |
42 | "@(#) Copyright (c) 1989, 1993\n\ | |
43 | The Regents of the University of California. All rights reserved.\n"; | |
44bd5ea7 A |
44 | #endif /* not lint */ |
45 | ||
46 | #ifndef lint | |
e407a76b | 47 | static char sccsid[] = "@(#)locate.code.c 8.1 (Berkeley) 6/6/93"; |
44bd5ea7 A |
48 | #endif /* not lint */ |
49 | ||
50 | /* | |
51 | * PURPOSE: sorted list compressor (works with a modified 'find' | |
52 | * to encode/decode a filename database) | |
53 | * | |
54 | * USAGE: bigram < list > bigrams | |
55 | * process bigrams (see updatedb) > common_bigrams | |
56 | * code common_bigrams < list > squozen_list | |
57 | * | |
58 | * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 | |
59 | * February/March 1983, p. 8). Output format is, per line, an | |
60 | * offset differential count byte followed by a partially bigram- | |
61 | * encoded ascii residue. A bigram is a two-character sequence, | |
62 | * the first 128 most common of which are encoded in one byte. | |
63 | * | |
64 | * EXAMPLE: For simple front compression with no bigram encoding, | |
65 | * if the input is... then the output is... | |
66 | * | |
67 | * /usr/src 0 /usr/src | |
68 | * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c | |
69 | * /usr/src/cmd/armadillo.c 14 armadillo.c | |
70 | * /usr/tmp/zoo 5 tmp/zoo | |
71 | * | |
72 | * The codes are: | |
73 | * | |
74 | * 0-28 likeliest differential counts + offset to make nonnegative | |
75 | * 30 switch code for out-of-range count to follow in next word | |
e407a76b | 76 | * 31 an 8 bit char followed |
44bd5ea7 A |
77 | * 128-255 bigram codes (128 most common, as determined by 'updatedb') |
78 | * 32-127 single character (printable) ascii residue (ie, literal) | |
79 | * | |
e407a76b A |
80 | * The locate database store any character except newline ('\n') |
81 | * and NUL ('\0'). The 8-bit character support don't wast extra | |
82 | * space until you have characters in file names less than 32 | |
83 | * or greather than 127. | |
84 | * | |
85 | * | |
86 | * SEE ALSO: updatedb.sh, ../bigram/locate.bigram.c | |
44bd5ea7 A |
87 | * |
88 | * AUTHOR: James A. Woods, Informatics General Corp., | |
89 | * NASA Ames Research Center, 10/82 | |
e407a76b A |
90 | * 8-bit file names characters: |
91 | * Wolfram Schneider, Berlin September 1996 | |
44bd5ea7 A |
92 | */ |
93 | ||
94 | #include <sys/param.h> | |
44bd5ea7 A |
95 | #include <err.h> |
96 | #include <errno.h> | |
44bd5ea7 A |
97 | #include <stdlib.h> |
98 | #include <string.h> | |
e407a76b | 99 | #include <stdio.h> |
44bd5ea7 | 100 | #include <unistd.h> |
44bd5ea7 A |
101 | #include "locate.h" |
102 | ||
103 | #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ | |
104 | ||
e407a76b A |
105 | u_char buf1[MAXPATHLEN] = " "; |
106 | u_char buf2[MAXPATHLEN]; | |
107 | u_char bigrams[BGBUFSIZE + 1] = { 0 }; | |
108 | ||
109 | #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */ | |
44bd5ea7 | 110 | |
e407a76b A |
111 | #ifdef LOOKUP |
112 | #define BGINDEX(x) (big[(u_char)*x][(u_char)*(x + 1)]) | |
113 | typedef short bg_t; | |
114 | bg_t big[UCHAR_MAX + 1][UCHAR_MAX + 1]; | |
115 | #else | |
116 | #define BGINDEX(x) bgindex(x) | |
117 | typedef int bg_t; | |
118 | int bgindex(char *); | |
119 | #endif /* LOOKUP */ | |
120 | ||
121 | ||
122 | void usage(void); | |
44bd5ea7 A |
123 | |
124 | int | |
125 | main(argc, argv) | |
126 | int argc; | |
127 | char *argv[]; | |
128 | { | |
e407a76b | 129 | register u_char *cp, *oldpath, *path; |
44bd5ea7 A |
130 | int ch, code, count, diffcount, oldcount; |
131 | FILE *fp; | |
e407a76b | 132 | register int i, j; |
44bd5ea7 A |
133 | |
134 | while ((ch = getopt(argc, argv, "")) != -1) | |
135 | switch(ch) { | |
44bd5ea7 A |
136 | default: |
137 | usage(); | |
138 | } | |
139 | argc -= optind; | |
140 | argv += optind; | |
141 | ||
142 | if (argc != 1) | |
143 | usage(); | |
144 | ||
145 | if ((fp = fopen(argv[0], "r")) == NULL) | |
146 | err(1, "%s", argv[0]); | |
147 | ||
148 | /* First copy bigram array to stdout. */ | |
149 | (void)fgets(bigrams, BGBUFSIZE + 1, fp); | |
e407a76b | 150 | |
44bd5ea7 A |
151 | if (fwrite(bigrams, 1, BGBUFSIZE, stdout) != BGBUFSIZE) |
152 | err(1, "stdout"); | |
153 | (void)fclose(fp); | |
154 | ||
e407a76b A |
155 | #ifdef LOOKUP |
156 | /* init lookup table */ | |
157 | for (i = 0; i < UCHAR_MAX + 1; i++) | |
158 | for (j = 0; j < UCHAR_MAX + 1; j++) | |
159 | big[i][j] = (bg_t)-1; | |
160 | ||
161 | for (cp = bigrams, i = 0; *cp != '\0'; i += 2, cp += 2) | |
162 | big[(u_char)*cp][(u_char)*(cp + 1)] = (bg_t)i; | |
163 | ||
164 | #endif /* LOOKUP */ | |
165 | ||
44bd5ea7 A |
166 | oldpath = buf1; |
167 | path = buf2; | |
168 | oldcount = 0; | |
e407a76b | 169 | |
44bd5ea7 | 170 | while (fgets(path, sizeof(buf2), stdin) != NULL) { |
44bd5ea7 | 171 | |
e407a76b A |
172 | /* skip empty lines */ |
173 | if (*path == '\n') | |
174 | continue; | |
175 | ||
176 | /* remove newline */ | |
44bd5ea7 | 177 | for (cp = path; *cp != '\0'; cp++) { |
e407a76b A |
178 | #ifndef LOCATE_CHAR30 |
179 | /* old locate implementations core'd for char 30 */ | |
180 | if (*cp == SWITCH) | |
44bd5ea7 | 181 | *cp = '?'; |
e407a76b A |
182 | else |
183 | #endif /* !LOCATE_CHAR30 */ | |
184 | ||
185 | /* chop newline */ | |
186 | if (*cp == '\n') | |
187 | *cp = '\0'; | |
44bd5ea7 A |
188 | } |
189 | ||
190 | /* Skip longest common prefix. */ | |
191 | for (cp = path; *cp == *oldpath; cp++, oldpath++) | |
e407a76b | 192 | if (*cp == '\0') |
44bd5ea7 | 193 | break; |
e407a76b | 194 | |
44bd5ea7 A |
195 | count = cp - path; |
196 | diffcount = count - oldcount + OFFSET; | |
197 | oldcount = count; | |
198 | if (diffcount < 0 || diffcount > 2 * OFFSET) { | |
199 | if (putchar(SWITCH) == EOF || | |
200 | putw(diffcount, stdout) == EOF) | |
201 | err(1, "stdout"); | |
202 | } else | |
203 | if (putchar(diffcount) == EOF) | |
204 | err(1, "stdout"); | |
205 | ||
206 | while (*cp != '\0') { | |
e407a76b A |
207 | /* print *two* characters */ |
208 | ||
209 | if ((code = BGINDEX(cp)) != (bg_t)-1) { | |
210 | /* | |
211 | * print *one* as bigram | |
212 | * Found, so mark byte with | |
213 | * parity bit. | |
214 | */ | |
44bd5ea7 A |
215 | if (putchar((code / 2) | PARITY) == EOF) |
216 | err(1, "stdout"); | |
217 | cp += 2; | |
218 | } | |
e407a76b A |
219 | |
220 | else { | |
221 | for (i = 0; i < 2; i++) { | |
222 | if (*cp == '\0') | |
223 | break; | |
224 | ||
225 | /* print umlauts in file names */ | |
226 | if (*cp < ASCII_MIN || | |
227 | *cp > ASCII_MAX) { | |
228 | if (putchar(UMLAUT) == EOF || | |
229 | putchar(*cp++) == EOF) | |
230 | err(1, "stdout"); | |
231 | } | |
232 | ||
233 | else { | |
234 | /* normal character */ | |
235 | if(putchar(*cp++) == EOF) | |
236 | err(1, "stdout"); | |
237 | } | |
238 | } | |
239 | ||
240 | } | |
44bd5ea7 | 241 | } |
e407a76b | 242 | |
44bd5ea7 A |
243 | if (path == buf1) { /* swap pointers */ |
244 | path = buf2; | |
245 | oldpath = buf1; | |
246 | } else { | |
247 | path = buf1; | |
248 | oldpath = buf2; | |
249 | } | |
250 | } | |
251 | /* Non-zero status if there were errors */ | |
252 | if (fflush(stdout) != 0 || ferror(stdout)) | |
253 | exit(1); | |
254 | exit(0); | |
255 | } | |
256 | ||
e407a76b | 257 | #ifndef LOOKUP |
44bd5ea7 A |
258 | int |
259 | bgindex(bg) /* Return location of bg in bigrams or -1. */ | |
260 | char *bg; | |
261 | { | |
e407a76b | 262 | register char bg0, bg1, *p; |
44bd5ea7 A |
263 | |
264 | bg0 = bg[0]; | |
265 | bg1 = bg[1]; | |
e407a76b | 266 | for (p = bigrams; *p != NULL; p++) |
44bd5ea7 A |
267 | if (*p++ == bg0 && *p == bg1) |
268 | break; | |
e407a76b | 269 | return (*p == NULL ? -1 : (--p - bigrams)); |
44bd5ea7 | 270 | } |
e407a76b | 271 | #endif /* !LOOKUP */ |
44bd5ea7 A |
272 | |
273 | void | |
274 | usage() | |
275 | { | |
276 | (void)fprintf(stderr, | |
277 | "usage: locate.code common_bigrams < list > squozen_list\n"); | |
278 | exit(1); | |
279 | } |