]>
Commit | Line | Data |
---|---|---|
8414a40c VZ |
1 | |
2 | #include "tif_config.h" | |
3 | ||
4 | #include <stdio.h> | |
5 | #include <stdlib.h> | |
6 | #include <string.h> | |
8414a40c VZ |
7 | #include <ctype.h> |
8 | ||
9 | #ifdef HAVE_STRINGS_H | |
10 | # include <strings.h> | |
11 | #endif | |
12 | ||
13 | #ifdef HAVE_IO_H | |
14 | # include <io.h> | |
15 | #endif | |
16 | ||
17 | #ifdef HAVE_FCNTL_H | |
18 | # include <fcntl.h> | |
19 | #endif | |
20 | ||
/* Bounded, case-insensitive string comparison under one name per platform. */
#if defined(WIN32)
#  define STRNICMP strnicmp
#else
#  define STRNICMP strncasecmp
#endif
26 | ||
/* Pairs a numeric id with the human readable label printed for it. */
typedef struct _tag_spec
{
  short id;    /* number matched against the record number read by formatIPTC */
  char *name;  /* label written between the numbers and the '=' sign */
} tag_spec;

/* Table of the ids that have a label; ids not listed are printed bare. */
static tag_spec tags[] = {
  {   5, "Image Name" },
  {   7, "Edit Status" },
  {  10, "Priority" },
  {  15, "Category" },
  {  20, "Supplemental Category" },
  {  22, "Fixture Identifier" },
  {  25, "Keyword" },
  {  30, "Release Date" },
  {  35, "Release Time" },
  {  40, "Special Instructions" },
  {  45, "Reference Service" },
  {  47, "Reference Date" },
  {  50, "Reference Number" },
  {  55, "Created Date" },
  {  60, "Created Time" },
  {  65, "Originating Program" },
  {  70, "Program Version" },
  {  75, "Object Cycle" },
  {  80, "Byline" },
  {  85, "Byline Title" },
  {  90, "City" },
  {  95, "Province State" },
  { 100, "Country Code" },
  { 101, "Country" },
  { 103, "Original Transmission Reference" },
  { 105, "Headline" },
  { 110, "Credit" },
  { 115, "Source" },
  { 116, "Copyright String" },
  { 120, "Caption" },
  { 121, "Local Caption" },
  { 122, "Caption Writer" },
  { 200, "Custom Field 1" },
  { 201, "Custom Field 2" },
  { 202, "Custom Field 3" },
  { 203, "Custom Field 4" },
  { 204, "Custom Field 5" },
  { 205, "Custom Field 6" },
  { 206, "Custom Field 7" },
  { 207, "Custom Field 8" },
  { 208, "Custom Field 9" },
  { 209, "Custom Field 10" },
  { 210, "Custom Field 11" },
  { 211, "Custom Field 12" },
  { 212, "Custom Field 13" },
  { 213, "Custom Field 14" },
  { 214, "Custom Field 15" },
  { 215, "Custom Field 16" },
  { 216, "Custom Field 17" },
  { 217, "Custom Field 18" },
  { 218, "Custom Field 19" },
  { 219, "Custom Field 20" }
};
90 | ||
/*
 * Write the len bytes starting at s to ofile as one double-quoted line.
 *
 * We format the output using HTML conventions to preserve control
 * characters and such:
 *
 *   '&'                -> "&amp;"
 *   '"'                -> "&quot;"
 *   '<' and '>'        -> "&lt;" and "&gt;" (only when HANDLE_GT_LT is defined)
 *   control characters -> "&#N;", N being the decimal value of the byte
 *
 * Every other byte is copied through unchanged.  The closing quote is
 * followed by a newline.  Nothing is written for the data when len <= 0.
 */
void formatString(FILE *ofile, const char *s, int len)
{
  putc('"', ofile);
  for (; len > 0; --len, ++s)
    {
      /* The <ctype.h> functions are only defined for unsigned char values
         (and EOF), so never hand them a possibly negative plain char. */
      int c = (unsigned char) *s;

      switch (c)
        {
        case '&':
          fputs("&amp;", ofile);
          break;
#ifdef HANDLE_GT_LT
        case '<':
          fputs("&lt;", ofile);
          break;
        case '>':
          fputs("&gt;", ofile);
          break;
#endif
        case '"':
          fputs("&quot;", ofile);
          break;
        default:
          if (iscntrl(c))
            fprintf(ofile, "&#%d;", c);
          else
            putc(c, ofile);
          break;
        }
    }
  fputs("\"\n", ofile);
}
125 | ||
/* One named HTML entity and the single character it stands for. */
typedef struct _html_code
{
  short len;         /* number of characters in code */
  const char *code;  /* entity text, from the '&' through the ';' */
  const char val;    /* character the entity is converted back to */
} html_code;

/*
 * Named entities recognised by convertHTMLcodes.  Each len must equal
 * strlen() of its code, because that many characters are compared and
 * then collapsed into one.
 */
static html_code html_codes[] = {
#ifdef HANDLE_GT_LT
  { 4, "&lt;", '<' },
  { 4, "&gt;", '>' },
#endif
  { 5, "&amp;", '&' },
  { 6, "&quot;", '"' }
};
143 | ||
144 | /* | |
145 | * This routine converts HTML escape sequence | |
146 | * back to the original ASCII representation. | |
147 | * - returns the number of characters dropped. | |
148 | */ | |
149 | int convertHTMLcodes(char *s, int len) | |
150 | { | |
151 | if (len <=0 || s==(char*)NULL || *s=='\0') | |
152 | return 0; | |
153 | ||
154 | if (s[1] == '#') | |
155 | { | |
156 | int val, o; | |
157 | ||
158 | if (sscanf(s,"&#%d;",&val) == 1) | |
159 | { | |
160 | o = 3; | |
161 | while (s[o] != ';') | |
162 | { | |
163 | o++; | |
164 | if (o > 5) | |
165 | break; | |
166 | } | |
167 | if (o < 5) | |
168 | strcpy(s+1, s+1+o); | |
169 | *s = val; | |
170 | return o; | |
171 | } | |
172 | } | |
173 | else | |
174 | { | |
175 | int | |
176 | i, | |
177 | codes = sizeof(html_codes) / sizeof(html_code); | |
178 | ||
179 | for (i=0; i < codes; i++) | |
180 | { | |
181 | if (html_codes[i].len <= len) | |
182 | if (STRNICMP(s, html_codes[i].code, html_codes[i].len) == 0) | |
183 | { | |
184 | strcpy(s+1, s+html_codes[i].len); | |
185 | *s = html_codes[i].val; | |
186 | return html_codes[i].len-1; | |
187 | } | |
188 | } | |
189 | } | |
190 | ||
191 | return 0; | |
192 | } | |
193 | ||
194 | int formatIPTC(FILE *ifile, FILE *ofile) | |
195 | { | |
196 | unsigned int | |
197 | foundiptc, | |
198 | tagsfound; | |
199 | ||
200 | unsigned char | |
201 | recnum, | |
202 | dataset; | |
203 | ||
204 | char | |
205 | *readable, | |
206 | *str; | |
207 | ||
208 | long | |
209 | tagindx, | |
210 | taglen; | |
211 | ||
212 | int | |
213 | i, | |
214 | tagcount = sizeof(tags) / sizeof(tag_spec); | |
215 | ||
216 | char | |
217 | c; | |
218 | ||
219 | foundiptc = 0; /* found the IPTC-Header */ | |
220 | tagsfound = 0; /* number of tags found */ | |
221 | ||
222 | c = getc(ifile); | |
223 | while (c != EOF) | |
224 | { | |
225 | if (c == 0x1c) | |
226 | foundiptc = 1; | |
227 | else | |
228 | { | |
229 | if (foundiptc) | |
230 | return -1; | |
231 | else | |
232 | continue; | |
233 | } | |
234 | ||
235 | /* we found the 0x1c tag and now grab the dataset and record number tags */ | |
236 | dataset = getc(ifile); | |
237 | if ((char) dataset == EOF) | |
238 | return -1; | |
239 | recnum = getc(ifile); | |
240 | if ((char) recnum == EOF) | |
241 | return -1; | |
242 | /* try to match this record to one of the ones in our named table */ | |
243 | for (i=0; i< tagcount; i++) | |
244 | { | |
245 | if (tags[i].id == recnum) | |
246 | break; | |
247 | } | |
248 | if (i < tagcount) | |
249 | readable = tags[i].name; | |
250 | else | |
251 | readable = ""; | |
252 | ||
253 | /* then we decode the length of the block that follows - long or short fmt */ | |
254 | c = getc(ifile); | |
255 | if (c == EOF) | |
256 | return 0; | |
257 | if (c & (unsigned char) 0x80) | |
258 | { | |
259 | unsigned char | |
260 | buffer[4]; | |
261 | ||
262 | for (i=0; i<4; i++) | |
263 | { | |
264 | c = buffer[i] = getc(ifile); | |
265 | if (c == EOF) | |
266 | return -1; | |
267 | } | |
268 | taglen = (((long) buffer[ 0 ]) << 24) | | |
269 | (((long) buffer[ 1 ]) << 16) | | |
270 | (((long) buffer[ 2 ]) << 8) | | |
271 | (((long) buffer[ 3 ])); | |
272 | } | |
273 | else | |
274 | { | |
275 | unsigned char | |
276 | x = c; | |
277 | ||
278 | taglen = ((long) x) << 8; | |
279 | x = getc(ifile); | |
280 | if ((char)x == EOF) | |
281 | return -1; | |
282 | taglen |= (long) x; | |
283 | } | |
284 | /* make a buffer to hold the tag data and snag it from the input stream */ | |
285 | str = (char *) malloc((unsigned int) (taglen+1)); | |
286 | if (str == (char *) NULL) | |
287 | { | |
288 | printf("Memory allocation failed"); | |
289 | return 0; | |
290 | } | |
291 | for (tagindx=0; tagindx<taglen; tagindx++) | |
292 | { | |
293 | c = str[tagindx] = getc(ifile); | |
294 | if (c == EOF) | |
80ed523f VZ |
295 | { |
296 | free(str); | |
297 | return -1; | |
298 | } | |
8414a40c VZ |
299 | } |
300 | str[ taglen ] = 0; | |
301 | ||
302 | /* now finish up by formatting this binary data into ASCII equivalent */ | |
303 | if (strlen(readable) > 0) | |
304 | fprintf(ofile, "%d#%d#%s=",(unsigned int)dataset, (unsigned int) recnum, readable); | |
305 | else | |
306 | fprintf(ofile, "%d#%d=",(unsigned int)dataset, (unsigned int) recnum); | |
307 | formatString( ofile, str, taglen ); | |
308 | free(str); | |
309 | ||
310 | tagsfound++; | |
311 | ||
312 | c = getc(ifile); | |
313 | } | |
314 | return tagsfound; | |
315 | } | |
316 | ||
/* Forward declaration: main() calls tokenizer() before its definition. */
int tokenizer(unsigned inflag, char *token, int tokmax, char *line,
              char *white, char *brkchar, char *quote, char eschar,
              char *brkused, int *next, char *quoted);
320 | ||
/*
 * Read one line of any length from file into the heap buffer b, growing
 * the buffer with realloc() as needed.
 *
 *   b     buffer obtained from malloc()/realloc(); this function takes
 *         ownership of it and the caller must continue with the returned
 *         pointer only.
 *   blen  in: number of bytes usable at b.
 *         out: length of the line plus one for the terminator, or 0 when
 *         NULL is returned.
 *   file  stream to read from.
 *
 * The newline is consumed but not stored.  A blank line yields an empty
 * string, so it no longer looks like the end of the input.
 *
 * Returns the (possibly moved) buffer holding the NUL-terminated line, or
 * NULL at end of file or when memory runs out.  The buffer has been freed
 * whenever NULL is returned.
 */
char *super_fgets(char *b, int *blen, FILE *file)
{
  size_t
    cap,
    used = 0;

  char
    *grown;

  int
    c;

  if (b == (char *) NULL)
    {
      *blen = 0;
      return (char *) NULL;
    }

  /* Always keep room for at least one character plus the terminator. */
  cap = (*blen > 0) ? (size_t) *blen : 0;
  if (cap < 2)
    {
      grown = (char *) realloc(b, 2);
      if (grown == (char *) NULL)
        {
          free(b);
          *blen = 0;
          return (char *) NULL;
        }
      b = grown;
      cap = 2;
    }

  for (;;)
    {
      c = fgetc(file);
      if (c == EOF || c == '\n')
        break;
      if (used + 1 >= cap)
        {
          /* Double the capacity; refuse sizes that no longer fit an int.
             Keep the old pointer until realloc() has succeeded. */
          grown = (char *) NULL;
          if (cap <= (size_t) (~0U >> 2))
            {
              cap <<= 1;
              grown = (char *) realloc(b, cap + 2);
            }
          if (grown == (char *) NULL)
            {
              free(b);
              *blen = 0;
              return (char *) NULL;
            }
          b = grown;
        }
      b[used++] = (char) c;
    }

  if (used == 0 && c == EOF)
    {
      /* nothing left to read */
      free(b);
      *blen = 0;
      return (char *) NULL;
    }

  b[used] = '\0';
  *blen = (int) (used + 1);
  return b;
}
364 | ||
365 | #define BUFFER_SZ 4096 | |
366 | ||
367 | int main(int argc, char *argv[]) | |
368 | { | |
369 | unsigned int | |
370 | length; | |
371 | ||
372 | unsigned char | |
373 | *buffer; | |
374 | ||
375 | int | |
376 | i, | |
377 | mode; /* iptc binary, or iptc text */ | |
378 | ||
379 | FILE | |
380 | *ifile = stdin, | |
381 | *ofile = stdout; | |
382 | ||
383 | char | |
384 | c, | |
385 | *usage = "usage: iptcutil -t | -b [-i file] [-o file] <input >output"; | |
386 | ||
387 | if( argc < 2 ) | |
388 | { | |
80ed523f | 389 | puts(usage); |
8414a40c VZ |
390 | return 1; |
391 | } | |
392 | ||
393 | mode = 0; | |
394 | length = -1; | |
395 | buffer = (unsigned char *)NULL; | |
396 | ||
397 | for (i=1; i<argc; i++) | |
398 | { | |
399 | c = argv[i][0]; | |
400 | if (c == '-' || c == '/') | |
401 | { | |
402 | c = argv[i][1]; | |
403 | switch( c ) | |
404 | { | |
405 | case 't': | |
406 | mode = 1; | |
407 | #ifdef WIN32 | |
408 | /* Set "stdout" to binary mode: */ | |
409 | _setmode( _fileno( ofile ), _O_BINARY ); | |
410 | #endif | |
411 | break; | |
412 | case 'b': | |
413 | mode = 0; | |
414 | #ifdef WIN32 | |
415 | /* Set "stdin" to binary mode: */ | |
416 | _setmode( _fileno( ifile ), _O_BINARY ); | |
417 | #endif | |
418 | break; | |
419 | case 'i': | |
420 | if (mode == 0) | |
421 | ifile = fopen(argv[++i], "rb"); | |
422 | else | |
423 | ifile = fopen(argv[++i], "rt"); | |
424 | if (ifile == (FILE *)NULL) | |
425 | { | |
426 | printf("Unable to open: %s\n", argv[i]); | |
427 | return 1; | |
428 | } | |
429 | break; | |
430 | case 'o': | |
431 | if (mode == 0) | |
432 | ofile = fopen(argv[++i], "wt"); | |
433 | else | |
434 | ofile = fopen(argv[++i], "wb"); | |
435 | if (ofile == (FILE *)NULL) | |
436 | { | |
437 | printf("Unable to open: %s\n", argv[i]); | |
438 | return 1; | |
439 | } | |
440 | break; | |
441 | default: | |
442 | printf("Unknown option: %s\n", argv[i]); | |
443 | return 1; | |
444 | } | |
445 | } | |
446 | else | |
447 | { | |
80ed523f | 448 | puts(usage); |
8414a40c VZ |
449 | return 1; |
450 | } | |
451 | } | |
452 | ||
453 | if (mode == 0) /* handle binary iptc info */ | |
454 | formatIPTC(ifile, ofile); | |
455 | ||
456 | if (mode == 1) /* handle text form of iptc info */ | |
457 | { | |
458 | char | |
459 | brkused, | |
460 | quoted, | |
461 | *line, | |
462 | *token, | |
463 | *newstr; | |
464 | ||
465 | int | |
466 | state, | |
467 | next; | |
468 | ||
469 | unsigned char | |
470 | recnum = 0, | |
471 | dataset = 0; | |
472 | ||
473 | int | |
474 | inputlen = BUFFER_SZ; | |
475 | ||
476 | line = (char *) malloc(inputlen); | |
477 | token = (char *)NULL; | |
478 | while((line = super_fgets(line,&inputlen,ifile))!=NULL) | |
479 | { | |
480 | state=0; | |
481 | next=0; | |
482 | ||
483 | token = (char *) malloc(inputlen); | |
484 | newstr = (char *) malloc(inputlen); | |
485 | while(tokenizer(0, token, inputlen, line, "", "=", "\"", 0, | |
486 | &brkused,&next,"ed)==0) | |
487 | { | |
488 | if (state == 0) | |
489 | { | |
490 | int | |
491 | state, | |
492 | next; | |
493 | ||
494 | char | |
495 | brkused, | |
496 | quoted; | |
497 | ||
498 | state=0; | |
499 | next=0; | |
500 | while(tokenizer(0, newstr, inputlen, token, "", "#", "", 0, | |
501 | &brkused, &next, "ed)==0) | |
502 | { | |
503 | if (state == 0) | |
504 | dataset = (unsigned char) atoi(newstr); | |
505 | else | |
506 | if (state == 1) | |
507 | recnum = (unsigned char) atoi(newstr); | |
508 | state++; | |
509 | } | |
510 | } | |
511 | else | |
512 | if (state == 1) | |
513 | { | |
514 | int | |
515 | next; | |
516 | ||
517 | unsigned long | |
518 | len; | |
519 | ||
520 | char | |
521 | brkused, | |
522 | quoted; | |
523 | ||
524 | next=0; | |
525 | len = strlen(token); | |
526 | while(tokenizer(0, newstr, inputlen, token, "", "&", "", 0, | |
527 | &brkused, &next, "ed)==0) | |
528 | { | |
529 | if (brkused && next > 0) | |
530 | { | |
531 | char | |
532 | *s = &token[next-1]; | |
533 | ||
534 | len -= convertHTMLcodes(s, strlen(s)); | |
535 | } | |
536 | } | |
537 | ||
538 | fputc(0x1c, ofile); | |
539 | fputc(dataset, ofile); | |
540 | fputc(recnum, ofile); | |
541 | if (len < 0x10000) | |
542 | { | |
543 | fputc((len >> 8) & 255, ofile); | |
544 | fputc(len & 255, ofile); | |
545 | } | |
546 | else | |
547 | { | |
548 | fputc(((len >> 24) & 255) | 0x80, ofile); | |
549 | fputc((len >> 16) & 255, ofile); | |
550 | fputc((len >> 8) & 255, ofile); | |
551 | fputc(len & 255, ofile); | |
552 | } | |
553 | next=0; | |
554 | while (len--) | |
555 | fputc(token[next++], ofile); | |
556 | } | |
557 | state++; | |
558 | } | |
559 | free(token); | |
560 | token = (char *)NULL; | |
561 | free(newstr); | |
562 | newstr = (char *)NULL; | |
563 | } | |
564 | free(line); | |
565 | ||
566 | fclose( ifile ); | |
567 | fclose( ofile ); | |
568 | } | |
569 | ||
570 | return 0; | |
571 | } | |
572 | ||
573 | /* | |
574 | This routine is a generalized, finite state token parser. It allows | |
575 | you extract tokens one at a time from a string of characters. The | |
576 | characters used for white space, for break characters, and for quotes | |
577 | can be specified. Also, characters in the string can be preceded by | |
578 | a specifiable escape character which removes any special meaning the | |
579 | character may have. | |
580 | ||
581 | There are a lot of formal parameters in this subroutine call, but | |
582 | once you get familiar with them, this routine is fairly easy to use. | |
583 | "#define" macros can be used to generate simpler looking calls for | |
584 | commonly used applications of this routine. | |
585 | ||
586 | First, some terminology: | |
587 | ||
588 | token: used here, a single unit of information in | |
589 | the form of a group of characters. | |
590 | ||
591 | white space: space that gets ignored (except within quotes | |
592 | or when escaped), like blanks and tabs. in | |
593 | addition, white space terminates a non-quoted | |
594 | token. | |
595 | ||
596 | break character: a character that separates non-quoted tokens. | |
597 | commas are a common break character. the | |
598 | usage of break characters to signal the end | |
599 | of a token is the same as that of white space, | |
600 | except multiple break characters with nothing | |
601 | or only white space between generate a null | |
602 | token for each two break characters together. | |
603 | ||
604 | for example, if blank is set to be the white | |
605 | space and comma is set to be the break | |
606 | character, the line ... | |
607 | ||
608 | A, B, C , , DEF | |
609 | ||
610 | ... consists of 5 tokens: | |
611 | ||
612 | 1) "A" | |
613 | 2) "B" | |
614 | 3) "C" | |
615 | 4) "" (the null string) | |
616 | 5) "DEF" | |
617 | ||
618 | quote character: a character that, when surrounding a group | |
619 | of other characters, causes the group of | |
620 | characters to be treated as a single token, | |
621 | no matter how many white spaces or break | |
622 | characters exist in the group. also, a | |
623 | token always terminates after the closing | |
624 | quote. for example, if ' is the quote | |
625 | character, blank is white space, and comma | |
626 | is the break character, the following | |
627 | string ... | |
628 | ||
629 | A, ' B, CD'EF GHI | |
630 | ||
631 | ... consists of 4 tokens: | |
632 | ||
633 | 1) "A" | |
634 | 2) " B, CD" (note the blanks & comma) | |
635 | 3) "EF" | |
636 | 4) "GHI" | |
637 | ||
638 | the quote characters themselves do | |
639 | not appear in the resultant tokens. the | |
640 | double quotes are delimiters i use here for | |
641 | documentation purposes only. | |
642 | ||
643 | escape character: a character which itself is ignored but | |
644 | which causes the next character to be | |
645 | used as is. ^ and \ are often used as | |
646 | escape characters. an escape in the last | |
647 | position of the string gets treated as a | |
648 | "normal" (i.e., non-quote, non-white, | |
649 | non-break, and non-escape) character. | |
650 | for example, assume white space, break | |
651 | character, and quote are the same as in the | |
652 | above examples, and further, assume that | |
653 | ^ is the escape character. then, in the | |
654 | string ... | |
655 | ||
656 | ABC, ' DEF ^' GH' I ^ J K^ L ^ | |
657 | ||
658 | ... there are 7 tokens: | |
659 | ||
660 | 1) "ABC" | |
661 | 2) " DEF ' GH" | |
662 | 3) "I" | |
663 | 4) " " (a lone blank) | |
664 | 5) "J" | |
665 | 6) "K L" | |
666 | 7) "^" (passed as is at end of line) | |
667 | ||
668 | ||
669 | OK, now that you have this background, here's how to call "tokenizer": | |
670 | ||
671 | result=tokenizer(flag,token,maxtok,string,white,break,quote,escape, | |
672 | brkused,next,quoted) | |
673 | ||
674 | result: 0 if we haven't reached EOS (end of string), and | |
675 | 1 if we have (this is an "int"). | |
676 | ||
677 | flag: right now, only the low order 2 bits are used. | |
678 | 1 => convert non-quoted tokens to upper case | |
679 | 2 => convert non-quoted tokens to lower case | |
680 | 0 => do not convert non-quoted tokens | |
681 | (this is an "unsigned"). | |
682 | ||
683 | token: a character string containing the returned next token | |
684 | (this is a "char[]"). | |
685 | ||
686 | maxtok: the maximum size of "token". characters beyond | |
687 | "maxtok" are truncated (this is an "int"). | |
688 | ||
689 | string: the string to be parsed (this is a "char[]"). | |
690 | ||
691 | white: a string of the valid white spaces. example: | |
692 | ||
693 | char whitesp[]={" \t"}; | |
694 | ||
695 | blank and tab will be valid white space (this is | |
696 | a "char[]"). | |
697 | ||
698 | break: a string of the valid break characters. example: | |
699 | ||
700 | char breakch[]={";,"}; | |
701 | ||
702 | semicolon and comma will be valid break characters | |
703 | (this is a "char[]"). | |
704 | ||
705 | IMPORTANT: do not use the name "break" as a C | |
706 | variable, as this is a reserved word in C. | |
707 | ||
708 | quote: a string of the valid quote characters. an example | |
709 | would be | |
710 | ||
711 | char quotech[]={"'\""}; | |
712 | ||
713 | (this causes single and double quotes to be valid) | |
714 | note that a token starting with one of these characters | |
715 | needs the same quote character to terminate it. | |
716 | ||
717 | for example, | |
718 | ||
719 | "ABC ' | |
720 | ||
721 | is unterminated, but | |
722 | ||
723 | "DEF" and 'GHI' | |
724 | ||
725 | are properly terminated. note that different quote | |
726 | characters can appear on the same line; only for | |
727 | a given token do the quote characters have to be | |
728 | the same (this is a "char[]"). | |
729 | ||
730 | escape: the escape character (NOT a string ... only one | |
731 | allowed). use zero if none is desired (this is | |
732 | a "char"). | |
733 | ||
734 | brkused: the break character used to terminate the current | |
735 | token. if the token was quoted, this will be the | |
736 | quote used. if the token is the last one on the | |
737 | line, this will be zero (this is a pointer to a | |
738 | "char"). | |
739 | ||
740 | next: this variable points to the first character of the | |
741 | next token. it gets reset by "tokenizer" as it steps | |
742 | through the string. set it to 0 upon initialization, | |
743 | and leave it alone after that. you can change it | |
744 | if you want to jump around in the string or re-parse | |
745 | from the beginning, but be careful (this is a | |
746 | pointer to an "int"). | |
747 | ||
748 | quoted: set to 1 (true) if the token was quoted and 0 (false) | |
749 | if not. you may need this information (for example: | |
750 | in C, a string with quotes around it is a character | |
751 | string, while one without is an identifier). | |
752 | ||
753 | (this is a pointer to a "char"). | |
754 | */ | |
755 | ||
/* parser states held in _p_state */

#define IN_WHITE 0  /* state tokenizer starts in, before any token text */
#define IN_TOKEN 1  /* collecting an unquoted token */
#define IN_QUOTE 2  /* collecting text between quote characters */
#define IN_OZONE 3  /* token text is complete, break character not yet seen */

/* parser bookkeeping shared by tokenizer and chstore */

int      _p_state;     /* current state */
unsigned _p_flag;      /* option flag */
char     _p_curquote;  /* current quote char */
int      _p_tokpos;    /* current token pos */
767 | ||
/*
 * Find a character in a string ... used only by "tokenizer".
 * Returns the zero-based position of the first occurrence of ch in
 * string, or -1 when ch does not occur before the terminating NUL.
 */
int sindex(char ch,char *string)
{
  int pos = 0;

  while (string[pos] != '\0')
    {
      if (string[pos] == ch)
        return pos;             /* position of the character */
      ++pos;
    }
  return -1;                    /* end of string ... no match found */
}
778 | ||
779 | /* routine to store a character in a string ... used only by "tokenizer" */ | |
780 | ||
781 | void chstore(char *string,int max,char ch) | |
782 | { | |
783 | char c; | |
784 | if(_p_tokpos>=0&&_p_tokpos<max-1) | |
785 | { | |
786 | if(_p_state==IN_QUOTE) | |
787 | c=ch; | |
788 | else | |
789 | switch(_p_flag&3) | |
790 | { | |
791 | case 1: /* convert to upper */ | |
792 | c=toupper(ch); | |
793 | break; | |
794 | ||
795 | case 2: /* convert to lower */ | |
796 | c=tolower(ch); | |
797 | break; | |
798 | ||
799 | default: /* use as is */ | |
800 | c=ch; | |
801 | break; | |
802 | } | |
803 | string[_p_tokpos++]=c; | |
804 | } | |
805 | return; | |
806 | } | |
807 | ||
808 | int tokenizer(unsigned inflag,char *token,int tokmax,char *line, | |
809 | char *white,char *brkchar,char *quote,char eschar,char *brkused, | |
810 | int *next,char *quoted) | |
811 | { | |
812 | int qp; | |
813 | char c,nc; | |
814 | ||
815 | *brkused=0; /* initialize to null */ | |
816 | *quoted=0; /* assume not quoted */ | |
817 | ||
818 | if(!line[*next]) /* if we're at end of line, indicate such */ | |
819 | return 1; | |
820 | ||
821 | _p_state=IN_WHITE; /* initialize state */ | |
822 | _p_curquote=0; /* initialize previous quote char */ | |
823 | _p_flag=inflag; /* set option flag */ | |
824 | ||
825 | for(_p_tokpos=0;(c=line[*next]);++(*next)) /* main loop */ | |
826 | { | |
827 | if((qp=sindex(c,brkchar))>=0) /* break */ | |
828 | { | |
829 | switch(_p_state) | |
830 | { | |
831 | case IN_WHITE: /* these are the same here ... */ | |
832 | case IN_TOKEN: /* ... just get out */ | |
833 | case IN_OZONE: /* ditto */ | |
834 | ++(*next); | |
835 | *brkused=brkchar[qp]; | |
836 | goto byebye; | |
837 | ||
838 | case IN_QUOTE: /* just keep going */ | |
839 | chstore(token,tokmax,c); | |
840 | break; | |
841 | } | |
842 | } | |
843 | else if((qp=sindex(c,quote))>=0) /* quote */ | |
844 | { | |
845 | switch(_p_state) | |
846 | { | |
847 | case IN_WHITE: /* these are identical, */ | |
848 | _p_state=IN_QUOTE; /* change states */ | |
849 | _p_curquote=quote[qp]; /* save quote char */ | |
850 | *quoted=1; /* set to true as long as something is in quotes */ | |
851 | break; | |
852 | ||
853 | case IN_QUOTE: | |
854 | if(quote[qp]==_p_curquote) /* same as the beginning quote? */ | |
855 | { | |
856 | _p_state=IN_OZONE; | |
857 | _p_curquote=0; | |
858 | } | |
859 | else | |
860 | chstore(token,tokmax,c); /* treat as regular char */ | |
861 | break; | |
862 | ||
863 | case IN_TOKEN: | |
864 | case IN_OZONE: | |
865 | *brkused=c; /* uses quote as break char */ | |
866 | goto byebye; | |
867 | } | |
868 | } | |
869 | else if((qp=sindex(c,white))>=0) /* white */ | |
870 | { | |
871 | switch(_p_state) | |
872 | { | |
873 | case IN_WHITE: | |
874 | case IN_OZONE: | |
875 | break; /* keep going */ | |
876 | ||
877 | case IN_TOKEN: | |
878 | _p_state=IN_OZONE; | |
879 | break; | |
880 | ||
881 | case IN_QUOTE: | |
882 | chstore(token,tokmax,c); /* it's valid here */ | |
883 | break; | |
884 | } | |
885 | } | |
886 | else if(c==eschar) /* escape */ | |
887 | { | |
888 | nc=line[(*next)+1]; | |
889 | if(nc==0) /* end of line */ | |
890 | { | |
891 | *brkused=0; | |
892 | chstore(token,tokmax,c); | |
893 | ++(*next); | |
894 | goto byebye; | |
895 | } | |
896 | switch(_p_state) | |
897 | { | |
898 | case IN_WHITE: | |
899 | --(*next); | |
900 | _p_state=IN_TOKEN; | |
901 | break; | |
902 | ||
903 | case IN_TOKEN: | |
904 | case IN_QUOTE: | |
905 | ++(*next); | |
906 | chstore(token,tokmax,nc); | |
907 | break; | |
908 | ||
909 | case IN_OZONE: | |
910 | goto byebye; | |
911 | } | |
912 | } | |
913 | else /* anything else is just a real character */ | |
914 | { | |
915 | switch(_p_state) | |
916 | { | |
917 | case IN_WHITE: | |
918 | _p_state=IN_TOKEN; /* switch states */ | |
919 | ||
920 | case IN_TOKEN: /* these 2 are */ | |
921 | case IN_QUOTE: /* identical here */ | |
922 | chstore(token,tokmax,c); | |
923 | break; | |
924 | ||
925 | case IN_OZONE: | |
926 | goto byebye; | |
927 | } | |
928 | } | |
929 | } /* end of main loop */ | |
930 | ||
931 | byebye: | |
932 | token[_p_tokpos]=0; /* make sure token ends with EOS */ | |
933 | ||
934 | return 0; | |
935 | } | |
80ed523f VZ |
936 | /* |
937 | * Local Variables: | |
938 | * mode: c | |
939 | * c-basic-offset: 8 | |
940 | * fill-column: 78 | |
941 | * End: | |
942 | */ |