]>
Commit | Line | Data |
---|---|---|
/* $Id$ */

#include "tif_config.h"

#include <ctype.h>
#include <limits.h>
#include <memory.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef HAVE_STRINGS_H
# include <strings.h>
#endif

#ifdef HAVE_IO_H
# include <io.h>
#endif

#ifdef HAVE_FCNTL_H
# include <fcntl.h>
#endif

#ifdef WIN32
#define STRNICMP strnicmp
#else
#define STRNICMP strncasecmp
#endif
28 | ||
/* Pairs an IPTC record number with the label printed next to it. */
typedef struct _tag_spec
{
  short id;    /* number compared against the record byte read from the stream */
  char *name;  /* label written in front of the record's data */
} tag_spec;
37 | ||
38 | static tag_spec tags[] = { | |
39 | { 5,"Image Name" }, | |
40 | { 7,"Edit Status" }, | |
41 | { 10,"Priority" }, | |
42 | { 15,"Category" }, | |
43 | { 20,"Supplemental Category" }, | |
44 | { 22,"Fixture Identifier" }, | |
45 | { 25,"Keyword" }, | |
46 | { 30,"Release Date" }, | |
47 | { 35,"Release Time" }, | |
48 | { 40,"Special Instructions" }, | |
49 | { 45,"Reference Service" }, | |
50 | { 47,"Reference Date" }, | |
51 | { 50,"Reference Number" }, | |
52 | { 55,"Created Date" }, | |
53 | { 60,"Created Time" }, | |
54 | { 65,"Originating Program" }, | |
55 | { 70,"Program Version" }, | |
56 | { 75,"Object Cycle" }, | |
57 | { 80,"Byline" }, | |
58 | { 85,"Byline Title" }, | |
59 | { 90,"City" }, | |
60 | { 95,"Province State" }, | |
61 | { 100,"Country Code" }, | |
62 | { 101,"Country" }, | |
63 | { 103,"Original Transmission Reference" }, | |
64 | { 105,"Headline" }, | |
65 | { 110,"Credit" }, | |
66 | { 115,"Source" }, | |
67 | { 116,"Copyright String" }, | |
68 | { 120,"Caption" }, | |
69 | { 121,"Local Caption" }, | |
70 | { 122,"Caption Writer" }, | |
71 | { 200,"Custom Field 1" }, | |
72 | { 201,"Custom Field 2" }, | |
73 | { 202,"Custom Field 3" }, | |
74 | { 203,"Custom Field 4" }, | |
75 | { 204,"Custom Field 5" }, | |
76 | { 205,"Custom Field 6" }, | |
77 | { 206,"Custom Field 7" }, | |
78 | { 207,"Custom Field 8" }, | |
79 | { 208,"Custom Field 9" }, | |
80 | { 209,"Custom Field 10" }, | |
81 | { 210,"Custom Field 11" }, | |
82 | { 211,"Custom Field 12" }, | |
83 | { 212,"Custom Field 13" }, | |
84 | { 213,"Custom Field 14" }, | |
85 | { 214,"Custom Field 15" }, | |
86 | { 215,"Custom Field 16" }, | |
87 | { 216,"Custom Field 17" }, | |
88 | { 217,"Custom Field 18" }, | |
89 | { 218,"Custom Field 19" }, | |
90 | { 219,"Custom Field 20" } | |
91 | }; | |
92 | ||
/*
 * Write the first `len` bytes of `s` to `ofile` as one double-quoted line.
 *
 * HTML conventions are used so that control characters and the quoting
 * characters themselves survive:
 *
 *   '&'  is written as  &amp;
 *   '"'  is written as  &quot;
 *   '<'  and '>' are written as &lt; and &gt; (only with HANDLE_GT_LT defined)
 *   a control character is written as &#N; with N its decimal byte value
 *
 * Every other byte is copied unchanged.  The line is closed with a double
 * quote and a newline.
 */
void formatString(FILE *ofile, const char *s, int len)
{
  putc('"', ofile);
  for (; len > 0; --len, ++s) {
    /*
     * Go through unsigned char: plain char may be signed, and passing a
     * negative value to iscntrl() is undefined behaviour.
     */
    int c = (unsigned char) *s;

    switch (c) {
    case '&':
      fputs("&amp;", ofile);
      break;
#ifdef HANDLE_GT_LT
    case '<':
      fputs("&lt;", ofile);
      break;
    case '>':
      fputs("&gt;", ofile);
      break;
#endif
    case '"':
      fputs("&quot;", ofile);
      break;
    default:
      if (iscntrl(c))
        fprintf(ofile, "&#%d;", c);
      else
        putc(c, ofile);
      break;
    }
  }
  fputs("\"\n", ofile);
}
127 | ||
/* One named HTML escape sequence and the character it stands for. */
typedef struct _html_code
{
  short len;         /* how many characters of code convertHTMLcodes() compares */
  const char *code;  /* text of the escape sequence */
  const char val;    /* character stored in place of the sequence */
} html_code;
136 | ||
137 | static html_code html_codes[] = { | |
138 | #ifdef HANDLE_GT_LT | |
139 | { 4,"<",'<' }, | |
140 | { 4,">",'>' }, | |
141 | #endif | |
142 | { 5,"&",'&' }, | |
143 | { 6,""",'"' } | |
144 | }; | |
145 | ||
146 | /* | |
147 | * This routine converts HTML escape sequence | |
148 | * back to the original ASCII representation. | |
149 | * - returns the number of characters dropped. | |
150 | */ | |
151 | int convertHTMLcodes(char *s, int len) | |
152 | { | |
153 | if (len <=0 || s==(char*)NULL || *s=='\0') | |
154 | return 0; | |
155 | ||
156 | if (s[1] == '#') | |
157 | { | |
158 | int val, o; | |
159 | ||
160 | if (sscanf(s,"&#%d;",&val) == 1) | |
161 | { | |
162 | o = 3; | |
163 | while (s[o] != ';') | |
164 | { | |
165 | o++; | |
166 | if (o > 5) | |
167 | break; | |
168 | } | |
169 | if (o < 5) | |
170 | strcpy(s+1, s+1+o); | |
171 | *s = val; | |
172 | return o; | |
173 | } | |
174 | } | |
175 | else | |
176 | { | |
177 | int | |
178 | i, | |
179 | codes = sizeof(html_codes) / sizeof(html_code); | |
180 | ||
181 | for (i=0; i < codes; i++) | |
182 | { | |
183 | if (html_codes[i].len <= len) | |
184 | if (STRNICMP(s, html_codes[i].code, html_codes[i].len) == 0) | |
185 | { | |
186 | strcpy(s+1, s+html_codes[i].len); | |
187 | *s = html_codes[i].val; | |
188 | return html_codes[i].len-1; | |
189 | } | |
190 | } | |
191 | } | |
192 | ||
193 | return 0; | |
194 | } | |
195 | ||
196 | int formatIPTC(FILE *ifile, FILE *ofile) | |
197 | { | |
198 | unsigned int | |
199 | foundiptc, | |
200 | tagsfound; | |
201 | ||
202 | unsigned char | |
203 | recnum, | |
204 | dataset; | |
205 | ||
206 | char | |
207 | *readable, | |
208 | *str; | |
209 | ||
210 | long | |
211 | tagindx, | |
212 | taglen; | |
213 | ||
214 | int | |
215 | i, | |
216 | tagcount = sizeof(tags) / sizeof(tag_spec); | |
217 | ||
218 | char | |
219 | c; | |
220 | ||
221 | foundiptc = 0; /* found the IPTC-Header */ | |
222 | tagsfound = 0; /* number of tags found */ | |
223 | ||
224 | c = getc(ifile); | |
225 | while (c != EOF) | |
226 | { | |
227 | if (c == 0x1c) | |
228 | foundiptc = 1; | |
229 | else | |
230 | { | |
231 | if (foundiptc) | |
232 | return -1; | |
233 | else | |
234 | continue; | |
235 | } | |
236 | ||
237 | /* we found the 0x1c tag and now grab the dataset and record number tags */ | |
238 | dataset = getc(ifile); | |
239 | if ((char) dataset == EOF) | |
240 | return -1; | |
241 | recnum = getc(ifile); | |
242 | if ((char) recnum == EOF) | |
243 | return -1; | |
244 | /* try to match this record to one of the ones in our named table */ | |
245 | for (i=0; i< tagcount; i++) | |
246 | { | |
247 | if (tags[i].id == recnum) | |
248 | break; | |
249 | } | |
250 | if (i < tagcount) | |
251 | readable = tags[i].name; | |
252 | else | |
253 | readable = ""; | |
254 | ||
255 | /* then we decode the length of the block that follows - long or short fmt */ | |
256 | c = getc(ifile); | |
257 | if (c == EOF) | |
258 | return 0; | |
259 | if (c & (unsigned char) 0x80) | |
260 | { | |
261 | unsigned char | |
262 | buffer[4]; | |
263 | ||
264 | for (i=0; i<4; i++) | |
265 | { | |
266 | c = buffer[i] = getc(ifile); | |
267 | if (c == EOF) | |
268 | return -1; | |
269 | } | |
270 | taglen = (((long) buffer[ 0 ]) << 24) | | |
271 | (((long) buffer[ 1 ]) << 16) | | |
272 | (((long) buffer[ 2 ]) << 8) | | |
273 | (((long) buffer[ 3 ])); | |
274 | } | |
275 | else | |
276 | { | |
277 | unsigned char | |
278 | x = c; | |
279 | ||
280 | taglen = ((long) x) << 8; | |
281 | x = getc(ifile); | |
282 | if ((char)x == EOF) | |
283 | return -1; | |
284 | taglen |= (long) x; | |
285 | } | |
286 | /* make a buffer to hold the tag data and snag it from the input stream */ | |
287 | str = (char *) malloc((unsigned int) (taglen+1)); | |
288 | if (str == (char *) NULL) | |
289 | { | |
290 | printf("Memory allocation failed"); | |
291 | return 0; | |
292 | } | |
293 | for (tagindx=0; tagindx<taglen; tagindx++) | |
294 | { | |
295 | c = str[tagindx] = getc(ifile); | |
296 | if (c == EOF) | |
297 | return -1; | |
298 | } | |
299 | str[ taglen ] = 0; | |
300 | ||
301 | /* now finish up by formatting this binary data into ASCII equivalent */ | |
302 | if (strlen(readable) > 0) | |
303 | fprintf(ofile, "%d#%d#%s=",(unsigned int)dataset, (unsigned int) recnum, readable); | |
304 | else | |
305 | fprintf(ofile, "%d#%d=",(unsigned int)dataset, (unsigned int) recnum); | |
306 | formatString( ofile, str, taglen ); | |
307 | free(str); | |
308 | ||
309 | tagsfound++; | |
310 | ||
311 | c = getc(ifile); | |
312 | } | |
313 | return tagsfound; | |
314 | } | |
315 | ||
/* Forward declaration: tokenizer() is defined after main(), which calls it. */
int tokenizer(unsigned inflag, char *token, int tokmax, char *line,
              char *white, char *brkchar, char *quote, char eschar,
              char *brkused, int *next, char *quoted);
319 | ||
/*
 * Read one line of any length from `file`.
 *
 *   b     heap buffer to read into.  It is grown with realloc() when the
 *         line does not fit, so the caller must carry on with the returned
 *         pointer:  line = super_fgets(line, &len, file);
 *   blen  in: usable size of b in bytes.
 *         out: length of the line plus one (the terminator), or 0 when
 *         NULL is returned.
 *
 * Returns the buffer holding the NUL-terminated line without its '\n'; an
 * empty line yields an empty string.  Returns NULL when the end of the
 * input is reached with nothing read, or when memory runs out.  On a NULL
 * return the buffer has been freed, so the usual calling pattern shown
 * above does not leak it.
 */
char *super_fgets(char *b, int *blen, FILE *file)
{
  int
    c,
    cap = *blen,  /* bytes of b that may be used */
    used = 0;     /* characters stored so far */

  char
    *grown;

  if (b == (char *) NULL || cap < 1)
    {
      /* nothing usable was handed in: start with a minimal buffer */
      grown = (char *) realloc(b, 2);
      if (grown == (char *) NULL)
        goto fail;
      b = grown;
      cap = 2;
    }

  while ((c = fgetc(file)) != EOF && c != '\n')
    {
      if (used + 1 >= cap)  /* always keep room for the terminator */
        {
          if (cap > (INT_MAX - 2) / 2)
            goto fail;
          /* keep b until realloc() has succeeded so it can still be freed */
          grown = (char *) realloc(b, (size_t) cap * 2 + 2);
          if (grown == (char *) NULL)
            goto fail;
          b = grown;
          cap *= 2;
        }
      b[used++] = (char) c;
    }

  /* only the end of the input ends the sequence of lines, not a blank line */
  if (c == EOF && used == 0)
    goto fail;

  b[used] = '\0';
  *blen = used + 1;
  return b;

fail:
  free(b);
  *blen = 0;
  return (char *) NULL;
}
363 | ||
#define BUFFER_SZ 4096

/*
 * Entry point: convert IPTC data between its binary form and a text form.
 *
 *   -b       read binary IPTC and write text (the default)
 *   -t       read text and write binary IPTC
 *   -i file  read from file instead of stdin
 *   -o file  write to file instead of stdout
 *
 * -i and -o pick the fopen() mode from the conversion mode selected so far,
 * so -t or -b has to come before them on the command line.
 *
 * In text mode every input line has the form  dataset#recnum#name="value";
 * the part before '=' supplies the two numbers and the quoted part, with
 * its HTML escapes undone, becomes the record data.
 *
 * Returns 0 after a conversion was attempted, and 1 for a usage error, a
 * file that cannot be opened or an allocation failure.
 */
int main(int argc, char *argv[])
{
  int
    i,
    mode = 0; /* iptc binary, or iptc text */

  FILE
    *ifile = stdin,
    *ofile = stdout;

  const char
    *usage = "usage: iptcutil -t | -b [-i file] [-o file] <input >output";

  if (argc < 2)
    {
      fputs(usage, stdout);  /* never hand a non-literal to printf() as format */
      return 1;
    }

  for (i=1; i < argc; i++)
    {
      char
        c = argv[i][0];

      if (c != '-' && c != '/')
        {
          fputs(usage, stdout);
          return 1;
        }

      switch (argv[i][1])
        {
        case 't':
          mode = 1;
#ifdef WIN32
          /* Set "stdout" to binary mode: */
          _setmode( _fileno( ofile ), _O_BINARY );
#endif
          break;
        case 'b':
          mode = 0;
#ifdef WIN32
          /* Set "stdin" to binary mode: */
          _setmode( _fileno( ifile ), _O_BINARY );
#endif
          break;
        case 'i':
          if (i + 1 >= argc)  /* the file name is missing */
            {
              fputs(usage, stdout);
              return 1;
            }
          ifile = fopen(argv[++i], mode == 0 ? "rb" : "rt");
          if (ifile == (FILE *) NULL)
            {
              printf("Unable to open: %s\n", argv[i]);
              return 1;
            }
          break;
        case 'o':
          if (i + 1 >= argc)  /* the file name is missing */
            {
              fputs(usage, stdout);
              return 1;
            }
          ofile = fopen(argv[++i], mode == 0 ? "wt" : "wb");
          if (ofile == (FILE *) NULL)
            {
              printf("Unable to open: %s\n", argv[i]);
              return 1;
            }
          break;
        default:
          printf("Unknown option: %s\n", argv[i]);
          return 1;
        }
    }

  if (mode == 0) /* handle binary iptc info */
    formatIPTC(ifile, ofile);

  if (mode == 1) /* handle text form of iptc info */
    {
      unsigned char
        recnum = 0,
        dataset = 0;

      int
        inputlen = BUFFER_SZ;

      char
        *line = (char *) malloc(inputlen);

      if (line == (char *) NULL)
        {
          printf("Memory allocation failed\n");
          return 1;
        }

      while ((line = super_fgets(line, &inputlen, ifile)) != NULL)
        {
          char
            brkused,
            quoted,
            *token = (char *) malloc(inputlen),
            *newstr = (char *) malloc(inputlen);

          int
            state = 0,  /* 0: text before '=', 1: the value after it */
            next = 0;

          if (token == (char *) NULL || newstr == (char *) NULL)
            {
              free(token);
              free(newstr);
              free(line);
              printf("Memory allocation failed\n");
              return 1;
            }

          while (tokenizer(0, token, inputlen, line, "", "=", "\"", 0,
                           &brkused, &next, &quoted) == 0)
            {
              char
                subbrk,
                subquoted;

              int
                subnext = 0;

              if (state == 0)
                {
                  /* split "dataset#recnum#name" and keep the two numbers */
                  int
                    field = 0;

                  while (tokenizer(0, newstr, inputlen, token, "", "#", "", 0,
                                   &subbrk, &subnext, &subquoted) == 0)
                    {
                      if (field == 0)
                        dataset = (unsigned char) atoi(newstr);
                      else if (field == 1)
                        recnum = (unsigned char) atoi(newstr);
                      field++;
                    }
                }
              else if (state == 1)
                {
                  unsigned long
                    len = strlen(token);

                  /* undo the HTML escapes in place; len follows the shrinking text */
                  while (tokenizer(0, newstr, inputlen, token, "", "&", "", 0,
                                   &subbrk, &subnext, &subquoted) == 0)
                    {
                      if (subbrk && subnext > 0)
                        {
                          char
                            *s = &token[subnext-1];

                          int
                            dropped = convertHTMLcodes(s, (int) strlen(s));

                          /* never let len wrap around below zero */
                          if (dropped > 0 && (unsigned long) dropped <= len)
                            len -= (unsigned long) dropped;
                        }
                    }

                  fputc(0x1c, ofile);
                  fputc(dataset, ofile);
                  fputc(recnum, ofile);
                  /*
                   * NOTE(review): formatIPTC() in this file treats a first
                   * length byte with bit 0x80 set as the long form and then
                   * reads four more bytes, which matches neither branch
                   * below for lengths of 0x8000 and up.  The layout is left
                   * as it was; confirm it against the IPTC IIM
                   * specification.
                   */
                  if (len < 0x10000)
                    {
                      fputc((len >> 8) & 255, ofile);
                      fputc(len & 255, ofile);
                    }
                  else
                    {
                      fputc(((len >> 24) & 255) | 0x80, ofile);
                      fputc((len >> 16) & 255, ofile);
                      fputc((len >> 8) & 255, ofile);
                      fputc(len & 255, ofile);
                    }
                  fwrite(token, 1, (size_t) len, ofile);
                }
              state++;
            }
          free(token);
          free(newstr);
        }
      free(line);
    }

  fclose( ifile );
  fclose( ofile );

  return 0;
}
571 | ||
572 | /* | |
573 | This routine is a generalized, finite state token parser. It allows | |
574 | you to extract tokens one at a time from a string of characters. The | |
575 | characters used for white space, for break characters, and for quotes | |
576 | can be specified. Also, characters in the string can be preceded by | |
577 | a specifiable escape character which removes any special meaning the | |
578 | character may have. | |
579 | ||
580 | There are a lot of formal parameters in this subroutine call, but | |
581 | once you get familiar with them, this routine is fairly easy to use. | |
582 | "#define" macros can be used to generate simpler looking calls for | |
583 | commonly used applications of this routine. | |
584 | ||
585 | First, some terminology: | |
586 | ||
587 | token: used here, a single unit of information in | |
588 | the form of a group of characters. | |
589 | ||
590 | white space: space that gets ignored (except within quotes | |
591 | or when escaped), like blanks and tabs. in | |
592 | addition, white space terminates a non-quoted | |
593 | token. | |
594 | ||
595 | break character: a character that separates non-quoted tokens. | |
596 | commas are a common break character. the | |
597 | usage of break characters to signal the end | |
598 | of a token is the same as that of white space, | |
599 | except multiple break characters with nothing | |
600 | or only white space between generate a null | |
601 | token for each two break characters together. | |
602 | ||
603 | for example, if blank is set to be the white | |
604 | space and comma is set to be the break | |
605 | character, the line ... | |
606 | ||
607 | A, B, C , , DEF | |
608 | ||
609 | ... consists of 5 tokens: | |
610 | ||
611 | 1) "A" | |
612 | 2) "B" | |
613 | 3) "C" | |
614 | 4) "" (the null string) | |
615 | 5) "DEF" | |
616 | ||
617 | quote character: a character that, when surrounding a group | |
618 | of other characters, causes the group of | |
619 | characters to be treated as a single token, | |
620 | no matter how many white spaces or break | |
621 | characters exist in the group. also, a | |
622 | token always terminates after the closing | |
623 | quote. for example, if ' is the quote | |
624 | character, blank is white space, and comma | |
625 | is the break character, the following | |
626 | string ... | |
627 | ||
628 | A, ' B, CD'EF GHI | |
629 | ||
630 | ... consists of 4 tokens: | |
631 | ||
632 | 1) "A" | |
633 | 2) " B, CD" (note the blanks & comma) | |
634 | 3) "EF" | |
635 | 4) "GHI" | |
636 | ||
637 | the quote characters themselves do | |
638 | not appear in the resultant tokens. the | |
639 | double quotes are delimiters i use here for | |
640 | documentation purposes only. | |
641 | ||
642 | escape character: a character which itself is ignored but | |
643 | which causes the next character to be | |
644 | used as is. ^ and \ are often used as | |
645 | escape characters. an escape in the last | |
646 | position of the string gets treated as a | |
647 | "normal" (i.e., non-quote, non-white, | |
648 | non-break, and non-escape) character. | |
649 | for example, assume white space, break | |
650 | character, and quote are the same as in the | |
651 | above examples, and further, assume that | |
652 | ^ is the escape character. then, in the | |
653 | string ... | |
654 | ||
655 | ABC, ' DEF ^' GH' I ^ J K^ L ^ | |
656 | ||
657 | ... there are 7 tokens: | |
658 | ||
659 | 1) "ABC" | |
660 | 2) " DEF ' GH" | |
661 | 3) "I" | |
662 | 4) " " (a lone blank) | |
663 | 5) "J" | |
664 | 6) "K L" | |
665 | 7) "^" (passed as is at end of line) | |
666 | ||
667 | ||
668 | OK, now that you have this background, here's how to call "tokenizer": | |
669 | ||
670 | result=tokenizer(flag,token,maxtok,string,white,break,quote,escape, | |
671 | brkused,next,quoted) | |
672 | ||
673 | result: 0 if we haven't reached EOS (end of string), and | |
674 | 1 if we have (this is an "int"). | |
675 | ||
676 | flag: right now, only the low order 2 bits are used. | |
677 | 1 => convert non-quoted tokens to upper case | |
678 | 2 => convert non-quoted tokens to lower case | |
679 | 0 => do not convert non-quoted tokens | |
680 | (this is an "unsigned"). | |
681 | ||
682 | token: a character string containing the returned next token | |
683 | (this is a "char[]"). | |
684 | ||
685 | maxtok: the maximum size of "token". characters beyond | |
686 | "maxtok" are truncated (this is an "int"). | |
687 | ||
688 | string: the string to be parsed (this is a "char[]"). | |
689 | ||
690 | white: a string of the valid white spaces. example: | |
691 | ||
692 | char whitesp[]={" \t"}; | |
693 | ||
694 | blank and tab will be valid white space (this is | |
695 | a "char[]"). | |
696 | ||
697 | break: a string of the valid break characters. example: | |
698 | ||
699 | char breakch[]={";,"}; | |
700 | ||
701 | semicolon and comma will be valid break characters | |
702 | (this is a "char[]"). | |
703 | ||
704 | IMPORTANT: do not use the name "break" as a C | |
705 | variable, as this is a reserved word in C. | |
706 | ||
707 | quote: a string of the valid quote characters. an example | |
708 | would be | |
709 | ||
710 | char quotech[]={"'\""}; | |
711 | ||
712 | (this causes single and double quotes to be valid) | |
713 | note that a token starting with one of these characters | |
714 | needs the same quote character to terminate it. | |
715 | ||
716 | for example, | |
717 | ||
718 | "ABC ' | |
719 | ||
720 | is unterminated, but | |
721 | ||
722 | "DEF" and 'GHI' | |
723 | ||
724 | are properly terminated. note that different quote | |
725 | characters can appear on the same line; only for | |
726 | a given token do the quote characters have to be | |
727 | the same (this is a "char[]"). | |
728 | ||
729 | escape: the escape character (NOT a string ... only one | |
730 | allowed). use zero if none is desired (this is | |
731 | a "char"). | |
732 | ||
733 | brkused: the break character used to terminate the current | |
734 | token. if the token was quoted, this will be the | |
735 | quote used. if the token is the last one on the | |
736 | line, this will be zero (this is a pointer to a | |
737 | "char"). | |
738 | ||
739 | next: this variable points to the first character of the | |
740 | next token. it gets reset by "tokenizer" as it steps | |
741 | through the string. set it to 0 upon initialization, | |
742 | and leave it alone after that. you can change it | |
743 | if you want to jump around in the string or re-parse | |
744 | from the beginning, but be careful (this is a | |
745 | pointer to an "int"). | |
746 | ||
747 | quoted: set to 1 (true) if the token was quoted and 0 (false) | |
748 | if not. you may need this information (for example: | |
749 | in C, a string with quotes around it is a character | |
750 | string, while one without is an identifier). | |
751 | ||
752 | (this is a pointer to a "char"). | |
753 | */ | |
754 | ||
/* states of the tokenizer() state machine */

#define IN_WHITE 0  /* start state: nothing stored for the token yet */
#define IN_TOKEN 1  /* inside an unquoted token */
#define IN_QUOTE 2  /* between an opening quote and its match */
#define IN_OZONE 3  /* token finished, skipping up to the next break */

/* working variables shared by tokenizer() and chstore() */

int      _p_state;     /* current state */
unsigned _p_flag;      /* option flag */
char     _p_curquote;  /* current quote char */
int      _p_tokpos;    /* current token pos */
766 | ||
/*
 * Helper of "tokenizer": returns the zero-based position of the first
 * occurrence of ch in string, or -1 when string does not contain it.  The
 * terminating NUL is never matched.
 */

int sindex(char ch,char *string)
{
  int pos = 0;

  while (string[pos] != '\0')
    {
      if (string[pos] == ch)
        return pos;  /* found it */
      pos++;
    }
  return -1;         /* reached the end without a match */
}
777 | ||
778 | /* routine to store a character in a string ... used only by "tokenizer" */ | |
779 | ||
780 | void chstore(char *string,int max,char ch) | |
781 | { | |
782 | char c; | |
783 | if(_p_tokpos>=0&&_p_tokpos<max-1) | |
784 | { | |
785 | if(_p_state==IN_QUOTE) | |
786 | c=ch; | |
787 | else | |
788 | switch(_p_flag&3) | |
789 | { | |
790 | case 1: /* convert to upper */ | |
791 | c=toupper(ch); | |
792 | break; | |
793 | ||
794 | case 2: /* convert to lower */ | |
795 | c=tolower(ch); | |
796 | break; | |
797 | ||
798 | default: /* use as is */ | |
799 | c=ch; | |
800 | break; | |
801 | } | |
802 | string[_p_tokpos++]=c; | |
803 | } | |
804 | return; | |
805 | } | |
806 | ||
807 | int tokenizer(unsigned inflag,char *token,int tokmax,char *line, | |
808 | char *white,char *brkchar,char *quote,char eschar,char *brkused, | |
809 | int *next,char *quoted) | |
810 | { | |
811 | int qp; | |
812 | char c,nc; | |
813 | ||
814 | *brkused=0; /* initialize to null */ | |
815 | *quoted=0; /* assume not quoted */ | |
816 | ||
817 | if(!line[*next]) /* if we're at end of line, indicate such */ | |
818 | return 1; | |
819 | ||
820 | _p_state=IN_WHITE; /* initialize state */ | |
821 | _p_curquote=0; /* initialize previous quote char */ | |
822 | _p_flag=inflag; /* set option flag */ | |
823 | ||
824 | for(_p_tokpos=0;(c=line[*next]);++(*next)) /* main loop */ | |
825 | { | |
826 | if((qp=sindex(c,brkchar))>=0) /* break */ | |
827 | { | |
828 | switch(_p_state) | |
829 | { | |
830 | case IN_WHITE: /* these are the same here ... */ | |
831 | case IN_TOKEN: /* ... just get out */ | |
832 | case IN_OZONE: /* ditto */ | |
833 | ++(*next); | |
834 | *brkused=brkchar[qp]; | |
835 | goto byebye; | |
836 | ||
837 | case IN_QUOTE: /* just keep going */ | |
838 | chstore(token,tokmax,c); | |
839 | break; | |
840 | } | |
841 | } | |
842 | else if((qp=sindex(c,quote))>=0) /* quote */ | |
843 | { | |
844 | switch(_p_state) | |
845 | { | |
846 | case IN_WHITE: /* these are identical, */ | |
847 | _p_state=IN_QUOTE; /* change states */ | |
848 | _p_curquote=quote[qp]; /* save quote char */ | |
849 | *quoted=1; /* set to true as long as something is in quotes */ | |
850 | break; | |
851 | ||
852 | case IN_QUOTE: | |
853 | if(quote[qp]==_p_curquote) /* same as the beginning quote? */ | |
854 | { | |
855 | _p_state=IN_OZONE; | |
856 | _p_curquote=0; | |
857 | } | |
858 | else | |
859 | chstore(token,tokmax,c); /* treat as regular char */ | |
860 | break; | |
861 | ||
862 | case IN_TOKEN: | |
863 | case IN_OZONE: | |
864 | *brkused=c; /* uses quote as break char */ | |
865 | goto byebye; | |
866 | } | |
867 | } | |
868 | else if((qp=sindex(c,white))>=0) /* white */ | |
869 | { | |
870 | switch(_p_state) | |
871 | { | |
872 | case IN_WHITE: | |
873 | case IN_OZONE: | |
874 | break; /* keep going */ | |
875 | ||
876 | case IN_TOKEN: | |
877 | _p_state=IN_OZONE; | |
878 | break; | |
879 | ||
880 | case IN_QUOTE: | |
881 | chstore(token,tokmax,c); /* it's valid here */ | |
882 | break; | |
883 | } | |
884 | } | |
885 | else if(c==eschar) /* escape */ | |
886 | { | |
887 | nc=line[(*next)+1]; | |
888 | if(nc==0) /* end of line */ | |
889 | { | |
890 | *brkused=0; | |
891 | chstore(token,tokmax,c); | |
892 | ++(*next); | |
893 | goto byebye; | |
894 | } | |
895 | switch(_p_state) | |
896 | { | |
897 | case IN_WHITE: | |
898 | --(*next); | |
899 | _p_state=IN_TOKEN; | |
900 | break; | |
901 | ||
902 | case IN_TOKEN: | |
903 | case IN_QUOTE: | |
904 | ++(*next); | |
905 | chstore(token,tokmax,nc); | |
906 | break; | |
907 | ||
908 | case IN_OZONE: | |
909 | goto byebye; | |
910 | } | |
911 | } | |
912 | else /* anything else is just a real character */ | |
913 | { | |
914 | switch(_p_state) | |
915 | { | |
916 | case IN_WHITE: | |
917 | _p_state=IN_TOKEN; /* switch states */ | |
918 | ||
919 | case IN_TOKEN: /* these 2 are */ | |
920 | case IN_QUOTE: /* identical here */ | |
921 | chstore(token,tokmax,c); | |
922 | break; | |
923 | ||
924 | case IN_OZONE: | |
925 | goto byebye; | |
926 | } | |
927 | } | |
928 | } /* end of main loop */ | |
929 | ||
930 | byebye: | |
931 | token[_p_tokpos]=0; /* make sure token ends with EOS */ | |
932 | ||
933 | return 0; | |
934 | } |