]>
Commit | Line | Data |
---|---|---|
8414a40c VZ |
1 | /* $Id$ */ |
2 | ||
3 | #include "tif_config.h" | |
4 | ||
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
9 | ||
10 | #ifdef HAVE_STRINGS_H | |
11 | # include <strings.h> | |
12 | #endif | |
13 | ||
14 | #ifdef HAVE_IO_H | |
15 | # include <io.h> | |
16 | #endif | |
17 | ||
18 | #ifdef HAVE_FCNTL_H | |
19 | # include <fcntl.h> | |
20 | #endif | |
21 | ||
22 | #ifdef WIN32 | |
23 | #define STRNICMP strnicmp | |
24 | #else | |
25 | #define STRNICMP strncasecmp | |
26 | #endif | |
27 | ||
/*
 * Pairs a record number with the human-readable label that formatIPTC()
 * prints next to it.
 */
typedef struct _tag_spec
{
	short
		id;		/* record number looked up by formatIPTC() */

	char
		*name;		/* label written into the text form */
} tag_spec;

/*
 * Record numbers this tool has a label for.  The table is only ever read,
 * so it is const.
 */
static const tag_spec tags[] = {
	{ 5,"Image Name" },
	{ 7,"Edit Status" },
	{ 10,"Priority" },
	{ 15,"Category" },
	{ 20,"Supplemental Category" },
	{ 22,"Fixture Identifier" },
	{ 25,"Keyword" },
	{ 30,"Release Date" },
	{ 35,"Release Time" },
	{ 40,"Special Instructions" },
	{ 45,"Reference Service" },
	{ 47,"Reference Date" },
	{ 50,"Reference Number" },
	{ 55,"Created Date" },
	{ 60,"Created Time" },
	{ 65,"Originating Program" },
	{ 70,"Program Version" },
	{ 75,"Object Cycle" },
	{ 80,"Byline" },
	{ 85,"Byline Title" },
	{ 90,"City" },
	{ 95,"Province State" },
	{ 100,"Country Code" },
	{ 101,"Country" },
	{ 103,"Original Transmission Reference" },
	{ 105,"Headline" },
	{ 110,"Credit" },
	{ 115,"Source" },
	{ 116,"Copyright String" },
	{ 120,"Caption" },
	{ 121,"Local Caption" },
	{ 122,"Caption Writer" },
	{ 200,"Custom Field 1" },
	{ 201,"Custom Field 2" },
	{ 202,"Custom Field 3" },
	{ 203,"Custom Field 4" },
	{ 204,"Custom Field 5" },
	{ 205,"Custom Field 6" },
	{ 206,"Custom Field 7" },
	{ 207,"Custom Field 8" },
	{ 208,"Custom Field 9" },
	{ 209,"Custom Field 10" },
	{ 210,"Custom Field 11" },
	{ 211,"Custom Field 12" },
	{ 212,"Custom Field 13" },
	{ 213,"Custom Field 14" },
	{ 214,"Custom Field 15" },
	{ 215,"Custom Field 16" },
	{ 216,"Custom Field 17" },
	{ 217,"Custom Field 18" },
	{ 218,"Custom Field 19" },
	{ 219,"Custom Field 20" }
};
91 | ||
/*
 * Write the first `len` bytes of `s` to `ofile` as one double-quoted line.
 *
 * The output uses HTML conventions to preserve control characters and
 * such: '&' and '"' are written as named entities ('<' and '>' too when
 * HANDLE_GT_LT is defined), control characters as numeric character
 * references ("&#N;"), and every other byte is copied unchanged.
 */
void formatString(FILE *ofile, const char *s, int len)
{
	putc('"', ofile);
	for (; len > 0; --len, ++s) {
		/*
		 * Work on the unsigned value: passing a negative char to
		 * iscntrl() is undefined behaviour.
		 */
		const unsigned char c = (unsigned char) *s;

		switch (c) {
		case '&':
			fputs("&amp;", ofile);
			break;
#ifdef HANDLE_GT_LT
		case '<':
			fputs("&lt;", ofile);
			break;
		case '>':
			fputs("&gt;", ofile);
			break;
#endif
		case '"':
			fputs("&quot;", ofile);
			break;
		default:
			if (iscntrl(c))
				fprintf(ofile, "&#%d;", (int) c);
			else
				putc(c, ofile);
			break;
		}
	}
	fputs("\"\n", ofile);
}
126 | ||
/* One named HTML entity and the character it stands for. */
typedef struct _html_code
{
	short
		len;		/* number of characters in `code` */
	const char
		*code,		/* entity text, e.g. "&amp;" */
		val;		/* character the entity decodes to */
} html_code;

/* Named entities understood by convertHTMLcodes(). */
static const html_code html_codes[] = {
#ifdef HANDLE_GT_LT
	{ 4,"&lt;",'<' },
	{ 4,"&gt;",'>' },
#endif
	{ 5,"&amp;",'&' },
	{ 6,"&quot;",'"' }
};

/*
 * Case-insensitive test of whether `s` begins with the first `n`
 * characters of `code`.  Comparison stops at the first mismatch, so a
 * shorter `s` is never read past its terminator.
 */
static int htmlCodeMatches(const char *s, const char *code, int n)
{
	int i;

	for (i = 0; i < n; i++)
		if (tolower((unsigned char) s[i]) !=
		    tolower((unsigned char) code[i]))
			return 0;
	return 1;
}

/*
 * This routine converts the HTML escape sequence at the start of `s`
 * back to the original character, in place: the sequence is replaced by
 * the single decoded character and the rest of the string is moved up.
 *
 * s    - NUL-terminated string, expected to start at the '&'.
 * len  - number of characters of `s` that may be examined.
 *
 * Both numeric references ("&#N;", N a decimal byte value) and the named
 * entities in html_codes are recognised.  A sequence that is malformed,
 * unterminated or does not fit within `len` leaves `s` untouched.
 *
 * - returns the number of characters dropped (0 when nothing was decoded).
 */
int convertHTMLcodes(char *s, int len)
{
	if (len <= 0 || s == (char *) NULL || *s == '\0')
		return 0;

	if (s[1] == '#')
	{
		char *end;
		long val;
		int dropped;

		if (s[0] != '&')
			return 0;

		val = strtol(s + 2, &end, 10);
		/* need at least one digit and the closing ';' */
		if (end == s + 2 || *end != ';')
			return 0;
		/* the value has to fit in one byte (signed or unsigned) */
		if (val < -128 || val > 255)
			return 0;

		/* everything after the leading '&' up to and including ';' goes */
		dropped = (int) (end - s);
		if (dropped >= len)
			return 0;

		/* source and destination overlap, so memmove, not strcpy */
		memmove(s + 1, end + 1, strlen(end + 1) + 1);
		*s = (char) (unsigned char) val;
		return dropped;
	}
	else
	{
		const int codes = (int) (sizeof(html_codes) / sizeof(html_codes[0]));
		int i;

		for (i = 0; i < codes; i++)
		{
			const html_code *hc = &html_codes[i];

			if (hc->len <= len && htmlCodeMatches(s, hc->code, hc->len))
			{
				memmove(s + 1, s + hc->len, strlen(s + hc->len) + 1);
				*s = hc->val;
				return hc->len - 1;
			}
		}
	}

	return 0;
}
194 | ||
195 | int formatIPTC(FILE *ifile, FILE *ofile) | |
196 | { | |
197 | unsigned int | |
198 | foundiptc, | |
199 | tagsfound; | |
200 | ||
201 | unsigned char | |
202 | recnum, | |
203 | dataset; | |
204 | ||
205 | char | |
206 | *readable, | |
207 | *str; | |
208 | ||
209 | long | |
210 | tagindx, | |
211 | taglen; | |
212 | ||
213 | int | |
214 | i, | |
215 | tagcount = sizeof(tags) / sizeof(tag_spec); | |
216 | ||
217 | char | |
218 | c; | |
219 | ||
220 | foundiptc = 0; /* found the IPTC-Header */ | |
221 | tagsfound = 0; /* number of tags found */ | |
222 | ||
223 | c = getc(ifile); | |
224 | while (c != EOF) | |
225 | { | |
226 | if (c == 0x1c) | |
227 | foundiptc = 1; | |
228 | else | |
229 | { | |
230 | if (foundiptc) | |
231 | return -1; | |
232 | else | |
233 | continue; | |
234 | } | |
235 | ||
236 | /* we found the 0x1c tag and now grab the dataset and record number tags */ | |
237 | dataset = getc(ifile); | |
238 | if ((char) dataset == EOF) | |
239 | return -1; | |
240 | recnum = getc(ifile); | |
241 | if ((char) recnum == EOF) | |
242 | return -1; | |
243 | /* try to match this record to one of the ones in our named table */ | |
244 | for (i=0; i< tagcount; i++) | |
245 | { | |
246 | if (tags[i].id == recnum) | |
247 | break; | |
248 | } | |
249 | if (i < tagcount) | |
250 | readable = tags[i].name; | |
251 | else | |
252 | readable = ""; | |
253 | ||
254 | /* then we decode the length of the block that follows - long or short fmt */ | |
255 | c = getc(ifile); | |
256 | if (c == EOF) | |
257 | return 0; | |
258 | if (c & (unsigned char) 0x80) | |
259 | { | |
260 | unsigned char | |
261 | buffer[4]; | |
262 | ||
263 | for (i=0; i<4; i++) | |
264 | { | |
265 | c = buffer[i] = getc(ifile); | |
266 | if (c == EOF) | |
267 | return -1; | |
268 | } | |
269 | taglen = (((long) buffer[ 0 ]) << 24) | | |
270 | (((long) buffer[ 1 ]) << 16) | | |
271 | (((long) buffer[ 2 ]) << 8) | | |
272 | (((long) buffer[ 3 ])); | |
273 | } | |
274 | else | |
275 | { | |
276 | unsigned char | |
277 | x = c; | |
278 | ||
279 | taglen = ((long) x) << 8; | |
280 | x = getc(ifile); | |
281 | if ((char)x == EOF) | |
282 | return -1; | |
283 | taglen |= (long) x; | |
284 | } | |
285 | /* make a buffer to hold the tag data and snag it from the input stream */ | |
286 | str = (char *) malloc((unsigned int) (taglen+1)); | |
287 | if (str == (char *) NULL) | |
288 | { | |
289 | printf("Memory allocation failed"); | |
290 | return 0; | |
291 | } | |
292 | for (tagindx=0; tagindx<taglen; tagindx++) | |
293 | { | |
294 | c = str[tagindx] = getc(ifile); | |
295 | if (c == EOF) | |
80ed523f VZ |
296 | { |
297 | free(str); | |
298 | return -1; | |
299 | } | |
8414a40c VZ |
300 | } |
301 | str[ taglen ] = 0; | |
302 | ||
303 | /* now finish up by formatting this binary data into ASCII equivalent */ | |
304 | if (strlen(readable) > 0) | |
305 | fprintf(ofile, "%d#%d#%s=",(unsigned int)dataset, (unsigned int) recnum, readable); | |
306 | else | |
307 | fprintf(ofile, "%d#%d=",(unsigned int)dataset, (unsigned int) recnum); | |
308 | formatString( ofile, str, taglen ); | |
309 | free(str); | |
310 | ||
311 | tagsfound++; | |
312 | ||
313 | c = getc(ifile); | |
314 | } | |
315 | return tagsfound; | |
316 | } | |
317 | ||
/* Forward declaration; the definition follows main(), which calls it. */
int tokenizer(unsigned inflag,
	      char *token,
	      int tokmax,
	      char *line,
	      char *white,
	      char *brkchar,
	      char *quote,
	      char eschar,
	      char *brkused,
	      int *next,
	      char *quoted);
321 | ||
/*
 * Read one line from `file` into the heap buffer `b`, growing it as
 * needed.  The trailing '\n' is not stored.
 *
 * b     - buffer obtained from malloc()/realloc(), or NULL.  It may be
 *         moved or released here, so afterwards the caller must use only
 *         the pointer that is returned.
 * blen  - on entry the usable size of `b` in bytes; on return the length
 *         of the line including its terminating NUL, or 0 when NULL is
 *         returned.
 *
 * An empty line yields an empty string.  NULL is returned only when the
 * input is exhausted (EOF before any character was read) or memory cannot
 * be obtained; in both cases the buffer has been freed.
 */
char *super_fgets(char *b, int *blen, FILE *file)
{
	size_t cap = (b != NULL && *blen > 0) ? (size_t) *blen : 0;
	size_t used = 0;
	int c;

	*blen = 0;
	for (;;)
	{
		c = fgetc(file);
		if (c == EOF || c == '\n')
			break;

		/* always keep one byte in reserve for the terminating NUL */
		if (used + 1 >= cap)
		{
			size_t newcap;
			char *grown;

			/* the final length has to be reported through an int */
			if (cap > (size_t) INT_MAX / 2)
			{
				free(b);
				return (char *) NULL;
			}
			newcap = (cap < 8) ? 16 : cap * 2;
			grown = (char *) realloc(b, newcap);
			if (grown == (char *) NULL)
			{
				free(b);
				return (char *) NULL;
			}
			b = grown;
			cap = newcap;
		}
		b[used++] = (char) c;
	}

	/* EOF with nothing read is end of input; a bare '\n' is an empty line */
	if (c == EOF && used == 0)
	{
		free(b);
		return (char *) NULL;
	}

	/* empty line but no usable buffer was supplied */
	if (cap == 0)
	{
		char *grown = (char *) realloc(b, 1);

		if (grown == (char *) NULL)
		{
			free(b);
			return (char *) NULL;
		}
		b = grown;
	}

	b[used] = '\0';
	*blen = (int) (used + 1);
	return b;
}
365 | ||
#define BUFFER_SZ 4096

/*
 * Split the key part of a text line ("dataset#recnum[#name]") on '#' and
 * store the first two fields, converted with atoi(), in *dataset and
 * *recnum.  `scratch` (of `scratchlen` bytes) receives each field in turn.
 */
static void parseDatasetKey(char *key, char *scratch, int scratchlen,
			    unsigned char *dataset, unsigned char *recnum)
{
	char brkused, quoted;
	int field = 0;
	int next = 0;

	while (tokenizer(0, scratch, scratchlen, key, "", "#", "", 0,
			 &brkused, &next, &quoted) == 0)
	{
		if (field == 0)
			*dataset = (unsigned char) atoi(scratch);
		else if (field == 1)
			*recnum = (unsigned char) atoi(scratch);
		field++;
	}
}

/*
 * Decode the HTML escapes in `value` in place and write it to `ofile` as
 * one binary dataset: 0x1c, dataset, recnum, length, data.  `scratch` (of
 * `scratchlen` bytes) is work space for the tokenizer.
 */
static void emitDataset(FILE *ofile, unsigned char dataset,
			unsigned char recnum, char *value,
			char *scratch, int scratchlen)
{
	char brkused, quoted;
	int next = 0;
	unsigned long len = (unsigned long) strlen(value);

	/*
	 * Every '&' ends a token; value[next-1] is then that '&', which is
	 * where an escape sequence would start.
	 */
	while (tokenizer(0, scratch, scratchlen, value, "", "&", "", 0,
			 &brkused, &next, &quoted) == 0)
	{
		if (brkused && next > 0)
		{
			char *s = &value[next - 1];

			len -= (unsigned long) convertHTMLcodes(s, (int) strlen(s));
		}
	}

	fputc(0x1c, ofile);
	fputc(dataset, ofile);
	fputc(recnum, ofile);
	if (len < 0x8000)
	{
		/* short form: bit 15 must stay clear, it flags the long form */
		fputc((int) ((len >> 8) & 255), ofile);
		fputc((int) (len & 255), ofile);
	}
	else
	{
		/*
		 * Extended form (IPTC IIM): flag bit plus the count of length
		 * bytes (4), then the length itself, big-endian.
		 */
		fputc(0x80, ofile);
		fputc(0x04, ofile);
		fputc((int) ((len >> 24) & 255), ofile);
		fputc((int) ((len >> 16) & 255), ofile);
		fputc((int) ((len >> 8) & 255), ofile);
		fputc((int) (len & 255), ofile);
	}
	fwrite(value, 1, (size_t) len, ofile);
}

/*
 * iptcutil entry point: converts IPTC data between binary and text form.
 *
 * Options (introduced by '-' or '/'):
 *   -b       binary input, text output (the default)
 *   -t       text input, binary output
 *   -i file  read from `file` instead of stdin
 *   -o file  write to `file` instead of stdout
 * The open mode used for -i and -o depends on the conversion mode selected
 * so far, so -t / -b should come first on the command line.
 *
 * Returns 1 on a usage error, when a file cannot be opened or when a
 * working buffer cannot be allocated, and 0 otherwise.
 */
int main(int argc, char *argv[])
{
	int
		i,
		mode = 0;	/* iptc binary (0), or iptc text (1) */

	FILE
		*ifile = stdin,
		*ofile = stdout;

	const char
		*usage = "usage: iptcutil -t | -b [-i file] [-o file] <input >output";

	if (argc < 2)
	{
		puts(usage);
		return 1;
	}

	for (i = 1; i < argc; i++)
	{
		const char *arg = argv[i];

		if (arg[0] != '-' && arg[0] != '/')
		{
			puts(usage);
			return 1;
		}

		switch (arg[1])
		{
		case 't':
			mode = 1;
#ifdef WIN32
			/* Set "stdout" to binary mode: */
			_setmode( _fileno( ofile ), _O_BINARY );
#endif
			break;
		case 'b':
			mode = 0;
#ifdef WIN32
			/* Set "stdin" to binary mode: */
			_setmode( _fileno( ifile ), _O_BINARY );
#endif
			break;
		case 'i':
			/* the file name is the next argument; it must exist */
			if (i + 1 >= argc)
			{
				puts(usage);
				return 1;
			}
			ifile = fopen(argv[++i], mode == 0 ? "rb" : "rt");
			if (ifile == (FILE *) NULL)
			{
				printf("Unable to open: %s\n", argv[i]);
				return 1;
			}
			break;
		case 'o':
			if (i + 1 >= argc)
			{
				puts(usage);
				return 1;
			}
			ofile = fopen(argv[++i], mode == 0 ? "wt" : "wb");
			if (ofile == (FILE *) NULL)
			{
				printf("Unable to open: %s\n", argv[i]);
				return 1;
			}
			break;
		default:
			printf("Unknown option: %s\n", argv[i]);
			return 1;
		}
	}

	if (mode == 0) /* handle binary iptc info */
		formatIPTC(ifile, ofile);

	if (mode == 1) /* handle text form of iptc info */
	{
		/* carried over from one line to the next, as before */
		unsigned char
			recnum = 0,
			dataset = 0;

		int
			inputlen = BUFFER_SZ;

		char
			*line = (char *) malloc((size_t) inputlen);

		if (line == (char *) NULL)
		{
			fprintf(stderr, "Memory allocation failed\n");
			return 1;
		}

		while ((line = super_fgets(line, &inputlen, ifile)) != NULL)
		{
			char brkused, quoted;
			int state = 0;	/* 0: key before '=', 1: quoted value */
			int next = 0;
			char *token = (char *) malloc((size_t) inputlen);
			char *newstr = (char *) malloc((size_t) inputlen);

			if (token == (char *) NULL || newstr == (char *) NULL)
			{
				fprintf(stderr, "Memory allocation failed\n");
				free(token);
				free(newstr);
				free(line);
				return 1;
			}

			while (tokenizer(0, token, inputlen, line, "", "=", "\"", 0,
					 &brkused, &next, &quoted) == 0)
			{
				if (state == 0)
					parseDatasetKey(token, newstr, inputlen,
							&dataset, &recnum);
				else if (state == 1)
					emitDataset(ofile, dataset, recnum, token,
						    newstr, inputlen);
				state++;
			}
			free(token);
			free(newstr);
		}
		free(line);
	}

	fclose( ifile );
	fclose( ofile );

	return 0;
}
573 | ||
574 | /* | |
575 | This routine is a generalized, finite state token parser. It allows | |
576 | you extract tokens one at a time from a string of characters. The | |
577 | characters used for white space, for break characters, and for quotes | |
578 | can be specified. Also, characters in the string can be preceded by | |
579 | a specifiable escape character which removes any special meaning the | |
580 | character may have. | |
581 | ||
582 | There are a lot of formal parameters in this subroutine call, but | |
583 | once you get familiar with them, this routine is fairly easy to use. | |
584 | "#define" macros can be used to generate simpler looking calls for | |
585 | commonly used applications of this routine. | |
586 | ||
587 | First, some terminology: | |
588 | ||
589 | token: used here, a single unit of information in | |
590 | the form of a group of characters. | |
591 | ||
592 | white space: space that gets ignored (except within quotes | |
593 | or when escaped), like blanks and tabs. in | |
594 | addition, white space terminates a non-quoted | |
595 | token. | |
596 | ||
597 | break character: a character that separates non-quoted tokens. | |
598 | commas are a common break character. the | |
599 | usage of break characters to signal the end | |
600 | of a token is the same as that of white space, | |
601 | except multiple break characters with nothing | |
602 | or only white space between generate a null | |
603 | token for each two break characters together. | |
604 | ||
605 | for example, if blank is set to be the white | |
606 | space and comma is set to be the break | |
607 | character, the line ... | |
608 | ||
609 | A, B, C , , DEF | |
610 | ||
611 | ... consists of 5 tokens: | |
612 | ||
613 | 1) "A" | |
614 | 2) "B" | |
615 | 3) "C" | |
616 | 4) "" (the null string) | |
617 | 5) "DEF" | |
618 | ||
619 | quote character: a character that, when surrounding a group | |
620 | of other characters, causes the group of | |
621 | characters to be treated as a single token, | |
622 | no matter how many white spaces or break | |
623 | characters exist in the group. also, a | |
624 | token always terminates after the closing | |
625 | quote. for example, if ' is the quote | |
626 | character, blank is white space, and comma | |
627 | is the break character, the following | |
628 | string ... | |
629 | ||
630 | A, ' B, CD'EF GHI | |
631 | ||
632 | ... consists of 4 tokens: | |
633 | ||
634 | 1) "A" | |
635 | 2) " B, CD" (note the blanks & comma) | |
636 | 3) "EF" | |
637 | 4) "GHI" | |
638 | ||
639 | the quote characters themselves do | |
640 | not appear in the resultant tokens. the | |
641 | double quotes are delimiters i use here for | |
642 | documentation purposes only. | |
643 | ||
644 | escape character: a character which itself is ignored but | |
645 | which causes the next character to be | |
646 | used as is. ^ and \ are often used as | |
647 | escape characters. an escape in the last | |
648 | position of the string gets treated as a | |
649 | "normal" (i.e., non-quote, non-white, | |
650 | non-break, and non-escape) character. | |
651 | for example, assume white space, break | |
652 | character, and quote are the same as in the | |
653 | above examples, and further, assume that | |
654 | ^ is the escape character. then, in the | |
655 | string ... | |
656 | ||
657 | ABC, ' DEF ^' GH' I ^ J K^ L ^ | |
658 | ||
659 | ... there are 7 tokens: | |
660 | ||
661 | 1) "ABC" | |
662 | 2) " DEF ' GH" | |
663 | 3) "I" | |
664 | 4) " " (a lone blank) | |
665 | 5) "J" | |
666 | 6) "K L" | |
667 | 7) "^" (passed as is at end of line) | |
668 | ||
669 | ||
670 | OK, now that you have this background, here's how to call "tokenizer": | |
671 | ||
672 | result=tokenizer(flag,token,maxtok,string,white,break,quote,escape, | |
673 | brkused,next,quoted) | |
674 | ||
675 | result: 0 if we haven't reached EOS (end of string), and | |
676 | 1 if we have (this is an "int"). | |
677 | ||
678 | flag: right now, only the low order 2 bits are used. | |
679 | 1 => convert non-quoted tokens to upper case | |
680 | 2 => convert non-quoted tokens to lower case | |
681 | 0 => do not convert non-quoted tokens | |
682 | (this is an "unsigned"). | |
683 | ||
684 | token: a character string containing the returned next token | |
685 | (this is a "char[]"). | |
686 | ||
687 | maxtok: the maximum size of "token". characters beyond | |
688 | "maxtok" are truncated (this is an "int"). | |
689 | ||
690 | string: the string to be parsed (this is a "char[]"). | |
691 | ||
692 | white: a string of the valid white spaces. example: | |
693 | ||
694 | char whitesp[]={" \t"}; | |
695 | ||
696 | blank and tab will be valid white space (this is | |
697 | a "char[]"). | |
698 | ||
699 | break: a string of the valid break characters. example: | |
700 | ||
701 | char breakch[]={";,"}; | |
702 | ||
703 | semicolon and comma will be valid break characters | |
704 | (this is a "char[]"). | |
705 | ||
706 | IMPORTANT: do not use the name "break" as a C | |
707 | variable, as this is a reserved word in C. | |
708 | ||
709 | quote: a string of the valid quote characters. an example | |
710 | would be | |
711 | ||
712 | char quotech[]={"'\""}; | |
713 | ||
714 | (this causes single and double quotes to be valid) | |
715 | note that a token starting with one of these characters | |
716 | needs the same quote character to terminate it. | |
717 | ||
718 | for example, | |
719 | ||
720 | "ABC ' | |
721 | ||
722 | is unterminated, but | |
723 | ||
724 | "DEF" and 'GHI' | |
725 | ||
726 | are properly terminated. note that different quote | |
727 | characters can appear on the same line; only for | |
728 | a given token do the quote characters have to be | |
729 | the same (this is a "char[]"). | |
730 | ||
731 | escape: the escape character (NOT a string ... only one | |
732 | allowed). use zero if none is desired (this is | |
733 | a "char"). | |
734 | ||
735 | brkused: the break character used to terminate the current | |
736 | token. if the token was quoted, this will be the | |
737 | quote used. if the token is the last one on the | |
738 | line, this will be zero (this is a pointer to a | |
739 | "char"). | |
740 | ||
741 | next: this variable points to the first character of the | |
742 | next token. it gets reset by "tokenizer" as it steps | |
743 | through the string. set it to 0 upon initialization, | |
744 | and leave it alone after that. you can change it | |
745 | if you want to jump around in the string or re-parse | |
746 | from the beginning, but be careful (this is a | |
747 | pointer to an "int"). | |
748 | ||
749 | quoted: set to 1 (true) if the token was quoted and 0 (false) | |
750 | if not. you may need this information (for example: | |
751 | in C, a string with quotes around it is a character | |
752 | string, while one without is an identifier). | |
753 | ||
754 | (this is a pointer to a "char"). | |
755 | */ | |
756 | ||
/* states */

enum
{
	IN_WHITE = 0,
	IN_TOKEN = 1,
	IN_QUOTE = 2,
	IN_OZONE = 3
};

/*
 * Parser state shared by "tokenizer" and "chstore".  Nothing outside this
 * file needs these objects, so they have internal linkage.
 * NOTE(review): file-scope names that start with an underscore are
 * reserved identifiers in C; renaming them means touching every user.
 */
static int _p_state;		/* current state */
static unsigned _p_flag;	/* option flag */
static char _p_curquote;	/* current quote char */
static int _p_tokpos;		/* current token pos */
768 | ||
/*
 * routine to find character in string ... used only by "tokenizer"
 *
 * Returns the zero-based position of the first occurrence of `ch` in the
 * NUL-terminated `string`, or -1 when it does not occur.  The terminator
 * is never matched, so ch == '\0' always yields -1.  The string is only
 * read, hence the const qualifier.
 */
int sindex(char ch, const char *string)
{
	const char *cp;

	for (cp = string; *cp != '\0'; ++cp)
		if (*cp == ch)
			return (int) (cp - string);	/* return position of character */
	return -1;				/* eol ... no match found */
}
779 | ||
780 | /* routine to store a character in a string ... used only by "tokenizer" */ | |
781 | ||
782 | void chstore(char *string,int max,char ch) | |
783 | { | |
784 | char c; | |
785 | if(_p_tokpos>=0&&_p_tokpos<max-1) | |
786 | { | |
787 | if(_p_state==IN_QUOTE) | |
788 | c=ch; | |
789 | else | |
790 | switch(_p_flag&3) | |
791 | { | |
792 | case 1: /* convert to upper */ | |
793 | c=toupper(ch); | |
794 | break; | |
795 | ||
796 | case 2: /* convert to lower */ | |
797 | c=tolower(ch); | |
798 | break; | |
799 | ||
800 | default: /* use as is */ | |
801 | c=ch; | |
802 | break; | |
803 | } | |
804 | string[_p_tokpos++]=c; | |
805 | } | |
806 | return; | |
807 | } | |
808 | ||
809 | int tokenizer(unsigned inflag,char *token,int tokmax,char *line, | |
810 | char *white,char *brkchar,char *quote,char eschar,char *brkused, | |
811 | int *next,char *quoted) | |
812 | { | |
813 | int qp; | |
814 | char c,nc; | |
815 | ||
816 | *brkused=0; /* initialize to null */ | |
817 | *quoted=0; /* assume not quoted */ | |
818 | ||
819 | if(!line[*next]) /* if we're at end of line, indicate such */ | |
820 | return 1; | |
821 | ||
822 | _p_state=IN_WHITE; /* initialize state */ | |
823 | _p_curquote=0; /* initialize previous quote char */ | |
824 | _p_flag=inflag; /* set option flag */ | |
825 | ||
826 | for(_p_tokpos=0;(c=line[*next]);++(*next)) /* main loop */ | |
827 | { | |
828 | if((qp=sindex(c,brkchar))>=0) /* break */ | |
829 | { | |
830 | switch(_p_state) | |
831 | { | |
832 | case IN_WHITE: /* these are the same here ... */ | |
833 | case IN_TOKEN: /* ... just get out */ | |
834 | case IN_OZONE: /* ditto */ | |
835 | ++(*next); | |
836 | *brkused=brkchar[qp]; | |
837 | goto byebye; | |
838 | ||
839 | case IN_QUOTE: /* just keep going */ | |
840 | chstore(token,tokmax,c); | |
841 | break; | |
842 | } | |
843 | } | |
844 | else if((qp=sindex(c,quote))>=0) /* quote */ | |
845 | { | |
846 | switch(_p_state) | |
847 | { | |
848 | case IN_WHITE: /* these are identical, */ | |
849 | _p_state=IN_QUOTE; /* change states */ | |
850 | _p_curquote=quote[qp]; /* save quote char */ | |
851 | *quoted=1; /* set to true as long as something is in quotes */ | |
852 | break; | |
853 | ||
854 | case IN_QUOTE: | |
855 | if(quote[qp]==_p_curquote) /* same as the beginning quote? */ | |
856 | { | |
857 | _p_state=IN_OZONE; | |
858 | _p_curquote=0; | |
859 | } | |
860 | else | |
861 | chstore(token,tokmax,c); /* treat as regular char */ | |
862 | break; | |
863 | ||
864 | case IN_TOKEN: | |
865 | case IN_OZONE: | |
866 | *brkused=c; /* uses quote as break char */ | |
867 | goto byebye; | |
868 | } | |
869 | } | |
870 | else if((qp=sindex(c,white))>=0) /* white */ | |
871 | { | |
872 | switch(_p_state) | |
873 | { | |
874 | case IN_WHITE: | |
875 | case IN_OZONE: | |
876 | break; /* keep going */ | |
877 | ||
878 | case IN_TOKEN: | |
879 | _p_state=IN_OZONE; | |
880 | break; | |
881 | ||
882 | case IN_QUOTE: | |
883 | chstore(token,tokmax,c); /* it's valid here */ | |
884 | break; | |
885 | } | |
886 | } | |
887 | else if(c==eschar) /* escape */ | |
888 | { | |
889 | nc=line[(*next)+1]; | |
890 | if(nc==0) /* end of line */ | |
891 | { | |
892 | *brkused=0; | |
893 | chstore(token,tokmax,c); | |
894 | ++(*next); | |
895 | goto byebye; | |
896 | } | |
897 | switch(_p_state) | |
898 | { | |
899 | case IN_WHITE: | |
900 | --(*next); | |
901 | _p_state=IN_TOKEN; | |
902 | break; | |
903 | ||
904 | case IN_TOKEN: | |
905 | case IN_QUOTE: | |
906 | ++(*next); | |
907 | chstore(token,tokmax,nc); | |
908 | break; | |
909 | ||
910 | case IN_OZONE: | |
911 | goto byebye; | |
912 | } | |
913 | } | |
914 | else /* anything else is just a real character */ | |
915 | { | |
916 | switch(_p_state) | |
917 | { | |
918 | case IN_WHITE: | |
919 | _p_state=IN_TOKEN; /* switch states */ | |
920 | ||
921 | case IN_TOKEN: /* these 2 are */ | |
922 | case IN_QUOTE: /* identical here */ | |
923 | chstore(token,tokmax,c); | |
924 | break; | |
925 | ||
926 | case IN_OZONE: | |
927 | goto byebye; | |
928 | } | |
929 | } | |
930 | } /* end of main loop */ | |
931 | ||
932 | byebye: | |
933 | token[_p_tokpos]=0; /* make sure token ends with EOS */ | |
934 | ||
935 | return 0; | |
936 | } | |
80ed523f VZ |
937 | /* |
938 | * Local Variables: | |
939 | * mode: c | |
940 | * c-basic-offset: 8 | |
941 | * fill-column: 78 | |
942 | * End: | |
943 | */ |