]> git.saurik.com Git - wxWidgets.git/blob - src/tiff/contrib/iptcutil/iptcutil.c
Use wxMarkupParser in wxStaticText for dealing with markup.
[wxWidgets.git] / src / tiff / contrib / iptcutil / iptcutil.c
1 /* $Id$ */
2
3 #include "tif_config.h"
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include <memory.h>
9 #include <ctype.h>
10
11 #ifdef HAVE_STRINGS_H
12 # include <strings.h>
13 #endif
14
15 #ifdef HAVE_IO_H
16 # include <io.h>
17 #endif
18
19 #ifdef HAVE_FCNTL_H
20 # include <fcntl.h>
21 #endif
22
/*
 * STRNICMP(a, b, n): case-insensitive comparison of at most n characters.
 * Expands to strnicmp when WIN32 is defined and to strncasecmp otherwise.
 */
#if defined(WIN32)
#  define STRNICMP strnicmp
#else
#  define STRNICMP strncasecmp
#endif
28
/* One entry of the tag-name table: a numeric tag id and its printable label. */
typedef struct _tag_spec
{
  short id;    /* numeric tag id */
  char *name;  /* human-readable label for that id */
} tag_spec;
37
38 static tag_spec tags[] = {
39 { 5,"Image Name" },
40 { 7,"Edit Status" },
41 { 10,"Priority" },
42 { 15,"Category" },
43 { 20,"Supplemental Category" },
44 { 22,"Fixture Identifier" },
45 { 25,"Keyword" },
46 { 30,"Release Date" },
47 { 35,"Release Time" },
48 { 40,"Special Instructions" },
49 { 45,"Reference Service" },
50 { 47,"Reference Date" },
51 { 50,"Reference Number" },
52 { 55,"Created Date" },
53 { 60,"Created Time" },
54 { 65,"Originating Program" },
55 { 70,"Program Version" },
56 { 75,"Object Cycle" },
57 { 80,"Byline" },
58 { 85,"Byline Title" },
59 { 90,"City" },
60 { 95,"Province State" },
61 { 100,"Country Code" },
62 { 101,"Country" },
63 { 103,"Original Transmission Reference" },
64 { 105,"Headline" },
65 { 110,"Credit" },
66 { 115,"Source" },
67 { 116,"Copyright String" },
68 { 120,"Caption" },
69 { 121,"Local Caption" },
70 { 122,"Caption Writer" },
71 { 200,"Custom Field 1" },
72 { 201,"Custom Field 2" },
73 { 202,"Custom Field 3" },
74 { 203,"Custom Field 4" },
75 { 204,"Custom Field 5" },
76 { 205,"Custom Field 6" },
77 { 206,"Custom Field 7" },
78 { 207,"Custom Field 8" },
79 { 208,"Custom Field 9" },
80 { 209,"Custom Field 10" },
81 { 210,"Custom Field 11" },
82 { 211,"Custom Field 12" },
83 { 212,"Custom Field 13" },
84 { 213,"Custom Field 14" },
85 { 214,"Custom Field 15" },
86 { 215,"Custom Field 16" },
87 { 216,"Custom Field 17" },
88 { 217,"Custom Field 18" },
89 { 218,"Custom Field 19" },
90 { 219,"Custom Field 20" }
91 };
92
/*
 * Write the first len bytes of s to ofile as one double-quoted line.
 *
 * We format the output using HTML conventions to preserve control
 * characters and such: '&' becomes "&amp;", '"' becomes "&quot;", any
 * byte for which iscntrl() is true becomes "&#<decimal>;", and, only
 * when HANDLE_GT_LT is defined, '<' and '>' become "&lt;" and "&gt;".
 * Every other byte is copied unchanged.  The closing quote is followed
 * by a newline.
 *
 *   ofile  stream to write to
 *   s      data to format (need not be NUL-terminated)
 *   len    number of bytes of s to emit; nothing but the quotes and the
 *          newline is written when len <= 0
 */
void formatString(FILE *ofile, const char *s, int len)
{
  putc('"', ofile);
  for (; len > 0; --len, ++s) {
    /* Convert through unsigned char: handing a negative value (other
       than EOF) to iscntrl() is undefined behaviour, and plain char
       may be signed. */
    int c = (unsigned char) *s;

    switch (c) {
    case '&':
      fputs("&amp;", ofile);
      break;
#ifdef HANDLE_GT_LT
    case '<':
      fputs("&lt;", ofile);
      break;
    case '>':
      fputs("&gt;", ofile);
      break;
#endif
    case '"':
      fputs("&quot;", ofile);
      break;
    default:
      if (iscntrl(c))
        fprintf(ofile, "&#%d;", c);
      else
        putc(c, ofile);
      break;
    }
  }
  fputs("\"\n", ofile);
}
127
/* One named HTML escape: the escape text, its length, and the character
   it stands for. */
typedef struct _html_code
{
  short len;         /* number of characters in code */
  const char *code;  /* escape sequence text, e.g. "&amp;" */
  const char val;    /* character the sequence represents */
} html_code;
136
137 static html_code html_codes[] = {
138 #ifdef HANDLE_GT_LT
139 { 4,"&lt;",'<' },
140 { 4,"&gt;",'>' },
141 #endif
142 { 5,"&amp;",'&' },
143 { 6,"&quot;",'"' }
144 };
145
146 /*
147 * This routine converts HTML escape sequence
148 * back to the original ASCII representation.
149 * - returns the number of characters dropped.
150 */
151 int convertHTMLcodes(char *s, int len)
152 {
153 if (len <=0 || s==(char*)NULL || *s=='\0')
154 return 0;
155
156 if (s[1] == '#')
157 {
158 int val, o;
159
160 if (sscanf(s,"&#%d;",&val) == 1)
161 {
162 o = 3;
163 while (s[o] != ';')
164 {
165 o++;
166 if (o > 5)
167 break;
168 }
169 if (o < 5)
170 strcpy(s+1, s+1+o);
171 *s = val;
172 return o;
173 }
174 }
175 else
176 {
177 int
178 i,
179 codes = sizeof(html_codes) / sizeof(html_code);
180
181 for (i=0; i < codes; i++)
182 {
183 if (html_codes[i].len <= len)
184 if (STRNICMP(s, html_codes[i].code, html_codes[i].len) == 0)
185 {
186 strcpy(s+1, s+html_codes[i].len);
187 *s = html_codes[i].val;
188 return html_codes[i].len-1;
189 }
190 }
191 }
192
193 return 0;
194 }
195
196 int formatIPTC(FILE *ifile, FILE *ofile)
197 {
198 unsigned int
199 foundiptc,
200 tagsfound;
201
202 unsigned char
203 recnum,
204 dataset;
205
206 char
207 *readable,
208 *str;
209
210 long
211 tagindx,
212 taglen;
213
214 int
215 i,
216 tagcount = sizeof(tags) / sizeof(tag_spec);
217
218 char
219 c;
220
221 foundiptc = 0; /* found the IPTC-Header */
222 tagsfound = 0; /* number of tags found */
223
224 c = getc(ifile);
225 while (c != EOF)
226 {
227 if (c == 0x1c)
228 foundiptc = 1;
229 else
230 {
231 if (foundiptc)
232 return -1;
233 else
234 continue;
235 }
236
237 /* we found the 0x1c tag and now grab the dataset and record number tags */
238 dataset = getc(ifile);
239 if ((char) dataset == EOF)
240 return -1;
241 recnum = getc(ifile);
242 if ((char) recnum == EOF)
243 return -1;
244 /* try to match this record to one of the ones in our named table */
245 for (i=0; i< tagcount; i++)
246 {
247 if (tags[i].id == recnum)
248 break;
249 }
250 if (i < tagcount)
251 readable = tags[i].name;
252 else
253 readable = "";
254
255 /* then we decode the length of the block that follows - long or short fmt */
256 c = getc(ifile);
257 if (c == EOF)
258 return 0;
259 if (c & (unsigned char) 0x80)
260 {
261 unsigned char
262 buffer[4];
263
264 for (i=0; i<4; i++)
265 {
266 c = buffer[i] = getc(ifile);
267 if (c == EOF)
268 return -1;
269 }
270 taglen = (((long) buffer[ 0 ]) << 24) |
271 (((long) buffer[ 1 ]) << 16) |
272 (((long) buffer[ 2 ]) << 8) |
273 (((long) buffer[ 3 ]));
274 }
275 else
276 {
277 unsigned char
278 x = c;
279
280 taglen = ((long) x) << 8;
281 x = getc(ifile);
282 if ((char)x == EOF)
283 return -1;
284 taglen |= (long) x;
285 }
286 /* make a buffer to hold the tag data and snag it from the input stream */
287 str = (char *) malloc((unsigned int) (taglen+1));
288 if (str == (char *) NULL)
289 {
290 printf("Memory allocation failed");
291 return 0;
292 }
293 for (tagindx=0; tagindx<taglen; tagindx++)
294 {
295 c = str[tagindx] = getc(ifile);
296 if (c == EOF)
297 return -1;
298 }
299 str[ taglen ] = 0;
300
301 /* now finish up by formatting this binary data into ASCII equivalent */
302 if (strlen(readable) > 0)
303 fprintf(ofile, "%d#%d#%s=",(unsigned int)dataset, (unsigned int) recnum, readable);
304 else
305 fprintf(ofile, "%d#%d=",(unsigned int)dataset, (unsigned int) recnum);
306 formatString( ofile, str, taglen );
307 free(str);
308
309 tagsfound++;
310
311 c = getc(ifile);
312 }
313 return tagsfound;
314 }
315
/* Forward declaration: tokenizer() is defined after main(), which calls it. */
int tokenizer(unsigned inflag,
              char *token,
              int tokmax,
              char *line,
              char *white,
              char *brkchar,
              char *quote,
              char eschar,
              char *brkused,
              int *next,
              char *quoted);
319
/*
 * Read one line of any length from file into a heap buffer that is
 * grown on demand.
 *
 *   b     buffer obtained from malloc()/realloc(); it may be
 *         reallocated, so only the returned pointer is valid afterwards
 *   blen  in:  number of bytes usable in b
 *         out: length of the line plus one for the terminator, or 0
 *              when NULL is returned
 *   file  stream to read from
 *
 * Reading stops at '\n' (which is not stored) or at end of file.
 *
 * Returns the NUL-terminated line.  Returns NULL when no character was
 * read before the newline or end of file (so an empty line is reported
 * the same way as end of input), when b is NULL, or when the buffer
 * cannot be grown.  Whenever NULL is returned the buffer has already
 * been released, so the caller must not free or reuse b.
 */
char *super_fgets(char *b, int *blen, FILE *file)
{
  /* Growth stops here so that every size still fits in an int. */
  const size_t
    max_cap = (size_t) 1 << 30;

  size_t
    cap,
    used = 0;

  int
    c;

  /* A non-positive size is treated as "grow before the first store". */
  cap = (*blen > 0) ? (size_t) *blen : 1;
  *blen = 0;
  if (b == (char *) NULL)
    return (char *) NULL;

  while ((c = fgetc(file)) != EOF && c != '\n')
    {
      if (used + 1 >= cap)
        {
          char
            *grown;

          if (cap > max_cap)
            {
              free(b);
              return (char *) NULL;
            }
          cap <<= 1;
          /* Keep the old pointer until realloc() has succeeded. */
          grown = (char *) realloc(b, cap + 2);
          if (grown == (char *) NULL)
            {
              free(b);
              return (char *) NULL;
            }
          b = grown;
        }
      b[used++] = (char) c;
    }

  if (used == 0)
    {
      free(b);
      return (char *) NULL;
    }
  b[used] = '\0';
  *blen = (int) (used + 1);
  return b;
}
363
/* Initial size of the line buffer used in text mode. */
#define BUFFER_SZ 4096

/*
 * Command-line entry point.
 *
 * Options (introduced by '-' or '/'):
 *   -b       binary records in, text out (the default mode)
 *   -t       text in, binary records out
 *   -i file  read from file instead of stdin
 *   -o file  write to file instead of stdout
 *
 * The mode in effect when -i / -o is parsed selects the fopen() mode
 * string, so -t / -b has to come before them to have an effect on it.
 *
 * Returns 1 on a usage error, an unknown option, a file that cannot be
 * opened or an allocation failure, and 0 otherwise.  The result of
 * formatIPTC() does not influence the exit status.
 */
int main(int argc, char *argv[])
{
  int
    i,
    mode; /* iptc binary, or iptc text */

  FILE
    *ifile = stdin,
    *ofile = stdout;

  char
    c;

  const char
    *usage = "usage: iptcutil -t | -b [-i file] [-o file] <input >output";

  if (argc < 2)
    {
      /* Never hand a non-literal string to printf() as the format. */
      printf("%s", usage);
      return 1;
    }

  mode = 0;

  for (i = 1; i < argc; i++)
    {
      c = argv[i][0];
      if (c == '-' || c == '/')
        {
          c = argv[i][1];
          switch (c)
            {
            case 't':
              mode = 1;
#ifdef WIN32
              /* Set "stdout" to binary mode: */
              _setmode( _fileno( ofile ), _O_BINARY );
#endif
              break;
            case 'b':
              mode = 0;
#ifdef WIN32
              /* Set "stdin" to binary mode: */
              _setmode( _fileno( ifile ), _O_BINARY );
#endif
              break;
            case 'i':
              /* The option needs a file name after it. */
              if (i + 1 >= argc)
                {
                  printf("%s", usage);
                  return 1;
                }
              if (mode == 0)
                ifile = fopen(argv[++i], "rb");
              else
                ifile = fopen(argv[++i], "rt");
              if (ifile == (FILE *) NULL)
                {
                  printf("Unable to open: %s\n", argv[i]);
                  return 1;
                }
              break;
            case 'o':
              /* The option needs a file name after it. */
              if (i + 1 >= argc)
                {
                  printf("%s", usage);
                  return 1;
                }
              if (mode == 0)
                ofile = fopen(argv[++i], "wt");
              else
                ofile = fopen(argv[++i], "wb");
              if (ofile == (FILE *) NULL)
                {
                  printf("Unable to open: %s\n", argv[i]);
                  return 1;
                }
              break;
            default:
              printf("Unknown option: %s\n", argv[i]);
              return 1;
            }
        }
      else
        {
          printf("%s", usage);
          return 1;
        }
    }

  if (mode == 0) /* handle binary iptc info */
    formatIPTC(ifile, ofile);

  if (mode == 1) /* handle text form of iptc info */
    {
      char
        brkused,
        quoted,
        *line,
        *token,
        *newstr;

      int
        state,
        next;

      unsigned char
        recnum = 0,
        dataset = 0;

      int
        inputlen = BUFFER_SZ;

      line = (char *) malloc(inputlen);
      if (line == (char *) NULL)
        {
          printf("Memory allocation failed\n");
          return 1;
        }

      /* super_fgets() may move the buffer, so always continue with the
         pointer it returns. */
      while ((line = super_fgets(line, &inputlen, ifile)) != NULL)
        {
          state = 0;
          next = 0;

          token = (char *) malloc(inputlen);
          newstr = (char *) malloc(inputlen);
          if (token == (char *) NULL || newstr == (char *) NULL)
            {
              printf("Memory allocation failed\n");
              free(token);
              free(newstr);
              free(line);
              return 1;
            }

          /* Split the line at '=': the first token is the key, the
             second the double-quoted value. */
          while (tokenizer(0, token, inputlen, line, "", "=", "\"", 0,
                           &brkused, &next, &quoted) == 0)
            {
              if (state == 0)
                {
                  /* Key: '#'-separated fields; the first two are the
                     numbers written after the 0x1c marker. */
                  int
                    field = 0,
                    key_pos = 0;

                  char
                    key_brk,
                    key_quoted;

                  while (tokenizer(0, newstr, inputlen, token, "", "#", "", 0,
                                   &key_brk, &key_pos, &key_quoted) == 0)
                    {
                      if (field == 0)
                        dataset = (unsigned char) atoi(newstr);
                      else if (field == 1)
                        recnum = (unsigned char) atoi(newstr);
                      field++;
                    }
                }
              else if (state == 1)
                {
                  int
                    val_pos = 0;

                  unsigned long
                    len,
                    k;

                  char
                    val_brk,
                    val_quoted;

                  /* Value: undo the HTML escapes in place.  Each '&'
                     found by the tokenizer marks a candidate escape. */
                  len = strlen(token);
                  while (tokenizer(0, newstr, inputlen, token, "", "&", "", 0,
                                   &val_brk, &val_pos, &val_quoted) == 0)
                    {
                      if (val_brk && val_pos > 0)
                        {
                          char
                            *s = &token[val_pos - 1];

                          len -= convertHTMLcodes(s, (int) strlen(s));
                        }
                    }

                  fputc(0x1c, ofile);
                  fputc(dataset, ofile);
                  fputc(recnum, ofile);
                  /* NOTE(review): lengths 0x8000..0xFFFF go out in the
                     two-byte form with the top bit set, which
                     formatIPTC() would take for the long form - confirm
                     the intended encoding before changing it. */
                  if (len < 0x10000)
                    {
                      fputc((len >> 8) & 255, ofile);
                      fputc(len & 255, ofile);
                    }
                  else
                    {
                      fputc(((len >> 24) & 255) | 0x80, ofile);
                      fputc((len >> 16) & 255, ofile);
                      fputc((len >> 8) & 255, ofile);
                      fputc(len & 255, ofile);
                    }
                  for (k = 0; k < len; k++)
                    fputc(token[k], ofile);
                }
              state++;
            }
          free(token);
          free(newstr);
        }
      free(line);
    }

  fclose( ifile );
  fclose( ofile );

  return 0;
}
571
572 /*
573 This routine is a generalized, finite state token parser. It allows
574 you extract tokens one at a time from a string of characters. The
575 characters used for white space, for break characters, and for quotes
576 can be specified. Also, characters in the string can be preceded by
577 a specifiable escape character which removes any special meaning the
578 character may have.
579
580 There are a lot of formal parameters in this subroutine call, but
581 once you get familiar with them, this routine is fairly easy to use.
582 "#define" macros can be used to generate simpler looking calls for
583 commonly used applications of this routine.
584
585 First, some terminology:
586
587 token: used here, a single unit of information in
588 the form of a group of characters.
589
590 white space: space that gets ignored (except within quotes
591 or when escaped), like blanks and tabs. in
592 addition, white space terminates a non-quoted
593 token.
594
595 break character: a character that separates non-quoted tokens.
596 commas are a common break character. the
597 usage of break characters to signal the end
598 of a token is the same as that of white space,
599 except multiple break characters with nothing
600 or only white space between generate a null
601 token for each two break characters together.
602
603 for example, if blank is set to be the white
604 space and comma is set to be the break
605 character, the line ...
606
607 A, B, C , , DEF
608
609 ... consists of 5 tokens:
610
611 1) "A"
612 2) "B"
613 3) "C"
614 4) "" (the null string)
615 5) "DEF"
616
617 quote character: a character that, when surrounding a group
618 of other characters, causes the group of
619 characters to be treated as a single token,
620 no matter how many white spaces or break
621 characters exist in the group. also, a
622 token always terminates after the closing
623 quote. for example, if ' is the quote
624 character, blank is white space, and comma
625 is the break character, the following
626 string ...
627
628 A, ' B, CD'EF GHI
629
630 ... consists of 4 tokens:
631
632 1) "A"
633 2) " B, CD" (note the blanks & comma)
634 3) "EF"
635 4) "GHI"
636
637 the quote characters themselves do
638 not appear in the resultant tokens. the
639 double quotes are delimiters i use here for
640 documentation purposes only.
641
642 escape character: a character which itself is ignored but
643 which causes the next character to be
644 used as is. ^ and \ are often used as
645 escape characters. an escape in the last
646 position of the string gets treated as a
647 "normal" (i.e., non-quote, non-white,
648 non-break, and non-escape) character.
649 for example, assume white space, break
650 character, and quote are the same as in the
651 above examples, and further, assume that
652 ^ is the escape character. then, in the
653 string ...
654
655 ABC, ' DEF ^' GH' I ^ J K^ L ^
656
657 ... there are 7 tokens:
658
659 1) "ABC"
660 2) " DEF ' GH"
661 3) "I"
662 4) " " (a lone blank)
663 5) "J"
664 6) "K L"
665 7) "^" (passed as is at end of line)
666
667
668 OK, now that you have this background, here's how to call "tokenizer":
669
670 result=tokenizer(flag,token,maxtok,string,white,break,quote,escape,
671 brkused,next,quoted)
672
673 result: 0 if we haven't reached EOS (end of string), and
674 1 if we have (this is an "int").
675
676 flag: right now, only the low order 2 bits are used.
677 1 => convert non-quoted tokens to upper case
678 2 => convert non-quoted tokens to lower case
679 0 => do not convert non-quoted tokens
680 (this is an "unsigned").
681
682 token: a character string containing the returned next token
683 (this is a "char[]").
684
685 maxtok: the maximum size of "token". characters beyond
686 "maxtok" are truncated (this is an "int").
687
688 string: the string to be parsed (this is a "char[]").
689
690 white: a string of the valid white spaces. example:
691
692 char whitesp[]={" \t"};
693
694 blank and tab will be valid white space (this is
695 a "char[]").
696
697 break: a string of the valid break characters. example:
698
699 char breakch[]={";,"};
700
701 semicolon and comma will be valid break characters
702 (this is a "char[]").
703
704 IMPORTANT: do not use the name "break" as a C
705 variable, as this is a reserved word in C.
706
707 quote: a string of the valid quote characters. an example
708 would be
709
710 char quotech[]={"'\""};
711
712 (this causes single and double quotes to be valid)
713 note that a token starting with one of these characters
714 needs the same quote character to terminate it.
715
716 for example,
717
718 "ABC '
719
720 is unterminated, but
721
722 "DEF" and 'GHI'
723
724 are properly terminated. note that different quote
725 characters can appear on the same line; only for
726 a given token do the quote characters have to be
727 the same (this is a "char[]").
728
729 escape: the escape character (NOT a string ... only one
730 allowed). use zero if none is desired (this is
731 a "char").
732
733 brkused: the break character used to terminate the current
734 token. if the token was quoted, this will be the
735 quote used. if the token is the last one on the
736 line, this will be zero (this is a pointer to a
737 "char").
738
739 next: this variable points to the first character of the
740 next token. it gets reset by "tokenizer" as it steps
741 through the string. set it to 0 upon initialization,
742 and leave it alone after that. you can change it
743 if you want to jump around in the string or re-parse
744 from the beginning, but be careful (this is a
745 pointer to an "int").
746
747 quoted: set to 1 (true) if the token was quoted and 0 (false)
748 if not. you may need this information (for example:
749 in C, a string with quotes around it is a character
750 string, while one without is an identifier).
751
752 (this is a pointer to a "char").
753 */
754
/* Parser states used by tokenizer() and chstore(). */
enum
{
  IN_WHITE = 0,   /* no token character seen yet */
  IN_TOKEN = 1,   /* inside an unquoted token */
  IN_QUOTE = 2,   /* inside a quoted token */
  IN_OZONE = 3    /* token finished, break character not reached yet */
};

/* Working variables shared by tokenizer() and chstore(). */
int _p_state;        /* current state */
unsigned _p_flag;    /* option flag */
char _p_curquote;    /* current quote char */
int _p_tokpos;       /* current token pos */
766
/*
 * Find a character in a string ... used only by "tokenizer".
 * Returns the zero-based position of the first occurrence of ch in
 * string, or -1 when ch does not occur before the terminating NUL
 * (the terminator itself never matches).
 */
int sindex(char ch,char *string)
{
  int pos = 0;

  while (string[pos] != '\0')
    {
      if (string[pos] == ch)
        return pos;   /* position of the character */
      pos++;
    }
  return -1;          /* end of string ... no match found */
}
777
778 /* routine to store a character in a string ... used only by "tokenizer" */
779
780 void chstore(char *string,int max,char ch)
781 {
782 char c;
783 if(_p_tokpos>=0&&_p_tokpos<max-1)
784 {
785 if(_p_state==IN_QUOTE)
786 c=ch;
787 else
788 switch(_p_flag&3)
789 {
790 case 1: /* convert to upper */
791 c=toupper(ch);
792 break;
793
794 case 2: /* convert to lower */
795 c=tolower(ch);
796 break;
797
798 default: /* use as is */
799 c=ch;
800 break;
801 }
802 string[_p_tokpos++]=c;
803 }
804 return;
805 }
806
807 int tokenizer(unsigned inflag,char *token,int tokmax,char *line,
808 char *white,char *brkchar,char *quote,char eschar,char *brkused,
809 int *next,char *quoted)
810 {
811 int qp;
812 char c,nc;
813
814 *brkused=0; /* initialize to null */
815 *quoted=0; /* assume not quoted */
816
817 if(!line[*next]) /* if we're at end of line, indicate such */
818 return 1;
819
820 _p_state=IN_WHITE; /* initialize state */
821 _p_curquote=0; /* initialize previous quote char */
822 _p_flag=inflag; /* set option flag */
823
824 for(_p_tokpos=0;(c=line[*next]);++(*next)) /* main loop */
825 {
826 if((qp=sindex(c,brkchar))>=0) /* break */
827 {
828 switch(_p_state)
829 {
830 case IN_WHITE: /* these are the same here ... */
831 case IN_TOKEN: /* ... just get out */
832 case IN_OZONE: /* ditto */
833 ++(*next);
834 *brkused=brkchar[qp];
835 goto byebye;
836
837 case IN_QUOTE: /* just keep going */
838 chstore(token,tokmax,c);
839 break;
840 }
841 }
842 else if((qp=sindex(c,quote))>=0) /* quote */
843 {
844 switch(_p_state)
845 {
846 case IN_WHITE: /* these are identical, */
847 _p_state=IN_QUOTE; /* change states */
848 _p_curquote=quote[qp]; /* save quote char */
849 *quoted=1; /* set to true as long as something is in quotes */
850 break;
851
852 case IN_QUOTE:
853 if(quote[qp]==_p_curquote) /* same as the beginning quote? */
854 {
855 _p_state=IN_OZONE;
856 _p_curquote=0;
857 }
858 else
859 chstore(token,tokmax,c); /* treat as regular char */
860 break;
861
862 case IN_TOKEN:
863 case IN_OZONE:
864 *brkused=c; /* uses quote as break char */
865 goto byebye;
866 }
867 }
868 else if((qp=sindex(c,white))>=0) /* white */
869 {
870 switch(_p_state)
871 {
872 case IN_WHITE:
873 case IN_OZONE:
874 break; /* keep going */
875
876 case IN_TOKEN:
877 _p_state=IN_OZONE;
878 break;
879
880 case IN_QUOTE:
881 chstore(token,tokmax,c); /* it's valid here */
882 break;
883 }
884 }
885 else if(c==eschar) /* escape */
886 {
887 nc=line[(*next)+1];
888 if(nc==0) /* end of line */
889 {
890 *brkused=0;
891 chstore(token,tokmax,c);
892 ++(*next);
893 goto byebye;
894 }
895 switch(_p_state)
896 {
897 case IN_WHITE:
898 --(*next);
899 _p_state=IN_TOKEN;
900 break;
901
902 case IN_TOKEN:
903 case IN_QUOTE:
904 ++(*next);
905 chstore(token,tokmax,nc);
906 break;
907
908 case IN_OZONE:
909 goto byebye;
910 }
911 }
912 else /* anything else is just a real character */
913 {
914 switch(_p_state)
915 {
916 case IN_WHITE:
917 _p_state=IN_TOKEN; /* switch states */
918
919 case IN_TOKEN: /* these 2 are */
920 case IN_QUOTE: /* identical here */
921 chstore(token,tokmax,c);
922 break;
923
924 case IN_OZONE:
925 goto byebye;
926 }
927 }
928 } /* end of main loop */
929
930 byebye:
931 token[_p_tokpos]=0; /* make sure token ends with EOS */
932
933 return 0;
934 }