]> git.saurik.com Git - apple/icu.git/blob - icuSources/extra/uconv/uconv.cpp
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / extra / uconv / uconv.cpp
1 /*****************************************************************************
2 *
3 * Copyright (C) 1999-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *
6 ******************************************************************************/
7
8 /*
9 * uconv(1): an iconv(1)-like converter using ICU.
10 *
11 * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se>
12 * contributed in 1999.
13 *
14 * Conversion to the C conversion API and many improvements by
15 * Yves Arrouye <yves@realnames.com>, current maintainer.
16 *
17 * Markus Scherer maintainer from 2003.
18 * See source code repository history for changes.
19 */
20
21 #include <unicode/utypes.h>
22 #include <unicode/putil.h>
23 #include <unicode/ucnv.h>
24 #include <unicode/uenum.h>
25 #include <unicode/unistr.h>
26 #include <unicode/translit.h>
27 #include <unicode/uset.h>
28 #include <unicode/uclean.h>
29
30 #include <stdio.h>
31 #include <errno.h>
32 #include <string.h>
33 #include <stdlib.h>
34
35 #include "cmemory.h"
36 #include "cstring.h"
37 #include "ustrfmt.h"
38
39 #include "unicode/uwmsg.h"
40
41 #if (defined(WIN32) || defined(U_CYGWIN)) && !defined(__STRICT_ANSI__)
42 #include <io.h>
43 #include <fcntl.h>
44 #define USE_FILENO_BINARY_MODE 1
45 #endif
46
47 #ifdef UCONVMSG_LINK
48 /* below from the README */
49 #include "unicode/utypes.h"
50 #include "unicode/udata.h"
51 U_CFUNC char uconvmsg_dat[];
52 #endif
53
54 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
55
56 #define DEFAULT_BUFSZ 4096
57 #define UCONVMSG "uconvmsg"
58
59 static UResourceBundle *gBundle = 0; /* Bundle containing messages. */
60
61 /*
62 * Initialize the message bundle so that message strings can be fetched
63 * by u_wmsg().
64 *
65 */
66
67 static void initMsg(const char *pname) {
68 static int ps = 0;
69
70 if (!ps) {
71 char dataPath[2048]; /* XXX Sloppy: should be PATH_MAX. */
72 UErrorCode err = U_ZERO_ERROR;
73
74 ps = 1;
75
76 /* Set up our static data - if any */
77 #ifdef UCONVMSG_LINK
78 udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err);
79 if (U_FAILURE(err)) {
80 fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
81 pname, u_errorName(err));
82 err = U_ZERO_ERROR; /* It may still fail */
83 }
84 #endif
85
86 /* Get messages. */
87 gBundle = u_wmsg_setPath(UCONVMSG, &err);
88 if (U_FAILURE(err)) {
89 fprintf(stderr,
90 "%s: warning: couldn't open bundle %s: %s\n",
91 pname, UCONVMSG, u_errorName(err));
92 #ifdef UCONVMSG_LINK
93 fprintf(stderr,
94 "%s: setAppData was called, internal data %s failed to load\n",
95 pname, UCONVMSG);
96 #endif
97
98 err = U_ZERO_ERROR;
99 /* that was try #1, try again with a path */
100 uprv_strcpy(dataPath, u_getDataDirectory());
101 uprv_strcat(dataPath, U_FILE_SEP_STRING);
102 uprv_strcat(dataPath, UCONVMSG);
103
104 gBundle = u_wmsg_setPath(dataPath, &err);
105 if (U_FAILURE(err)) {
106 fprintf(stderr,
107 "%s: warning: still couldn't open bundle %s: %s\n",
108 pname, dataPath, u_errorName(err));
109 fprintf(stderr, "%s: warning: messages will not be displayed\n", pname);
110 }
111 }
112 }
113 }
114
115 /* Mapping of callback names to the callbacks passed to the converter
116 API. */
117
118 static struct callback_ent {
119 const char *name;
120 UConverterFromUCallback fromu;
121 const void *fromuctxt;
122 UConverterToUCallback tou;
123 const void *touctxt;
124 } transcode_callbacks[] = {
125 { "substitute",
126 UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0,
127 UCNV_TO_U_CALLBACK_SUBSTITUTE, 0 },
128 { "skip",
129 UCNV_FROM_U_CALLBACK_SKIP, 0,
130 UCNV_TO_U_CALLBACK_SKIP, 0 },
131 { "stop",
132 UCNV_FROM_U_CALLBACK_STOP, 0,
133 UCNV_TO_U_CALLBACK_STOP, 0 },
134 { "escape",
135 UCNV_FROM_U_CALLBACK_ESCAPE, 0,
136 UCNV_TO_U_CALLBACK_ESCAPE, 0},
137 { "escape-icu",
138 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU,
139 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU },
140 { "escape-java",
141 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA,
142 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA },
143 { "escape-c",
144 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C,
145 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C },
146 { "escape-xml",
147 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
148 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
149 { "escape-xml-hex",
150 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
151 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
152 { "escape-xml-dec",
153 UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC,
154 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC },
155 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE,
156 UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE }
157 };
158
159 /* Return a pointer to a callback record given its name. */
160
161 static const struct callback_ent *findCallback(const char *name) {
162 int i, count =
163 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
164
165 /* We'll do a linear search, there aren't many of them and bsearch()
166 may not be that portable. */
167
168 for (i = 0; i < count; ++i) {
169 if (!uprv_stricmp(name, transcode_callbacks[i].name)) {
170 return &transcode_callbacks[i];
171 }
172 }
173
174 return 0;
175 }
176
177 /* Print converter information. If lookfor is set, only that converter will
178 be printed, otherwise all converters will be printed. If canon is non
179 zero, tags and aliases for each converter are printed too, in the format
180 expected for convrters.txt(5). */
181
182 static int printConverters(const char *pname, const char *lookfor,
183 UBool canon)
184 {
185 UErrorCode err = U_ZERO_ERROR;
186 int32_t num;
187 uint16_t num_stds;
188 const char **stds;
189
190 /* If there is a specified name, just handle that now. */
191
192 if (lookfor) {
193 if (!canon) {
194 printf("%s\n", lookfor);
195 return 0;
196 } else {
197 /* Because we are printing a canonical name, we need the
198 true converter name. We've done that already except for
199 the default name (because we want to print the exact
200 name one would get when calling ucnv_getDefaultName()
201 in non-canon mode). But since we do not know at this
202 point if we have the default name or something else, we
203 need to normalize again to the canonical converter
204 name. */
205
206 const char *truename = ucnv_getAlias(lookfor, 0, &err);
207 if (U_SUCCESS(err)) {
208 lookfor = truename;
209 } else {
210 err = U_ZERO_ERROR;
211 }
212 }
213 }
214
215 /* Print converter names. We come here for one of two reasons: we
216 are printing all the names (lookfor was null), or we have a
217 single converter to print but in canon mode, hence we need to
218 get to it in order to print everything. */
219
220 num = ucnv_countAvailable();
221 if (num <= 0) {
222 initMsg(pname);
223 u_wmsg(stderr, "cantGetNames");
224 return -1;
225 }
226 if (lookfor) {
227 num = 1; /* We know where we want to be. */
228 }
229
230 num_stds = ucnv_countStandards();
231 stds = (const char **) uprv_malloc(num_stds * sizeof(*stds));
232 if (!stds) {
233 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR));
234 return -1;
235 } else {
236 uint16_t s;
237
238 if (canon) {
239 printf("{ ");
240 }
241 for (s = 0; s < num_stds; ++s) {
242 stds[s] = ucnv_getStandard(s, &err);
243 if (canon) {
244 printf("%s ", stds[s]);
245 }
246 if (U_FAILURE(err)) {
247 u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err));
248 return -1;
249 }
250 }
251 if (canon) {
252 puts("}");
253 }
254 }
255
256 for (int32_t i = 0; i < num; i++) {
257 const char *name;
258 uint16_t num_aliases;
259
260 /* Set the name either to what we are looking for, or
261 to the current converter name. */
262
263 if (lookfor) {
264 name = lookfor;
265 } else {
266 name = ucnv_getAvailableName(i);
267 }
268
269 /* Get all the aliases associated to the name. */
270
271 err = U_ZERO_ERROR;
272 num_aliases = ucnv_countAliases(name, &err);
273 if (U_FAILURE(err)) {
274 printf("%s", name);
275
276 UnicodeString str(name, "");
277 putchar('\t');
278 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
279 u_wmsg_errorName(err));
280 return -1;
281 } else {
282 uint16_t a, s, t;
283
284 /* Write all the aliases and their tags. */
285
286 for (a = 0; a < num_aliases; ++a) {
287 const char *alias = ucnv_getAlias(name, a, &err);
288
289 if (U_FAILURE(err)) {
290 UnicodeString str(name, "");
291 putchar('\t');
292 u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
293 u_wmsg_errorName(err));
294 return -1;
295 }
296
297 /* Print the current alias so that it looks right. */
298 printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") ,
299 alias,
300 (canon ? "" : " "));
301
302 /* Look (slowly, linear searching) for a tag. */
303
304 if (canon) {
305 /* -1 to skip the last standard */
306 for (s = t = 0; s < num_stds-1; ++s) {
307 UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err);
308 if (U_SUCCESS(err)) {
309 /* List the standard tags */
310 const char *standardName;
311 UBool isFirst = TRUE;
312 UErrorCode enumError = U_ZERO_ERROR;
313 while ((standardName = uenum_next(nameEnum, NULL, &enumError))) {
314 /* See if this alias is supported by this standard. */
315 if (!strcmp(standardName, alias)) {
316 if (!t) {
317 printf(" {");
318 t = 1;
319 }
320 /* Print a * after the default standard name */
321 printf(" %s%s", stds[s], (isFirst ? "*" : ""));
322 }
323 isFirst = FALSE;
324 }
325 }
326 }
327 if (t) {
328 printf(" }");
329 }
330 }
331 /* Terminate this entry. */
332 if (canon) {
333 puts("");
334 }
335
336 /* Move on. */
337 }
338 /* Terminate this entry. */
339 if (!canon) {
340 puts("");
341 }
342 }
343 }
344
345 /* Free temporary data. */
346
347 uprv_free(stds);
348
349 /* Success. */
350
351 return 0;
352 }
353
354 /* Print all available transliterators. If canon is non zero, print
355 one transliterator per line. */
356
357 static int printTransliterators(UBool canon)
358 {
359 #if UCONFIG_NO_TRANSLITERATION
360 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
361 return 1;
362 #else
363 int32_t numtrans = utrans_countAvailableIDs(), i;
364 int buflen = 512;
365 char *buf = (char *) uprv_malloc(buflen);
366 char staticbuf[512];
367
368 char sepchar = canon ? '\n' : ' ';
369
370 if (!buf) {
371 buf = staticbuf;
372 buflen = sizeof(staticbuf);
373 }
374
375 for (i = 0; i < numtrans; ++i) {
376 int32_t len = utrans_getAvailableID(i, buf, buflen);
377 if (len >= buflen - 1) {
378 if (buf != staticbuf) {
379 buflen <<= 1;
380 if (buflen < len) {
381 buflen = len + 64;
382 }
383 buf = (char *) uprv_realloc(buf, buflen);
384 if (!buf) {
385 buf = staticbuf;
386 buflen = sizeof(staticbuf);
387 }
388 }
389 utrans_getAvailableID(i, buf, buflen);
390 if (len >= buflen) {
391 uprv_strcpy(buf + buflen - 4, "..."); /* Truncate the name. */
392 }
393 }
394
395 printf("%s", buf);
396 if (i < numtrans - 1) {
397 putchar(sepchar);
398 }
399 }
400
401 /* Add a terminating newline if needed. */
402
403 if (sepchar != '\n') {
404 putchar('\n');
405 }
406
407 /* Free temporary data. */
408
409 if (buf != staticbuf) {
410 uprv_free(buf);
411 }
412
413 /* Success. */
414
415 return 0;
416 #endif
417 }
418
// Unicode code points that the chunking and signature code refers to by name.
enum {
    uSP  = 0x0020,  // space
    uCR  = 0x000d,  // carriage return
    uLF  = 0x000a,  // line feed
    uNL  = 0x0085,  // newline
    uLS  = 0x2028,  // line separator
    uPS  = 0x2029,  // paragraph separator
    uSig = 0xfeff   // signature/BOM character
};
428
429 static inline int32_t
430 getChunkLimit(const UnicodeString &prev, const UnicodeString &s) {
431 // find one of
432 // CR, LF, CRLF, NL, LS, PS
433 // for paragraph ends (see UAX #13/Unicode 4)
434 // and include it in the chunk
435 // all of these characters are on the BMP
436 // do not include FF or VT in case they are part of a paragraph
437 // (important for bidi contexts)
438 static const UChar paraEnds[] = {
439 0xd, 0xa, 0x85, 0x2028, 0x2029
440 };
441 enum {
442 iCR, iLF, iNL, iLS, iPS, iCount
443 };
444
445 // first, see if there is a CRLF split between prev and s
446 if (prev.endsWith(paraEnds + iCR, 1)) {
447 if (s.startsWith(paraEnds + iLF, 1)) {
448 return 1; // split CRLF, include the LF
449 } else if (!s.isEmpty()) {
450 return 0; // complete the last chunk
451 } else {
452 return -1; // wait for actual further contents to arrive
453 }
454 }
455
456 const UChar *u = s.getBuffer(), *limit = u + s.length();
457 UChar c;
458
459 while (u < limit) {
460 c = *u++;
461 if (
462 ((c < uSP) && (c == uCR || c == uLF)) ||
463 (c == uNL) ||
464 ((c & uLS) == uLS)
465 ) {
466 if (c == uCR) {
467 // check for CRLF
468 if (u == limit) {
469 return -1; // LF may be in the next chunk
470 } else if (*u == uLF) {
471 ++u; // include the LF in this chunk
472 }
473 }
474 return (int32_t)(u - s.getBuffer());
475 }
476 }
477
478 return -1; // continue collecting the chunk
479 }
480
// How a converter deals with the U+FEFF Unicode signature character (BOM);
// these are the values returned by cnvSigType().
enum {
    CNV_NO_FEFF   = 0,  // cannot convert the U+FEFF Unicode signature character (BOM)
    CNV_WITH_FEFF = 1,  // can convert the U+FEFF signature character
    CNV_ADDS_FEFF = 2   // automatically adds/detects the U+FEFF signature character
};
486
487 static inline UChar
488 nibbleToHex(uint8_t n) {
489 n &= 0xf;
490 return
491 n <= 9 ?
492 (UChar)(0x30 + n) :
493 (UChar)((0x61 - 10) + n);
494 }
495
496 // check the converter's Unicode signature properties;
497 // the fromUnicode side of the converter must be in its initial state
498 // and will be reset again if it was used
499 static int32_t
500 cnvSigType(UConverter *cnv) {
501 UErrorCode err;
502 int32_t result;
503
504 // test if the output charset can convert U+FEFF
505 USet *set = uset_open(1, 0);
506 err = U_ZERO_ERROR;
507 ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &err);
508 if (U_SUCCESS(err) && uset_contains(set, uSig)) {
509 result = CNV_WITH_FEFF;
510 } else {
511 result = CNV_NO_FEFF; // an error occurred or U+FEFF cannot be converted
512 }
513 uset_close(set);
514
515 if (result == CNV_WITH_FEFF) {
516 // test if the output charset emits a signature anyway
517 const UChar a[1] = { 0x61 }; // "a"
518 const UChar *in;
519
520 char buffer[20];
521 char *out;
522
523 in = a;
524 out = buffer;
525 err = U_ZERO_ERROR;
526 ucnv_fromUnicode(cnv,
527 &out, buffer + sizeof(buffer),
528 &in, a + 1,
529 NULL, TRUE, &err);
530 ucnv_resetFromUnicode(cnv);
531
532 if (NULL != ucnv_detectUnicodeSignature(buffer, (int32_t)(out - buffer), NULL, &err) &&
533 U_SUCCESS(err)
534 ) {
535 result = CNV_ADDS_FEFF;
536 }
537 }
538
539 return result;
540 }
541
542 class ConvertFile {
543 public:
544 ConvertFile() :
545 buf(NULL), outbuf(NULL), fromoffsets(NULL),
546 bufsz(0), signature(0) {}
547
548 void
549 setBufferSize(size_t bufferSize) {
550 bufsz = bufferSize;
551
552 buf = new char[2 * bufsz];
553 outbuf = buf + bufsz;
554
555 // +1 for an added U+FEFF in the intermediate Unicode buffer
556 fromoffsets = new int32_t[bufsz + 1];
557 }
558
559 ~ConvertFile() {
560 delete [] buf;
561 delete [] fromoffsets;
562 }
563
564 UBool convertFile(const char *pname,
565 const char *fromcpage,
566 UConverterToUCallback toucallback,
567 const void *touctxt,
568 const char *tocpage,
569 UConverterFromUCallback fromucallback,
570 const void *fromuctxt,
571 UBool fallback,
572 const char *translit,
573 const char *infilestr,
574 FILE * outfile, int verbose);
575 private:
576 friend int main(int argc, char **argv);
577
578 char *buf, *outbuf;
579 int32_t *fromoffsets;
580
581 size_t bufsz;
582 int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character
583 };
584
585 // Convert a file from one encoding to another
586 UBool
587 ConvertFile::convertFile(const char *pname,
588 const char *fromcpage,
589 UConverterToUCallback toucallback,
590 const void *touctxt,
591 const char *tocpage,
592 UConverterFromUCallback fromucallback,
593 const void *fromuctxt,
594 UBool fallback,
595 const char *translit,
596 const char *infilestr,
597 FILE * outfile, int verbose)
598 {
599 FILE *infile;
600 UBool ret = TRUE;
601 UConverter *convfrom = 0;
602 UConverter *convto = 0;
603 UErrorCode err = U_ZERO_ERROR;
604 UBool flush;
605 const char *cbufp, *prevbufp;
606 char *bufp;
607
608 uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */
609
610 const UChar *unibuf, *unibufbp;
611 UChar *unibufp;
612
613 size_t rd, wr;
614
615 #if !UCONFIG_NO_TRANSLITERATION
616 Transliterator *t = 0; // Transliterator acting on Unicode data.
617 UnicodeString chunk; // One chunk of the text being collected for transformation.
618 #endif
619 UnicodeString u; // String to do the transliteration.
620 int32_t ulen;
621
622 // use conversion offsets for error messages
623 // unless a transliterator is used -
624 // a text transformation will reorder characters in unpredictable ways
625 UBool useOffsets = TRUE;
626
627 // Open the correct input file or connect to stdin for reading input
628
629 if (infilestr != 0 && strcmp(infilestr, "-")) {
630 infile = fopen(infilestr, "rb");
631 if (infile == 0) {
632 UnicodeString str1(infilestr, "");
633 str1.append((UChar32) 0);
634 UnicodeString str2(strerror(errno), "");
635 str2.append((UChar32) 0);
636 initMsg(pname);
637 u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer());
638 return FALSE;
639 }
640 } else {
641 infilestr = "-";
642 infile = stdin;
643 #ifdef USE_FILENO_BINARY_MODE
644 if (setmode(fileno(stdin), O_BINARY) == -1) {
645 initMsg(pname);
646 u_wmsg(stderr, "cantSetInBinMode");
647 return FALSE;
648 }
649 #endif
650 }
651
652 if (verbose) {
653 fprintf(stderr, "%s:\n", infilestr);
654 }
655
656 #if !UCONFIG_NO_TRANSLITERATION
657 // Create transliterator as needed.
658
659 if (translit != NULL && *translit) {
660 UParseError parse;
661 UnicodeString str(translit), pestr;
662
663 /* Create from rules or by ID as needed. */
664
665 parse.line = -1;
666
667 if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) {
668 t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err);
669 } else {
670 t = Transliterator::createInstance(translit, UTRANS_FORWARD, err);
671 }
672
673 if (U_FAILURE(err)) {
674 str.append((UChar32) 0);
675 initMsg(pname);
676
677 if (parse.line >= 0) {
678 UChar linebuf[20], offsetbuf[20];
679 uprv_itou(linebuf, 20, parse.line, 10, 0);
680 uprv_itou(offsetbuf, 20, parse.offset, 10, 0);
681 u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(),
682 u_wmsg_errorName(err), linebuf, offsetbuf);
683 } else {
684 u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(),
685 u_wmsg_errorName(err));
686 }
687
688 if (t) {
689 delete t;
690 t = 0;
691 }
692 goto error_exit;
693 }
694
695 useOffsets = FALSE;
696 }
697 #endif
698
699 // Create codepage converter. If the codepage or its aliases weren't
700 // available, it returns NULL and a failure code. We also set the
701 // callbacks, and return errors in the same way.
702
703 convfrom = ucnv_open(fromcpage, &err);
704 if (U_FAILURE(err)) {
705 UnicodeString str(fromcpage, "");
706 initMsg(pname);
707 u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(),
708 u_wmsg_errorName(err));
709 goto error_exit;
710 }
711 ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err);
712 if (U_FAILURE(err)) {
713 initMsg(pname);
714 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
715 goto error_exit;
716 }
717
718 convto = ucnv_open(tocpage, &err);
719 if (U_FAILURE(err)) {
720 UnicodeString str(tocpage, "");
721 initMsg(pname);
722 u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(),
723 u_wmsg_errorName(err));
724 goto error_exit;
725 }
726 ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err);
727 if (U_FAILURE(err)) {
728 initMsg(pname);
729 u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
730 goto error_exit;
731 }
732 ucnv_setFallback(convto, fallback);
733
734 UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode;
735 int8_t sig;
736
737 // OK, we can convert now.
738 sig = signature;
739 rd = 0;
740
741 do {
742 willexit = FALSE;
743
744 // input file offset at the beginning of the next buffer
745 infoffset += rd;
746
747 rd = fread(buf, 1, bufsz, infile);
748 if (ferror(infile) != 0) {
749 UnicodeString str(strerror(errno));
750 initMsg(pname);
751 u_wmsg(stderr, "cantRead", str.getTerminatedBuffer());
752 goto error_exit;
753 }
754
755 // Convert the read buffer into the new encoding via Unicode.
756 // After the call 'unibufp' will be placed behind the last
757 // character that was converted in the 'unibuf'.
758 // Also the 'cbufp' is positioned behind the last converted
759 // character.
760 // At the last conversion in the file, flush should be set to
761 // true so that we get all characters converted.
762 //
763 // The converter must be flushed at the end of conversion so
764 // that characters on hold also will be written.
765
766 cbufp = buf;
767 flush = (UBool)(rd != bufsz);
768
769 // convert until the input is consumed
770 do {
771 // remember the start of the current byte-to-Unicode conversion
772 prevbufp = cbufp;
773
774 unibuf = unibufp = u.getBuffer((int32_t)bufsz);
775
776 // Use bufsz instead of u.getCapacity() for the targetLimit
777 // so that we don't overflow fromoffsets[].
778 ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp,
779 buf + rd, useOffsets ? fromoffsets : NULL, flush, &err);
780
781 ulen = (int32_t)(unibufp - unibuf);
782 u.releaseBuffer(ulen);
783
784 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
785 // converting all of the input bytes.
786 // It works like this because ucnv_toUnicode() returns only under the
787 // following conditions:
788 // - an error occurred during conversion (an error code is set)
789 // - the target buffer is filled (the error code indicates an overflow)
790 // - the source is consumed
791 // That is, if the error code does not indicate a failure,
792 // not even an overflow, then the source must be consumed entirely.
793 fromSawEndOfBytes = (UBool)U_SUCCESS(err);
794
795 if (err == U_BUFFER_OVERFLOW_ERROR) {
796 err = U_ZERO_ERROR;
797 } else if (U_FAILURE(err)) {
798 char pos[32], errorBytes[32];
799 int8_t i, length, errorLength;
800
801 UErrorCode localError = U_ZERO_ERROR;
802 errorLength = (int8_t)sizeof(errorBytes);
803 ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError);
804 if (U_FAILURE(localError) || errorLength == 0) {
805 errorLength = 1;
806 }
807
808 // print the input file offset of the start of the error bytes:
809 // input file offset of the current byte buffer +
810 // length of the just consumed bytes -
811 // length of the error bytes
812 length =
813 (int8_t)sprintf(pos, "%d",
814 (int)(infoffset + (cbufp - buf) - errorLength));
815
816 // output the bytes that caused the error
817 UnicodeString str;
818 for (i = 0; i < errorLength; ++i) {
819 if (i > 0) {
820 str.append((UChar)uSP);
821 }
822 str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4));
823 str.append(nibbleToHex((uint8_t)errorBytes[i]));
824 }
825
826 initMsg(pname);
827 u_wmsg(stderr, "problemCvtToU",
828 UnicodeString(pos, length, "").getTerminatedBuffer(),
829 str.getTerminatedBuffer(),
830 u_wmsg_errorName(err));
831
832 willexit = TRUE;
833 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
834 }
835
836 // Replaced a check for whether the input was consumed by
837 // looping until it is; message key "premEndInput" now obsolete.
838
839 if (ulen == 0) {
840 continue;
841 }
842
843 // remove a U+FEFF Unicode signature character if requested
844 if (sig < 0) {
845 if (u.charAt(0) == uSig) {
846 u.remove(0, 1);
847
848 // account for the removed UChar and offset
849 --ulen;
850
851 if (useOffsets) {
852 // remove an offset from fromoffsets[] as well
853 // to keep the array parallel with the UChars
854 memmove(fromoffsets, fromoffsets + 1, ulen * 4);
855 }
856
857 }
858 sig = 0;
859 }
860
861 #if !UCONFIG_NO_TRANSLITERATION
862 // Transliterate/transform if needed.
863
864 // For transformation, we use chunking code -
865 // collect Unicode input until, for example, an end-of-line,
866 // then transform and output-convert that and continue collecting.
867 // This makes the transformation result independent of the buffer size
868 // while avoiding the slower keyboard mode.
869 // The end-of-chunk characters are completely included in the
870 // transformed string in case they are to be transformed themselves.
871 if (t != NULL) {
872 UnicodeString out;
873 int32_t chunkLimit;
874
875 do {
876 chunkLimit = getChunkLimit(chunk, u);
877 if (chunkLimit < 0 && flush && fromSawEndOfBytes) {
878 // use all of the rest at the end of the text
879 chunkLimit = u.length();
880 }
881 if (chunkLimit >= 0) {
882 // complete the chunk and transform it
883 chunk.append(u, 0, chunkLimit);
884 u.remove(0, chunkLimit);
885 t->transliterate(chunk);
886
887 // append the transformation result to the result and empty the chunk
888 out.append(chunk);
889 chunk.remove();
890 } else {
891 // continue collecting the chunk
892 chunk.append(u);
893 break;
894 }
895 } while (!u.isEmpty());
896
897 u = out;
898 ulen = u.length();
899 }
900 #endif
901
902 // add a U+FEFF Unicode signature character if requested
903 // and possible/necessary
904 if (sig > 0) {
905 if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) {
906 u.insert(0, (UChar)uSig);
907
908 if (useOffsets) {
909 // insert a pseudo-offset into fromoffsets[] as well
910 // to keep the array parallel with the UChars
911 memmove(fromoffsets + 1, fromoffsets, ulen * 4);
912 fromoffsets[0] = -1;
913 }
914
915 // account for the additional UChar and offset
916 ++ulen;
917 }
918 sig = 0;
919 }
920
921 // Convert the Unicode buffer into the destination codepage
922 // Again 'bufp' will be placed behind the last converted character
923 // And 'unibufp' will be placed behind the last converted unicode character
924 // At the last conversion flush should be set to true to ensure that
925 // all characters left get converted
926
927 unibuf = unibufbp = u.getBuffer();
928
929 do {
930 bufp = outbuf;
931
932 // Use fromSawEndOfBytes in addition to the flush flag -
933 // it indicates whether the intermediate Unicode string
934 // contains the very last UChars for the very last input bytes.
935 ucnv_fromUnicode(convto, &bufp, outbuf + bufsz,
936 &unibufbp,
937 unibuf + ulen,
938 NULL, (UBool)(flush && fromSawEndOfBytes), &err);
939
940 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
941 // converting all of the intermediate UChars.
942 // See comment for fromSawEndOfBytes.
943 toSawEndOfUnicode = (UBool)U_SUCCESS(err);
944
945 if (err == U_BUFFER_OVERFLOW_ERROR) {
946 err = U_ZERO_ERROR;
947 } else if (U_FAILURE(err)) {
948 UChar errorUChars[4];
949 const char *errtag;
950 char pos[32];
951 UChar32 c;
952 int8_t i, length, errorLength;
953
954 UErrorCode localError = U_ZERO_ERROR;
955 errorLength = (int8_t)LENGTHOF(errorUChars);
956 ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError);
957 if (U_FAILURE(localError) || errorLength == 0) {
958 // need at least 1 so that we don't access beyond the length of fromoffsets[]
959 errorLength = 1;
960 }
961
962 int32_t ferroffset;
963
964 if (useOffsets) {
965 // Unicode buffer offset of the start of the error UChars
966 ferroffset = (int32_t)((unibufbp - unibuf) - errorLength);
967 if (ferroffset < 0) {
968 // approximation - the character started in the previous Unicode buffer
969 ferroffset = 0;
970 }
971
972 // get the corresponding byte offset out of fromoffsets[]
973 // go back if the offset is not known for some of the UChars
974 int32_t fromoffset;
975 do {
976 fromoffset = fromoffsets[ferroffset];
977 } while (fromoffset < 0 && --ferroffset >= 0);
978
979 // total input file offset =
980 // input file offset of the current byte buffer +
981 // byte buffer offset of where the current Unicode buffer is converted from +
982 // fromoffsets[Unicode offset]
983 ferroffset = infoffset + (prevbufp - buf) + fromoffset;
984 errtag = "problemCvtFromU";
985 } else {
986 // Do not use fromoffsets if (t != NULL) because the Unicode text may
987 // be different from what the offsets refer to.
988
989 // output file offset
990 ferroffset = (int32_t)(outfoffset + (bufp - outbuf));
991 errtag = "problemCvtFromUOut";
992 }
993
994 length = (int8_t)sprintf(pos, "%u", (int)ferroffset);
995
996 // output the code points that caused the error
997 UnicodeString str;
998 for (i = 0; i < errorLength;) {
999 if (i > 0) {
1000 str.append((UChar)uSP);
1001 }
1002 U16_NEXT(errorUChars, i, errorLength, c);
1003 if (c >= 0x100000) {
1004 str.append(nibbleToHex((uint8_t)(c >> 20)));
1005 }
1006 if (c >= 0x10000) {
1007 str.append(nibbleToHex((uint8_t)(c >> 16)));
1008 }
1009 str.append(nibbleToHex((uint8_t)(c >> 12)));
1010 str.append(nibbleToHex((uint8_t)(c >> 8)));
1011 str.append(nibbleToHex((uint8_t)(c >> 4)));
1012 str.append(nibbleToHex((uint8_t)c));
1013 }
1014
1015 initMsg(pname);
1016 u_wmsg(stderr, errtag,
1017 UnicodeString(pos, length, "").getTerminatedBuffer(),
1018 str.getTerminatedBuffer(),
1019 u_wmsg_errorName(err));
1020 u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer());
1021
1022 willexit = TRUE;
1023 err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
1024 }
1025
1026 // Replaced a check for whether the intermediate Unicode characters were all consumed by
1027 // looping until they are; message key "premEnd" now obsolete.
1028
1029 // Finally, write the converted buffer to the output file
1030 size_t outlen = (size_t) (bufp - outbuf);
1031 outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile));
1032 if (wr != outlen) {
1033 UnicodeString str(strerror(errno));
1034 initMsg(pname);
1035 u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer());
1036 willexit = TRUE;
1037 }
1038
1039 if (willexit) {
1040 goto error_exit;
1041 }
1042 } while (!toSawEndOfUnicode);
1043 } while (!fromSawEndOfBytes);
1044 } while (!flush); // Stop when we have flushed the
1045 // converters (this means that it's
1046 // the end of output)
1047
1048 goto normal_exit;
1049
1050 error_exit:
1051 ret = FALSE;
1052
1053 normal_exit:
1054 // Cleanup.
1055
1056 ucnv_close(convfrom);
1057 ucnv_close(convto);
1058
1059 #if !UCONFIG_NO_TRANSLITERATION
1060 delete t;
1061 #endif
1062
1063 if (infile != stdin) {
1064 fclose(infile);
1065 }
1066
1067 return ret;
1068 }
1069
1070 static void usage(const char *pname, int ecode) {
1071 const UChar *msg;
1072 int32_t msgLen;
1073 UErrorCode err = U_ZERO_ERROR;
1074 FILE *fp = ecode ? stderr : stdout;
1075 int res;
1076
1077 initMsg(pname);
1078 msg =
1079 ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord",
1080 &msgLen, &err);
1081 UnicodeString upname(pname, (int32_t)(uprv_strlen(pname) + 1));
1082 UnicodeString mname(msg, msgLen + 1);
1083
1084 res = u_wmsg(fp, "usage", mname.getBuffer(), upname.getBuffer());
1085 if (!ecode) {
1086 if (!res) {
1087 fputc('\n', fp);
1088 }
1089 if (!u_wmsg(fp, "help")) {
1090 /* Now dump callbacks and finish. */
1091
1092 int i, count =
1093 sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
1094 for (i = 0; i < count; ++i) {
1095 fprintf(fp, " %s", transcode_callbacks[i].name);
1096 }
1097 fputc('\n', fp);
1098 }
1099 }
1100
1101 exit(ecode);
1102 }
1103
1104 extern int
1105 main(int argc, char **argv)
1106 {
1107 FILE *outfile;
1108 int ret = 0;
1109
1110 size_t bufsz = DEFAULT_BUFSZ;
1111
1112 const char *fromcpage = 0;
1113 const char *tocpage = 0;
1114 const char *translit = 0;
1115 const char *outfilestr = 0;
1116 UBool fallback = FALSE;
1117
1118 UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP;
1119 const void *fromuctxt = 0;
1120 UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP;
1121 const void *touctxt = 0;
1122
1123 char **iter, **remainArgv, **remainArgvLimit;
1124 char **end = argv + argc;
1125
1126 const char *pname;
1127
1128 UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE;
1129 const char *printName = 0;
1130
1131 UBool verbose = FALSE;
1132 UErrorCode status = U_ZERO_ERROR;
1133
1134 ConvertFile cf;
1135
1136 /* Initialize ICU */
1137 u_init(&status);
1138 if (U_FAILURE(status)) {
1139 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
1140 argv[0], u_errorName(status));
1141 exit(1);
1142 }
1143
1144 // Get and prettify pname.
1145 pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR);
1146 #ifdef WIN32
1147 if (!pname) {
1148 pname = uprv_strrchr(*argv, '/');
1149 }
1150 #endif
1151 if (!pname) {
1152 pname = *argv;
1153 } else {
1154 ++pname;
1155 }
1156
1157 // First, get the arguments from command-line
1158 // to know the codepages to convert between
1159
1160 remainArgv = remainArgvLimit = argv + 1;
1161 for (iter = argv + 1; iter != end; iter++) {
1162 // Check for from charset
1163 if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) {
1164 iter++;
1165 if (iter != end)
1166 fromcpage = *iter;
1167 else
1168 usage(pname, 1);
1169 } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) {
1170 iter++;
1171 if (iter != end)
1172 tocpage = *iter;
1173 else
1174 usage(pname, 1);
1175 } else if (strcmp("-x", *iter) == 0) {
1176 iter++;
1177 if (iter != end)
1178 translit = *iter;
1179 else
1180 usage(pname, 1);
1181 } else if (!strcmp("--fallback", *iter)) {
1182 fallback = TRUE;
1183 } else if (!strcmp("--no-fallback", *iter)) {
1184 fallback = FALSE;
1185 } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
1186 iter++;
1187 if (iter != end) {
1188 bufsz = atoi(*iter);
1189 if ((int) bufsz <= 0) {
1190 initMsg(pname);
1191 UnicodeString str(*iter);
1192 initMsg(pname);
1193 u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer());
1194 return 3;
1195 }
1196 } else {
1197 usage(pname, 1);
1198 }
1199 } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
1200 if (printTranslits) {
1201 usage(pname, 1);
1202 }
1203 printConvs = TRUE;
1204 } else if (strcmp("--default-code", *iter) == 0) {
1205 if (printTranslits) {
1206 usage(pname, 1);
1207 }
1208 printName = ucnv_getDefaultName();
1209 } else if (strcmp("--list-code", *iter) == 0) {
1210 if (printTranslits) {
1211 usage(pname, 1);
1212 }
1213
1214 iter++;
1215 if (iter != end) {
1216 UErrorCode e = U_ZERO_ERROR;
1217 printName = ucnv_getAlias(*iter, 0, &e);
1218 if (U_FAILURE(e) || !printName) {
1219 UnicodeString str(*iter);
1220 initMsg(pname);
1221 u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer());
1222 return 2;
1223 }
1224 } else
1225 usage(pname, 1);
1226 } else if (strcmp("--canon", *iter) == 0) {
1227 printCanon = TRUE;
1228 } else if (strcmp("-L", *iter) == 0
1229 || !strcmp("--list-transliterators", *iter)) {
1230 if (printConvs) {
1231 usage(pname, 1);
1232 }
1233 printTranslits = TRUE;
1234 } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter)
1235 || !strcmp("--help", *iter)) {
1236 usage(pname, 0);
1237 } else if (!strcmp("-c", *iter)) {
1238 fromucallback = UCNV_FROM_U_CALLBACK_SKIP;
1239 } else if (!strcmp("--to-callback", *iter)) {
1240 iter++;
1241 if (iter != end) {
1242 const struct callback_ent *cbe = findCallback(*iter);
1243 if (cbe) {
1244 fromucallback = cbe->fromu;
1245 fromuctxt = cbe->fromuctxt;
1246 } else {
1247 UnicodeString str(*iter);
1248 initMsg(pname);
1249 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1250 return 4;
1251 }
1252 } else {
1253 usage(pname, 1);
1254 }
1255 } else if (!strcmp("--from-callback", *iter)) {
1256 iter++;
1257 if (iter != end) {
1258 const struct callback_ent *cbe = findCallback(*iter);
1259 if (cbe) {
1260 toucallback = cbe->tou;
1261 touctxt = cbe->touctxt;
1262 } else {
1263 UnicodeString str(*iter);
1264 initMsg(pname);
1265 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1266 return 4;
1267 }
1268 } else {
1269 usage(pname, 1);
1270 }
1271 } else if (!strcmp("-i", *iter)) {
1272 toucallback = UCNV_TO_U_CALLBACK_SKIP;
1273 } else if (!strcmp("--callback", *iter)) {
1274 iter++;
1275 if (iter != end) {
1276 const struct callback_ent *cbe = findCallback(*iter);
1277 if (cbe) {
1278 fromucallback = cbe->fromu;
1279 fromuctxt = cbe->fromuctxt;
1280 toucallback = cbe->tou;
1281 touctxt = cbe->touctxt;
1282 } else {
1283 UnicodeString str(*iter);
1284 initMsg(pname);
1285 u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1286 return 4;
1287 }
1288 } else {
1289 usage(pname, 1);
1290 }
1291 } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
1292 verbose = FALSE;
1293 } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
1294 verbose = TRUE;
1295 } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
1296 printf("%s v2.1 ICU " U_ICU_VERSION "\n", pname);
1297 return 0;
1298 } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
1299 ++iter;
1300 if (iter != end && !outfilestr) {
1301 outfilestr = *iter;
1302 } else {
1303 usage(pname, 1);
1304 }
1305 } else if (0 == strcmp("--add-signature", *iter)) {
1306 cf.signature = 1;
1307 } else if (0 == strcmp("--remove-signature", *iter)) {
1308 cf.signature = -1;
1309 } else if (**iter == '-' && (*iter)[1]) {
1310 usage(pname, 1);
1311 } else {
1312 // move a non-option up in argv[]
1313 *remainArgvLimit++ = *iter;
1314 }
1315 }
1316
1317 if (printConvs || printName) {
1318 return printConverters(pname, printName, printCanon) ? 2 : 0;
1319 } else if (printTranslits) {
1320 return printTransliterators(printCanon) ? 3 : 0;
1321 }
1322
1323 if (!fromcpage || !uprv_strcmp(fromcpage, "-")) {
1324 fromcpage = ucnv_getDefaultName();
1325 }
1326 if (!tocpage || !uprv_strcmp(tocpage, "-")) {
1327 tocpage = ucnv_getDefaultName();
1328 }
1329
1330 // Open the correct output file or connect to stdout for reading input
1331 if (outfilestr != 0 && strcmp(outfilestr, "-")) {
1332 outfile = fopen(outfilestr, "wb");
1333 if (outfile == 0) {
1334 UnicodeString str1(outfilestr, "");
1335 UnicodeString str2(strerror(errno), "");
1336 initMsg(pname);
1337 u_wmsg(stderr, "cantCreateOutputF",
1338 str1.getBuffer(), str2.getBuffer());
1339 return 1;
1340 }
1341 } else {
1342 outfilestr = "-";
1343 outfile = stdout;
1344 #ifdef USE_FILENO_BINARY_MODE
1345 if (setmode(fileno(outfile), O_BINARY) == -1) {
1346 u_wmsg(stderr, "cantSetOutBinMode");
1347 exit(-1);
1348 }
1349 #endif
1350 }
1351
1352 /* Loop again on the arguments to find all the input files, and
1353 convert them. */
1354
1355 cf.setBufferSize(bufsz);
1356
1357 if(remainArgv < remainArgvLimit) {
1358 for (iter = remainArgv; iter != remainArgvLimit; iter++) {
1359 if (!cf.convertFile(
1360 pname, fromcpage, toucallback, touctxt, tocpage,
1361 fromucallback, fromuctxt, fallback, translit, *iter,
1362 outfile, verbose)
1363 ) {
1364 goto error_exit;
1365 }
1366 }
1367 } else {
1368 if (!cf.convertFile(
1369 pname, fromcpage, toucallback, touctxt, tocpage,
1370 fromucallback, fromuctxt, fallback, translit, 0,
1371 outfile, verbose)
1372 ) {
1373 goto error_exit;
1374 }
1375 }
1376
1377 goto normal_exit;
1378 error_exit:
1379 ret = 1;
1380 normal_exit:
1381
1382 if (outfile != stdout) {
1383 fclose(outfile);
1384 }
1385
1386 return ret;
1387 }
1388
1389
1390 /*
1391 * Hey, Emacs, please set the following:
1392 *
1393 * Local Variables:
1394 * indent-tabs-mode: nil
1395 * End:
1396 *
1397 */