1 /*****************************************************************************
3 * Copyright (C) 1999-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
6 ******************************************************************************/
9 * uconv(1): an iconv(1)-like converter using ICU.
11 * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se>
12 * contributed in 1999.
14 * Conversion to the C conversion API and many improvements by
15 * Yves Arrouye <yves@realnames.com>, current maintainer.
17 * Markus Scherer maintainer from 2003.
18 * See source code repository history for changes.
21 #include <unicode/utypes.h>
22 #include <unicode/putil.h>
23 #include <unicode/ucnv.h>
24 #include <unicode/uenum.h>
25 #include <unicode/unistr.h>
26 #include <unicode/translit.h>
27 #include <unicode/uset.h>
28 #include <unicode/uclean.h>
39 #include "unicode/uwmsg.h"
41 #if (defined(WIN32) || defined(U_CYGWIN)) && !defined(__STRICT_ANSI__)
44 #define USE_FILENO_BINARY_MODE 1
48 /* below from the README */
49 #include "unicode/utypes.h"
50 #include "unicode/udata.h"
51 U_CFUNC
char uconvmsg_dat
[];
/* Element count of a true array (not a pointer), cast to int32_t. */
54 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
/* Initial value of bufsz in main(), used unless a block size is given on the command line. */
56 #define DEFAULT_BUFSZ 4096
/* Name of the message data/bundle handed to udata_setAppData() and u_wmsg_setPath(). */
57 #define UCONVMSG "uconvmsg"
59 static UResourceBundle
*gBundle
= 0; /* Bundle containing messages. */
62 * Initialize the message bundle so that message strings can be fetched
67 static void initMsg(const char *pname
) {
71 char dataPath
[2048]; /* XXX Sloppy: should be PATH_MAX. */
72 UErrorCode err
= U_ZERO_ERROR
;
76 /* Set up our static data - if any */
78 udata_setAppData(UCONVMSG
, (const void*) uconvmsg_dat
, &err
);
80 fprintf(stderr
, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
81 pname
, u_errorName(err
));
82 err
= U_ZERO_ERROR
; /* It may still fail */
87 gBundle
= u_wmsg_setPath(UCONVMSG
, &err
);
90 "%s: warning: couldn't open bundle %s: %s\n",
91 pname
, UCONVMSG
, u_errorName(err
));
94 "%s: setAppData was called, internal data %s failed to load\n",
99 /* that was try #1, try again with a path */
100 uprv_strcpy(dataPath
, u_getDataDirectory());
101 uprv_strcat(dataPath
, U_FILE_SEP_STRING
);
102 uprv_strcat(dataPath
, UCONVMSG
);
104 gBundle
= u_wmsg_setPath(dataPath
, &err
);
105 if (U_FAILURE(err
)) {
107 "%s: warning: still couldn't open bundle %s: %s\n",
108 pname
, dataPath
, u_errorName(err
));
109 fprintf(stderr
, "%s: warning: messages will not be displayed\n", pname
);
115 /* Mapping of callback names to the callbacks passed to the converter
118 static struct callback_ent
{
120 UConverterFromUCallback fromu
;
121 const void *fromuctxt
;
122 UConverterToUCallback tou
;
124 } transcode_callbacks
[] = {
126 UCNV_FROM_U_CALLBACK_SUBSTITUTE
, 0,
127 UCNV_TO_U_CALLBACK_SUBSTITUTE
, 0 },
129 UCNV_FROM_U_CALLBACK_SKIP
, 0,
130 UCNV_TO_U_CALLBACK_SKIP
, 0 },
132 UCNV_FROM_U_CALLBACK_STOP
, 0,
133 UCNV_TO_U_CALLBACK_STOP
, 0 },
135 UCNV_FROM_U_CALLBACK_ESCAPE
, 0,
136 UCNV_TO_U_CALLBACK_ESCAPE
, 0},
138 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_ICU
,
139 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_ICU
},
141 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_JAVA
,
142 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_JAVA
},
144 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_C
,
145 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_C
},
147 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
,
148 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
},
150 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
,
151 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
},
153 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_DEC
,
154 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_DEC
},
155 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_UNICODE
,
156 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_UNICODE
}
159 /* Return a pointer to a callback record given its name. */
161 static const struct callback_ent
*findCallback(const char *name
) {
163 sizeof(transcode_callbacks
) / sizeof(*transcode_callbacks
);
165 /* We'll do a linear search, there aren't many of them and bsearch()
166 may not be that portable. */
168 for (i
= 0; i
< count
; ++i
) {
169 if (!uprv_stricmp(name
, transcode_callbacks
[i
].name
)) {
170 return &transcode_callbacks
[i
];
177 /* Print converter information. If lookfor is set, only that converter will
178 be printed, otherwise all converters will be printed. If canon is non
179 zero, tags and aliases for each converter are printed too, in the format
180 expected for convrtrs.txt(5). */
182 static int printConverters(const char *pname
, const char *lookfor
,
185 UErrorCode err
= U_ZERO_ERROR
;
190 /* If there is a specified name, just handle that now. */
194 printf("%s\n", lookfor
);
197 /* Because we are printing a canonical name, we need the
198 true converter name. We've done that already except for
199 the default name (because we want to print the exact
200 name one would get when calling ucnv_getDefaultName()
201 in non-canon mode). But since we do not know at this
202 point if we have the default name or something else, we
203 need to normalize again to the canonical converter
206 const char *truename
= ucnv_getAlias(lookfor
, 0, &err
);
207 if (U_SUCCESS(err
)) {
215 /* Print converter names. We come here for one of two reasons: we
216 are printing all the names (lookfor was null), or we have a
217 single converter to print but in canon mode, hence we need to
218 get to it in order to print everything. */
220 num
= ucnv_countAvailable();
223 u_wmsg(stderr
, "cantGetNames");
227 num
= 1; /* We know where we want to be. */
230 num_stds
= ucnv_countStandards();
231 stds
= (const char **) uprv_malloc(num_stds
* sizeof(*stds
));
233 u_wmsg(stderr
, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR
));
241 for (s
= 0; s
< num_stds
; ++s
) {
242 stds
[s
] = ucnv_getStandard(s
, &err
);
244 printf("%s ", stds
[s
]);
246 if (U_FAILURE(err
)) {
247 u_wmsg(stderr
, "cantGetTag", u_wmsg_errorName(err
));
256 for (int32_t i
= 0; i
< num
; i
++) {
258 uint16_t num_aliases
;
260 /* Set the name either to what we are looking for, or
261 to the current converter name. */
266 name
= ucnv_getAvailableName(i
);
269 /* Get all the aliases associated with the name. */
272 num_aliases
= ucnv_countAliases(name
, &err
);
273 if (U_FAILURE(err
)) {
276 UnicodeString
str(name
, "");
278 u_wmsg(stderr
, "cantGetAliases", str
.getTerminatedBuffer(),
279 u_wmsg_errorName(err
));
284 /* Write all the aliases and their tags. */
286 for (a
= 0; a
< num_aliases
; ++a
) {
287 const char *alias
= ucnv_getAlias(name
, a
, &err
);
289 if (U_FAILURE(err
)) {
290 UnicodeString
str(name
, "");
292 u_wmsg(stderr
, "cantGetAliases", str
.getTerminatedBuffer(),
293 u_wmsg_errorName(err
));
297 /* Print the current alias so that it looks right. */
298 printf("%s%s%s", (canon
? (a
== 0? "" : "\t" ) : "") ,
302 /* Look (slowly, linear searching) for a tag. */
305 /* -1 to skip the last standard */
306 for (s
= t
= 0; s
< num_stds
-1; ++s
) {
307 UEnumeration
*nameEnum
= ucnv_openStandardNames(name
, stds
[s
], &err
);
308 if (U_SUCCESS(err
)) {
309 /* List the standard tags */
310 const char *standardName
;
311 UBool isFirst
= TRUE
;
312 UErrorCode enumError
= U_ZERO_ERROR
;
313 while ((standardName
= uenum_next(nameEnum
, NULL
, &enumError
))) {
314 /* See if this alias is supported by this standard. */
315 if (!strcmp(standardName
, alias
)) {
320 /* Print a * after the default standard name */
321 printf(" %s%s", stds
[s
], (isFirst
? "*" : ""));
331 /* Terminate this entry. */
338 /* Terminate this entry. */
345 /* Free temporary data. */
354 /* Print all available transliterators. If canon is non-zero, print
355 one transliterator per line. */
357 static int printTransliterators(UBool canon
)
359 #if UCONFIG_NO_TRANSLITERATION
360 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
363 int32_t numtrans
= utrans_countAvailableIDs(), i
;
365 char *buf
= (char *) uprv_malloc(buflen
);
368 char sepchar
= canon
? '\n' : ' ';
372 buflen
= sizeof(staticbuf
);
375 for (i
= 0; i
< numtrans
; ++i
) {
376 int32_t len
= utrans_getAvailableID(i
, buf
, buflen
);
377 if (len
>= buflen
- 1) {
378 if (buf
!= staticbuf
) {
383 buf
= (char *) uprv_realloc(buf
, buflen
);
386 buflen
= sizeof(staticbuf
);
389 utrans_getAvailableID(i
, buf
, buflen
);
391 uprv_strcpy(buf
+ buflen
- 4, "..."); /* Truncate the name. */
396 if (i
< numtrans
- 1) {
401 /* Add a terminating newline if needed. */
403 if (sepchar
!= '\n') {
407 /* Free temporary data. */
409 if (buf
!= staticbuf
) {
421 uCR
= 0xd, // carriage return
422 uLF
= 0xa, // line feed
423 uNL
= 0x85, // newline
424 uLS
= 0x2028, // line separator
425 uPS
= 0x2029, // paragraph separator
426 uSig
= 0xfeff // signature/BOM character
429 static inline int32_t
430 getChunkLimit(const UnicodeString
&prev
, const UnicodeString
&s
) {
432 // CR, LF, CRLF, NL, LS, PS
433 // for paragraph ends (see UAX #13/Unicode 4)
434 // and include it in the chunk
435 // all of these characters are on the BMP
436 // do not include FF or VT in case they are part of a paragraph
437 // (important for bidi contexts)
438 static const UChar paraEnds
[] = {
439 0xd, 0xa, 0x85, 0x2028, 0x2029
442 iCR
, iLF
, iNL
, iLS
, iPS
, iCount
445 // first, see if there is a CRLF split between prev and s
446 if (prev
.endsWith(paraEnds
+ iCR
, 1)) {
447 if (s
.startsWith(paraEnds
+ iLF
, 1)) {
448 return 1; // split CRLF, include the LF
449 } else if (!s
.isEmpty()) {
450 return 0; // complete the last chunk
452 return -1; // wait for actual further contents to arrive
456 const UChar
*u
= s
.getBuffer(), *limit
= u
+ s
.length();
462 ((c
< uSP
) && (c
== uCR
|| c
== uLF
)) ||
469 return -1; // LF may be in the next chunk
470 } else if (*u
== uLF
) {
471 ++u
; // include the LF in this chunk
474 return (int32_t)(u
- s
.getBuffer());
478 return -1; // continue collecting the chunk
482 CNV_NO_FEFF
, // cannot convert the U+FEFF Unicode signature character (BOM)
483 CNV_WITH_FEFF
, // can convert the U+FEFF signature character
484 CNV_ADDS_FEFF
// automatically adds/detects the U+FEFF signature character
488 nibbleToHex(uint8_t n
) {
493 (UChar
)((0x61 - 10) + n
);
496 // check the converter's Unicode signature properties;
497 // the fromUnicode side of the converter must be in its initial state
498 // and will be reset again if it was used
500 cnvSigType(UConverter
*cnv
) {
504 // test if the output charset can convert U+FEFF
505 USet
*set
= uset_open(1, 0);
507 ucnv_getUnicodeSet(cnv
, set
, UCNV_ROUNDTRIP_SET
, &err
);
508 if (U_SUCCESS(err
) && uset_contains(set
, uSig
)) {
509 result
= CNV_WITH_FEFF
;
511 result
= CNV_NO_FEFF
; // an error occurred or U+FEFF cannot be converted
515 if (result
== CNV_WITH_FEFF
) {
516 // test if the output charset emits a signature anyway
517 const UChar a
[1] = { 0x61 }; // "a"
526 ucnv_fromUnicode(cnv
,
527 &out
, buffer
+ sizeof(buffer
),
530 ucnv_resetFromUnicode(cnv
);
532 if (NULL
!= ucnv_detectUnicodeSignature(buffer
, (int32_t)(out
- buffer
), NULL
, &err
) &&
535 result
= CNV_ADDS_FEFF
;
545 buf(NULL
), outbuf(NULL
), fromoffsets(NULL
),
546 bufsz(0), signature(0) {}
549 setBufferSize(size_t bufferSize
) {
552 buf
= new char[2 * bufsz
];
553 outbuf
= buf
+ bufsz
;
555 // +1 for an added U+FEFF in the intermediate Unicode buffer
556 fromoffsets
= new int32_t[bufsz
+ 1];
561 delete [] fromoffsets
;
564 UBool
convertFile(const char *pname
,
565 const char *fromcpage
,
566 UConverterToUCallback toucallback
,
569 UConverterFromUCallback fromucallback
,
570 const void *fromuctxt
,
572 const char *translit
,
573 const char *infilestr
,
574 FILE * outfile
, int verbose
);
576 friend int main(int argc
, char **argv
);
579 int32_t *fromoffsets
;
582 int8_t signature
; // add (1) or remove (-1) a U+FEFF Unicode signature character
585 // Convert a file from one encoding to another
587 ConvertFile::convertFile(const char *pname
,
588 const char *fromcpage
,
589 UConverterToUCallback toucallback
,
592 UConverterFromUCallback fromucallback
,
593 const void *fromuctxt
,
595 const char *translit
,
596 const char *infilestr
,
597 FILE * outfile
, int verbose
)
601 UConverter
*convfrom
= 0;
602 UConverter
*convto
= 0;
603 UErrorCode err
= U_ZERO_ERROR
;
605 const char *cbufp
, *prevbufp
;
608 uint32_t infoffset
= 0, outfoffset
= 0; /* Where we are in the file, for error reporting. */
610 const UChar
*unibuf
, *unibufbp
;
615 #if !UCONFIG_NO_TRANSLITERATION
616 Transliterator
*t
= 0; // Transliterator acting on Unicode data.
617 UnicodeString chunk
; // One chunk of the text being collected for transformation.
619 UnicodeString u
; // String to do the transliteration.
622 // use conversion offsets for error messages
623 // unless a transliterator is used -
624 // a text transformation will reorder characters in unpredictable ways
625 UBool useOffsets
= TRUE
;
627 // Open the correct input file or connect to stdin for reading input
629 if (infilestr
!= 0 && strcmp(infilestr
, "-")) {
630 infile
= fopen(infilestr
, "rb");
632 UnicodeString
str1(infilestr
, "");
633 str1
.append((UChar32
) 0);
634 UnicodeString
str2(strerror(errno
), "");
635 str2
.append((UChar32
) 0);
637 u_wmsg(stderr
, "cantOpenInputF", str1
.getBuffer(), str2
.getBuffer());
643 #ifdef USE_FILENO_BINARY_MODE
644 if (setmode(fileno(stdin
), O_BINARY
) == -1) {
646 u_wmsg(stderr
, "cantSetInBinMode");
653 fprintf(stderr
, "%s:\n", infilestr
);
656 #if !UCONFIG_NO_TRANSLITERATION
657 // Create transliterator as needed.
659 if (translit
!= NULL
&& *translit
) {
661 UnicodeString
str(translit
), pestr
;
663 /* Create from rules or by ID as needed. */
667 if (uprv_strchr(translit
, ':') || uprv_strchr(translit
, '>') || uprv_strchr(translit
, '<') || uprv_strchr(translit
, '>')) {
668 t
= Transliterator::createFromRules("Uconv", str
, UTRANS_FORWARD
, parse
, err
);
670 t
= Transliterator::createInstance(translit
, UTRANS_FORWARD
, err
);
673 if (U_FAILURE(err
)) {
674 str
.append((UChar32
) 0);
677 if (parse
.line
>= 0) {
678 UChar linebuf
[20], offsetbuf
[20];
679 uprv_itou(linebuf
, 20, parse
.line
, 10, 0);
680 uprv_itou(offsetbuf
, 20, parse
.offset
, 10, 0);
681 u_wmsg(stderr
, "cantCreateTranslitParseErr", str
.getTerminatedBuffer(),
682 u_wmsg_errorName(err
), linebuf
, offsetbuf
);
684 u_wmsg(stderr
, "cantCreateTranslit", str
.getTerminatedBuffer(),
685 u_wmsg_errorName(err
));
699 // Create codepage converter. If the codepage or its aliases weren't
700 // available, it returns NULL and a failure code. We also set the
701 // callbacks, and return errors in the same way.
703 convfrom
= ucnv_open(fromcpage
, &err
);
704 if (U_FAILURE(err
)) {
705 UnicodeString
str(fromcpage
, "");
707 u_wmsg(stderr
, "cantOpenFromCodeset", str
.getTerminatedBuffer(),
708 u_wmsg_errorName(err
));
711 ucnv_setToUCallBack(convfrom
, toucallback
, touctxt
, 0, 0, &err
);
712 if (U_FAILURE(err
)) {
714 u_wmsg(stderr
, "cantSetCallback", u_wmsg_errorName(err
));
718 convto
= ucnv_open(tocpage
, &err
);
719 if (U_FAILURE(err
)) {
720 UnicodeString
str(tocpage
, "");
722 u_wmsg(stderr
, "cantOpenToCodeset", str
.getTerminatedBuffer(),
723 u_wmsg_errorName(err
));
726 ucnv_setFromUCallBack(convto
, fromucallback
, fromuctxt
, 0, 0, &err
);
727 if (U_FAILURE(err
)) {
729 u_wmsg(stderr
, "cantSetCallback", u_wmsg_errorName(err
));
732 ucnv_setFallback(convto
, fallback
);
734 UBool willexit
, fromSawEndOfBytes
, toSawEndOfUnicode
;
737 // OK, we can convert now.
744 // input file offset at the beginning of the next buffer
747 rd
= fread(buf
, 1, bufsz
, infile
);
748 if (ferror(infile
) != 0) {
749 UnicodeString
str(strerror(errno
));
751 u_wmsg(stderr
, "cantRead", str
.getTerminatedBuffer());
755 // Convert the read buffer into the new encoding via Unicode.
756 // After the call 'unibufp' will be placed behind the last
757 // character that was converted in the 'unibuf'.
758 // Also the 'cbufp' is positioned behind the last converted
760 // At the last conversion in the file, flush should be set to
761 // true so that we get all characters converted.
763 // The converter must be flushed at the end of conversion so
764 // that characters on hold also will be written.
767 flush
= (UBool
)(rd
!= bufsz
);
769 // convert until the input is consumed
771 // remember the start of the current byte-to-Unicode conversion
774 unibuf
= unibufp
= u
.getBuffer((int32_t)bufsz
);
776 // Use bufsz instead of u.getCapacity() for the targetLimit
777 // so that we don't overflow fromoffsets[].
778 ucnv_toUnicode(convfrom
, &unibufp
, unibuf
+ bufsz
, &cbufp
,
779 buf
+ rd
, useOffsets
? fromoffsets
: NULL
, flush
, &err
);
781 ulen
= (int32_t)(unibufp
- unibuf
);
782 u
.releaseBuffer(ulen
);
784 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
785 // converting all of the input bytes.
786 // It works like this because ucnv_toUnicode() returns only under the
787 // following conditions:
788 // - an error occurred during conversion (an error code is set)
789 // - the target buffer is filled (the error code indicates an overflow)
790 // - the source is consumed
791 // That is, if the error code does not indicate a failure,
792 // not even an overflow, then the source must be consumed entirely.
793 fromSawEndOfBytes
= (UBool
)U_SUCCESS(err
);
795 if (err
== U_BUFFER_OVERFLOW_ERROR
) {
797 } else if (U_FAILURE(err
)) {
798 char pos
[32], errorBytes
[32];
799 int8_t i
, length
, errorLength
;
801 UErrorCode localError
= U_ZERO_ERROR
;
802 errorLength
= (int8_t)sizeof(errorBytes
);
803 ucnv_getInvalidChars(convfrom
, errorBytes
, &errorLength
, &localError
);
804 if (U_FAILURE(localError
) || errorLength
== 0) {
808 // print the input file offset of the start of the error bytes:
809 // input file offset of the current byte buffer +
810 // length of the just consumed bytes -
811 // length of the error bytes
813 (int8_t)sprintf(pos
, "%d",
814 (int)(infoffset
+ (cbufp
- buf
) - errorLength
));
816 // output the bytes that caused the error
818 for (i
= 0; i
< errorLength
; ++i
) {
820 str
.append((UChar
)uSP
);
822 str
.append(nibbleToHex((uint8_t)errorBytes
[i
] >> 4));
823 str
.append(nibbleToHex((uint8_t)errorBytes
[i
]));
827 u_wmsg(stderr
, "problemCvtToU",
828 UnicodeString(pos
, length
, "").getTerminatedBuffer(),
829 str
.getTerminatedBuffer(),
830 u_wmsg_errorName(err
));
833 err
= U_ZERO_ERROR
; /* reset the error for the rest of the conversion. */
836 // Replaced a check for whether the input was consumed by
837 // looping until it is; message key "premEndInput" now obsolete.
843 // remove a U+FEFF Unicode signature character if requested
845 if (u
.charAt(0) == uSig
) {
848 // account for the removed UChar and offset
852 // remove an offset from fromoffsets[] as well
853 // to keep the array parallel with the UChars
854 memmove(fromoffsets
, fromoffsets
+ 1, ulen
* 4);
861 #if !UCONFIG_NO_TRANSLITERATION
862 // Transliterate/transform if needed.
864 // For transformation, we use chunking code -
865 // collect Unicode input until, for example, an end-of-line,
866 // then transform and output-convert that and continue collecting.
867 // This makes the transformation result independent of the buffer size
868 // while avoiding the slower keyboard mode.
869 // The end-of-chunk characters are completely included in the
870 // transformed string in case they are to be transformed themselves.
876 chunkLimit
= getChunkLimit(chunk
, u
);
877 if (chunkLimit
< 0 && flush
&& fromSawEndOfBytes
) {
878 // use all of the rest at the end of the text
879 chunkLimit
= u
.length();
881 if (chunkLimit
>= 0) {
882 // complete the chunk and transform it
883 chunk
.append(u
, 0, chunkLimit
);
884 u
.remove(0, chunkLimit
);
885 t
->transliterate(chunk
);
887 // append the transformation result to the result and empty the chunk
891 // continue collecting the chunk
895 } while (!u
.isEmpty());
902 // add a U+FEFF Unicode signature character if requested
903 // and possible/necessary
905 if (u
.charAt(0) != uSig
&& cnvSigType(convto
) == CNV_WITH_FEFF
) {
906 u
.insert(0, (UChar
)uSig
);
909 // insert a pseudo-offset into fromoffsets[] as well
910 // to keep the array parallel with the UChars
911 memmove(fromoffsets
+ 1, fromoffsets
, ulen
* 4);
915 // account for the additional UChar and offset
921 // Convert the Unicode buffer into the destination codepage
922 // Again 'bufp' will be placed behind the last converted character
923 // And 'unibufbp' will be placed behind the last converted Unicode character
924 // At the last conversion flush should be set to true to ensure that
925 // all characters left get converted
927 unibuf
= unibufbp
= u
.getBuffer();
932 // Use fromSawEndOfBytes in addition to the flush flag -
933 // it indicates whether the intermediate Unicode string
934 // contains the very last UChars for the very last input bytes.
935 ucnv_fromUnicode(convto
, &bufp
, outbuf
+ bufsz
,
938 NULL
, (UBool
)(flush
&& fromSawEndOfBytes
), &err
);
940 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
941 // converting all of the intermediate UChars.
942 // See comment for fromSawEndOfBytes.
943 toSawEndOfUnicode
= (UBool
)U_SUCCESS(err
);
945 if (err
== U_BUFFER_OVERFLOW_ERROR
) {
947 } else if (U_FAILURE(err
)) {
948 UChar errorUChars
[4];
952 int8_t i
, length
, errorLength
;
954 UErrorCode localError
= U_ZERO_ERROR
;
955 errorLength
= (int8_t)LENGTHOF(errorUChars
);
956 ucnv_getInvalidUChars(convto
, errorUChars
, &errorLength
, &localError
);
957 if (U_FAILURE(localError
) || errorLength
== 0) {
958 // need at least 1 so that we don't access beyond the length of fromoffsets[]
965 // Unicode buffer offset of the start of the error UChars
966 ferroffset
= (int32_t)((unibufbp
- unibuf
) - errorLength
);
967 if (ferroffset
< 0) {
968 // approximation - the character started in the previous Unicode buffer
972 // get the corresponding byte offset out of fromoffsets[]
973 // go back if the offset is not known for some of the UChars
976 fromoffset
= fromoffsets
[ferroffset
];
977 } while (fromoffset
< 0 && --ferroffset
>= 0);
979 // total input file offset =
980 // input file offset of the current byte buffer +
981 // byte buffer offset of where the current Unicode buffer is converted from +
982 // fromoffsets[Unicode offset]
983 ferroffset
= infoffset
+ (prevbufp
- buf
) + fromoffset
;
984 errtag
= "problemCvtFromU";
986 // Do not use fromoffsets if (t != NULL) because the Unicode text may
987 // be different from what the offsets refer to.
989 // output file offset
990 ferroffset
= (int32_t)(outfoffset
+ (bufp
- outbuf
));
991 errtag
= "problemCvtFromUOut";
994 length
= (int8_t)sprintf(pos
, "%u", (int)ferroffset
);
996 // output the code points that caused the error
998 for (i
= 0; i
< errorLength
;) {
1000 str
.append((UChar
)uSP
);
1002 U16_NEXT(errorUChars
, i
, errorLength
, c
);
1003 if (c
>= 0x100000) {
1004 str
.append(nibbleToHex((uint8_t)(c
>> 20)));
1007 str
.append(nibbleToHex((uint8_t)(c
>> 16)));
1009 str
.append(nibbleToHex((uint8_t)(c
>> 12)));
1010 str
.append(nibbleToHex((uint8_t)(c
>> 8)));
1011 str
.append(nibbleToHex((uint8_t)(c
>> 4)));
1012 str
.append(nibbleToHex((uint8_t)c
));
1016 u_wmsg(stderr
, errtag
,
1017 UnicodeString(pos
, length
, "").getTerminatedBuffer(),
1018 str
.getTerminatedBuffer(),
1019 u_wmsg_errorName(err
));
1020 u_wmsg(stderr
, "errorUnicode", str
.getTerminatedBuffer());
1023 err
= U_ZERO_ERROR
; /* reset the error for the rest of the conversion. */
1026 // Replaced a check for whether the intermediate Unicode characters were all consumed by
1027 // looping until they are; message key "premEnd" now obsolete.
1029 // Finally, write the converted buffer to the output file
1030 size_t outlen
= (size_t) (bufp
- outbuf
);
1031 outfoffset
+= (int32_t)(wr
= fwrite(outbuf
, 1, outlen
, outfile
));
1033 UnicodeString
str(strerror(errno
));
1035 u_wmsg(stderr
, "cantWrite", str
.getTerminatedBuffer());
1042 } while (!toSawEndOfUnicode
);
1043 } while (!fromSawEndOfBytes
);
1044 } while (!flush
); // Stop when we have flushed the
1045 // converters (this means that it's
1046 // the end of output)
1056 ucnv_close(convfrom
);
1059 #if !UCONFIG_NO_TRANSLITERATION
1063 if (infile
!= stdin
) {
1070 static void usage(const char *pname
, int ecode
) {
1073 UErrorCode err
= U_ZERO_ERROR
;
1074 FILE *fp
= ecode
? stderr
: stdout
;
1079 ures_getStringByKey(gBundle
, ecode
? "lcUsageWord" : "ucUsageWord",
1081 UnicodeString
upname(pname
, (int32_t)(uprv_strlen(pname
) + 1));
1082 UnicodeString
mname(msg
, msgLen
+ 1);
1084 res
= u_wmsg(fp
, "usage", mname
.getBuffer(), upname
.getBuffer());
1089 if (!u_wmsg(fp
, "help")) {
1090 /* Now dump callbacks and finish. */
1093 sizeof(transcode_callbacks
) / sizeof(*transcode_callbacks
);
1094 for (i
= 0; i
< count
; ++i
) {
1095 fprintf(fp
, " %s", transcode_callbacks
[i
].name
);
1105 main(int argc
, char **argv
)
1110 size_t bufsz
= DEFAULT_BUFSZ
;
1112 const char *fromcpage
= 0;
1113 const char *tocpage
= 0;
1114 const char *translit
= 0;
1115 const char *outfilestr
= 0;
1116 UBool fallback
= FALSE
;
1118 UConverterFromUCallback fromucallback
= UCNV_FROM_U_CALLBACK_STOP
;
1119 const void *fromuctxt
= 0;
1120 UConverterToUCallback toucallback
= UCNV_TO_U_CALLBACK_STOP
;
1121 const void *touctxt
= 0;
1123 char **iter
, **remainArgv
, **remainArgvLimit
;
1124 char **end
= argv
+ argc
;
1128 UBool printConvs
= FALSE
, printCanon
= FALSE
, printTranslits
= FALSE
;
1129 const char *printName
= 0;
1131 UBool verbose
= FALSE
;
1132 UErrorCode status
= U_ZERO_ERROR
;
1136 /* Initialize ICU */
1138 if (U_FAILURE(status
)) {
1139 fprintf(stderr
, "%s: can not initialize ICU. status = %s\n",
1140 argv
[0], u_errorName(status
));
1144 // Get and prettify pname.
1145 pname
= uprv_strrchr(*argv
, U_FILE_SEP_CHAR
);
1148 pname
= uprv_strrchr(*argv
, '/');
1157 // First, get the arguments from command-line
1158 // to know the codepages to convert between
1160 remainArgv
= remainArgvLimit
= argv
+ 1;
1161 for (iter
= argv
+ 1; iter
!= end
; iter
++) {
1162 // Check for from charset
1163 if (strcmp("-f", *iter
) == 0 || !strcmp("--from-code", *iter
)) {
1169 } else if (strcmp("-t", *iter
) == 0 || !strcmp("--to-code", *iter
)) {
1175 } else if (strcmp("-x", *iter
) == 0) {
1181 } else if (!strcmp("--fallback", *iter
)) {
1183 } else if (!strcmp("--no-fallback", *iter
)) {
1185 } else if (strcmp("-b", *iter
) == 0 || !strcmp("--block-size", *iter
)) {
1188 bufsz
= atoi(*iter
);
1189 if ((int) bufsz
<= 0) {
1191 UnicodeString
str(*iter
);
1193 u_wmsg(stderr
, "badBlockSize", str
.getTerminatedBuffer());
1199 } else if (strcmp("-l", *iter
) == 0 || !strcmp("--list", *iter
)) {
1200 if (printTranslits
) {
1204 } else if (strcmp("--default-code", *iter
) == 0) {
1205 if (printTranslits
) {
1208 printName
= ucnv_getDefaultName();
1209 } else if (strcmp("--list-code", *iter
) == 0) {
1210 if (printTranslits
) {
1216 UErrorCode e
= U_ZERO_ERROR
;
1217 printName
= ucnv_getAlias(*iter
, 0, &e
);
1218 if (U_FAILURE(e
) || !printName
) {
1219 UnicodeString
str(*iter
);
1221 u_wmsg(stderr
, "noSuchCodeset", str
.getTerminatedBuffer());
1226 } else if (strcmp("--canon", *iter
) == 0) {
1228 } else if (strcmp("-L", *iter
) == 0
1229 || !strcmp("--list-transliterators", *iter
)) {
1233 printTranslits
= TRUE
;
1234 } else if (strcmp("-h", *iter
) == 0 || !strcmp("-?", *iter
)
1235 || !strcmp("--help", *iter
)) {
1237 } else if (!strcmp("-c", *iter
)) {
1238 fromucallback
= UCNV_FROM_U_CALLBACK_SKIP
;
1239 } else if (!strcmp("--to-callback", *iter
)) {
1242 const struct callback_ent
*cbe
= findCallback(*iter
);
1244 fromucallback
= cbe
->fromu
;
1245 fromuctxt
= cbe
->fromuctxt
;
1247 UnicodeString
str(*iter
);
1249 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1255 } else if (!strcmp("--from-callback", *iter
)) {
1258 const struct callback_ent
*cbe
= findCallback(*iter
);
1260 toucallback
= cbe
->tou
;
1261 touctxt
= cbe
->touctxt
;
1263 UnicodeString
str(*iter
);
1265 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1271 } else if (!strcmp("-i", *iter
)) {
1272 toucallback
= UCNV_TO_U_CALLBACK_SKIP
;
1273 } else if (!strcmp("--callback", *iter
)) {
1276 const struct callback_ent
*cbe
= findCallback(*iter
);
1278 fromucallback
= cbe
->fromu
;
1279 fromuctxt
= cbe
->fromuctxt
;
1280 toucallback
= cbe
->tou
;
1281 touctxt
= cbe
->touctxt
;
1283 UnicodeString
str(*iter
);
1285 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1291 } else if (!strcmp("-s", *iter
) || !strcmp("--silent", *iter
)) {
1293 } else if (!strcmp("-v", *iter
) || !strcmp("--verbose", *iter
)) {
1295 } else if (!strcmp("-V", *iter
) || !strcmp("--version", *iter
)) {
1296 printf("%s v2.1 ICU " U_ICU_VERSION
"\n", pname
);
1298 } else if (!strcmp("-o", *iter
) || !strcmp("--output", *iter
)) {
1300 if (iter
!= end
&& !outfilestr
) {
1305 } else if (0 == strcmp("--add-signature", *iter
)) {
1307 } else if (0 == strcmp("--remove-signature", *iter
)) {
1309 } else if (**iter
== '-' && (*iter
)[1]) {
1312 // move a non-option up in argv[]
1313 *remainArgvLimit
++ = *iter
;
1317 if (printConvs
|| printName
) {
1318 return printConverters(pname
, printName
, printCanon
) ? 2 : 0;
1319 } else if (printTranslits
) {
1320 return printTransliterators(printCanon
) ? 3 : 0;
1323 if (!fromcpage
|| !uprv_strcmp(fromcpage
, "-")) {
1324 fromcpage
= ucnv_getDefaultName();
1326 if (!tocpage
|| !uprv_strcmp(tocpage
, "-")) {
1327 tocpage
= ucnv_getDefaultName();
1330 // Open the correct output file or connect to stdout for writing output
1331 if (outfilestr
!= 0 && strcmp(outfilestr
, "-")) {
1332 outfile
= fopen(outfilestr
, "wb");
1334 UnicodeString
str1(outfilestr
, "");
1335 UnicodeString
str2(strerror(errno
), "");
1337 u_wmsg(stderr
, "cantCreateOutputF",
1338 str1
.getBuffer(), str2
.getBuffer());
1344 #ifdef USE_FILENO_BINARY_MODE
1345 if (setmode(fileno(outfile
), O_BINARY
) == -1) {
1346 u_wmsg(stderr
, "cantSetOutBinMode");
1352 /* Loop again on the arguments to find all the input files, and
1355 cf
.setBufferSize(bufsz
);
1357 if(remainArgv
< remainArgvLimit
) {
1358 for (iter
= remainArgv
; iter
!= remainArgvLimit
; iter
++) {
1359 if (!cf
.convertFile(
1360 pname
, fromcpage
, toucallback
, touctxt
, tocpage
,
1361 fromucallback
, fromuctxt
, fallback
, translit
, *iter
,
1368 if (!cf
.convertFile(
1369 pname
, fromcpage
, toucallback
, touctxt
, tocpage
,
1370 fromucallback
, fromuctxt
, fallback
, translit
, 0,
1382 if (outfile
!= stdout
) {
1391 * Hey, Emacs, please set the following:
1394 * indent-tabs-mode: nil