1 /*****************************************************************************
3 * Copyright (C) 1999-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
6 ******************************************************************************/
9 * uconv(1): an iconv(1)-like converter using ICU.
11 * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se>
12 * contributed in 1999.
14 * Conversion to the C conversion API and many improvements by
15 * Yves Arrouye <yves@realnames.com>, current maintainer.
17 * Markus Scherer maintainer from 2003.
18 * See source code repository history for changes.
21 #include <unicode/utypes.h>
22 #include <unicode/putil.h>
23 #include <unicode/ucnv.h>
24 #include <unicode/uenum.h>
25 #include <unicode/unistr.h>
26 #include <unicode/translit.h>
27 #include <unicode/uset.h>
28 #include <unicode/uclean.h>
39 #include "unicode/uwmsg.h"
43 #if U_PLATFORM_USES_ONLY_WIN32_API && !defined(__STRICT_ANSI__)
46 #if U_PLATFORM_USES_ONLY_WIN32_API
47 #define USE_FILENO_BINARY_MODE 1
48 /* Windows likes to rename Unix-like functions */
50 #define fileno _fileno
53 #define setmode _setmode
56 #define O_BINARY _O_BINARY
62 /* below from the README */
63 #include "unicode/utypes.h"
64 #include "unicode/udata.h"
65 U_CFUNC
char uconvmsg_dat
[];
68 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
70 #define DEFAULT_BUFSZ 4096
71 #define UCONVMSG "uconvmsg"
73 static UResourceBundle
*gBundle
= 0; /* Bundle containing messages. */
76 * Initialize the message bundle so that message strings can be fetched
81 static void initMsg(const char *pname
) {
85 char dataPath
[2048]; /* XXX Sloppy: should be PATH_MAX. */
86 UErrorCode err
= U_ZERO_ERROR
;
90 /* Set up our static data - if any */
92 udata_setAppData(UCONVMSG
, (const void*) uconvmsg_dat
, &err
);
94 fprintf(stderr
, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
95 pname
, u_errorName(err
));
96 err
= U_ZERO_ERROR
; /* It may still fail */
101 gBundle
= u_wmsg_setPath(UCONVMSG
, &err
);
102 if (U_FAILURE(err
)) {
104 "%s: warning: couldn't open bundle %s: %s\n",
105 pname
, UCONVMSG
, u_errorName(err
));
108 "%s: setAppData was called, internal data %s failed to load\n",
113 /* that was try #1, try again with a path */
114 uprv_strcpy(dataPath
, u_getDataDirectory());
115 uprv_strcat(dataPath
, U_FILE_SEP_STRING
);
116 uprv_strcat(dataPath
, UCONVMSG
);
118 gBundle
= u_wmsg_setPath(dataPath
, &err
);
119 if (U_FAILURE(err
)) {
121 "%s: warning: still couldn't open bundle %s: %s\n",
122 pname
, dataPath
, u_errorName(err
));
123 fprintf(stderr
, "%s: warning: messages will not be displayed\n", pname
);
129 /* Mapping of callback names to the callbacks passed to the converter
132 static struct callback_ent
{
134 UConverterFromUCallback fromu
;
135 const void *fromuctxt
;
136 UConverterToUCallback tou
;
138 } transcode_callbacks
[] = {
140 UCNV_FROM_U_CALLBACK_SUBSTITUTE
, 0,
141 UCNV_TO_U_CALLBACK_SUBSTITUTE
, 0 },
143 UCNV_FROM_U_CALLBACK_SKIP
, 0,
144 UCNV_TO_U_CALLBACK_SKIP
, 0 },
146 UCNV_FROM_U_CALLBACK_STOP
, 0,
147 UCNV_TO_U_CALLBACK_STOP
, 0 },
149 UCNV_FROM_U_CALLBACK_ESCAPE
, 0,
150 UCNV_TO_U_CALLBACK_ESCAPE
, 0},
152 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_ICU
,
153 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_ICU
},
155 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_JAVA
,
156 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_JAVA
},
158 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_C
,
159 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_C
},
161 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
,
162 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
},
164 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
,
165 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
},
167 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_DEC
,
168 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_DEC
},
169 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_UNICODE
,
170 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_UNICODE
}
173 /* Return a pointer to a callback record given its name. */
175 static const struct callback_ent
*findCallback(const char *name
) {
177 sizeof(transcode_callbacks
) / sizeof(*transcode_callbacks
);
179 /* We'll do a linear search, there aren't many of them and bsearch()
180 may not be that portable. */
182 for (i
= 0; i
< count
; ++i
) {
183 if (!uprv_stricmp(name
, transcode_callbacks
[i
].name
)) {
184 return &transcode_callbacks
[i
];
191 /* Print converter information. If lookfor is set, only that converter will
192 be printed, otherwise all converters will be printed. If canon is non
193 zero, tags and aliases for each converter are printed too, in the format
194 expected for convrtrs.txt(5). */
196 static int printConverters(const char *pname
, const char *lookfor
,
199 UErrorCode err
= U_ZERO_ERROR
;
204 /* If there is a specified name, just handle that now. */
208 printf("%s\n", lookfor
);
211 /* Because we are printing a canonical name, we need the
212 true converter name. We've done that already except for
213 the default name (because we want to print the exact
214 name one would get when calling ucnv_getDefaultName()
215 in non-canon mode). But since we do not know at this
216 point if we have the default name or something else, we
217 need to normalize again to the canonical converter
220 const char *truename
= ucnv_getAlias(lookfor
, 0, &err
);
221 if (U_SUCCESS(err
)) {
229 /* Print converter names. We come here for one of two reasons: we
230 are printing all the names (lookfor was null), or we have a
231 single converter to print but in canon mode, hence we need to
232 get to it in order to print everything. */
234 num
= ucnv_countAvailable();
237 u_wmsg(stderr
, "cantGetNames");
241 num
= 1; /* We know where we want to be. */
244 num_stds
= ucnv_countStandards();
245 stds
= (const char **) uprv_malloc(num_stds
* sizeof(*stds
));
247 u_wmsg(stderr
, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR
));
255 for (s
= 0; s
< num_stds
; ++s
) {
256 stds
[s
] = ucnv_getStandard(s
, &err
);
258 printf("%s ", stds
[s
]);
260 if (U_FAILURE(err
)) {
261 u_wmsg(stderr
, "cantGetTag", u_wmsg_errorName(err
));
270 for (int32_t i
= 0; i
< num
; i
++) {
272 uint16_t num_aliases
;
274 /* Set the name either to what we are looking for, or
275 to the current converter name. */
280 name
= ucnv_getAvailableName(i
);
283 /* Get all the aliases associated to the name. */
286 num_aliases
= ucnv_countAliases(name
, &err
);
287 if (U_FAILURE(err
)) {
290 UnicodeString
str(name
, "");
292 u_wmsg(stderr
, "cantGetAliases", str
.getTerminatedBuffer(),
293 u_wmsg_errorName(err
));
298 /* Write all the aliases and their tags. */
300 for (a
= 0; a
< num_aliases
; ++a
) {
301 const char *alias
= ucnv_getAlias(name
, a
, &err
);
303 if (U_FAILURE(err
)) {
304 UnicodeString
str(name
, "");
306 u_wmsg(stderr
, "cantGetAliases", str
.getTerminatedBuffer(),
307 u_wmsg_errorName(err
));
311 /* Print the current alias so that it looks right. */
312 printf("%s%s%s", (canon
? (a
== 0? "" : "\t" ) : "") ,
316 /* Look (slowly, linear searching) for a tag. */
319 /* -1 to skip the last standard */
320 for (s
= t
= 0; s
< num_stds
-1; ++s
) {
321 UEnumeration
*nameEnum
= ucnv_openStandardNames(name
, stds
[s
], &err
);
322 if (U_SUCCESS(err
)) {
323 /* List the standard tags */
324 const char *standardName
;
325 UBool isFirst
= TRUE
;
326 UErrorCode enumError
= U_ZERO_ERROR
;
327 while ((standardName
= uenum_next(nameEnum
, NULL
, &enumError
))) {
328 /* See if this alias is supported by this standard. */
329 if (!strcmp(standardName
, alias
)) {
334 /* Print a * after the default standard name */
335 printf(" %s%s", stds
[s
], (isFirst
? "*" : ""));
345 /* Terminate this entry. */
352 /* Terminate this entry. */
359 /* Free temporary data. */
371 /* Print all available transliterators. If canon is non zero, print
372 one transliterator per line. */
374 static int printTransliterators(UBool canon
)
376 #if UCONFIG_NO_TRANSLITERATION
377 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
380 UErrorCode status
= U_ZERO_ERROR
;
381 UEnumeration
*ids
= utrans_openIDs(&status
);
382 int32_t i
, numtrans
= uenum_count(ids
, &status
);
384 char sepchar
= canon
? '\n' : ' ';
386 for (i
= 0; U_SUCCESS(status
)&& (i
< numtrans
); ++i
) {
388 const char *nextTrans
= uenum_next(ids
, &len
, &status
);
390 printf("%s", nextTrans
);
391 if (i
< numtrans
- 1) {
398 /* Add a terminating newline if needed. */
400 if (sepchar
!= '\n') {
412 uCR
= 0xd, // carriage return
413 uLF
= 0xa, // line feed
414 uNL
= 0x85, // newline
415 uLS
= 0x2028, // line separator
416 uPS
= 0x2029, // paragraph separator
417 uSig
= 0xfeff // signature/BOM character
420 static inline int32_t
421 getChunkLimit(const UnicodeString
&prev
, const UnicodeString
&s
) {
423 // CR, LF, CRLF, NL, LS, PS
424 // for paragraph ends (see UAX #13/Unicode 4)
425 // and include it in the chunk
426 // all of these characters are on the BMP
427 // do not include FF or VT in case they are part of a paragraph
428 // (important for bidi contexts)
429 static const UChar paraEnds
[] = {
430 0xd, 0xa, 0x85, 0x2028, 0x2029
433 iCR
, iLF
, iNL
, iLS
, iPS
, iCount
436 // first, see if there is a CRLF split between prev and s
437 if (prev
.endsWith(paraEnds
+ iCR
, 1)) {
438 if (s
.startsWith(paraEnds
+ iLF
, 1)) {
439 return 1; // split CRLF, include the LF
440 } else if (!s
.isEmpty()) {
441 return 0; // complete the last chunk
443 return -1; // wait for actual further contents to arrive
447 const UChar
*u
= s
.getBuffer(), *limit
= u
+ s
.length();
453 ((c
< uSP
) && (c
== uCR
|| c
== uLF
)) ||
460 return -1; // LF may be in the next chunk
461 } else if (*u
== uLF
) {
462 ++u
; // include the LF in this chunk
465 return (int32_t)(u
- s
.getBuffer());
469 return -1; // continue collecting the chunk
473 CNV_NO_FEFF
, // cannot convert the U+FEFF Unicode signature character (BOM)
474 CNV_WITH_FEFF
, // can convert the U+FEFF signature character
475 CNV_ADDS_FEFF
// automatically adds/detects the U+FEFF signature character
479 nibbleToHex(uint8_t n
) {
484 (UChar
)((0x61 - 10) + n
);
487 // check the converter's Unicode signature properties;
488 // the fromUnicode side of the converter must be in its initial state
489 // and will be reset again if it was used
491 cnvSigType(UConverter
*cnv
) {
495 // test if the output charset can convert U+FEFF
496 USet
*set
= uset_open(1, 0);
498 ucnv_getUnicodeSet(cnv
, set
, UCNV_ROUNDTRIP_SET
, &err
);
499 if (U_SUCCESS(err
) && uset_contains(set
, uSig
)) {
500 result
= CNV_WITH_FEFF
;
502 result
= CNV_NO_FEFF
; // an error occurred or U+FEFF cannot be converted
506 if (result
== CNV_WITH_FEFF
) {
507 // test if the output charset emits a signature anyway
508 const UChar a
[1] = { 0x61 }; // "a"
517 ucnv_fromUnicode(cnv
,
518 &out
, buffer
+ sizeof(buffer
),
521 ucnv_resetFromUnicode(cnv
);
523 if (NULL
!= ucnv_detectUnicodeSignature(buffer
, (int32_t)(out
- buffer
), NULL
, &err
) &&
526 result
= CNV_ADDS_FEFF
;
536 buf(NULL
), outbuf(NULL
), fromoffsets(NULL
),
537 bufsz(0), signature(0) {}
540 setBufferSize(size_t bufferSize
) {
543 buf
= new char[2 * bufsz
];
544 outbuf
= buf
+ bufsz
;
546 // +1 for an added U+FEFF in the intermediate Unicode buffer
547 fromoffsets
= new int32_t[bufsz
+ 1];
552 delete [] fromoffsets
;
555 UBool
convertFile(const char *pname
,
556 const char *fromcpage
,
557 UConverterToUCallback toucallback
,
560 UConverterFromUCallback fromucallback
,
561 const void *fromuctxt
,
563 const char *translit
,
564 const char *infilestr
,
565 FILE * outfile
, int verbose
);
567 friend int main(int argc
, char **argv
);
570 int32_t *fromoffsets
;
573 int8_t signature
; // add (1) or remove (-1) a U+FEFF Unicode signature character
576 // Convert a file from one encoding to another
578 ConvertFile::convertFile(const char *pname
,
579 const char *fromcpage
,
580 UConverterToUCallback toucallback
,
583 UConverterFromUCallback fromucallback
,
584 const void *fromuctxt
,
586 const char *translit
,
587 const char *infilestr
,
588 FILE * outfile
, int verbose
)
592 UConverter
*convfrom
= 0;
593 UConverter
*convto
= 0;
594 UErrorCode err
= U_ZERO_ERROR
;
596 const char *cbufp
, *prevbufp
;
599 uint32_t infoffset
= 0, outfoffset
= 0; /* Where we are in the file, for error reporting. */
601 const UChar
*unibuf
, *unibufbp
;
606 #if !UCONFIG_NO_TRANSLITERATION
607 Transliterator
*t
= 0; // Transliterator acting on Unicode data.
608 UnicodeString chunk
; // One chunk of the text being collected for transformation.
610 UnicodeString u
; // String to do the transliteration.
613 // use conversion offsets for error messages
614 // unless a transliterator is used -
615 // a text transformation will reorder characters in unpredictable ways
616 UBool useOffsets
= TRUE
;
618 // Open the correct input file or connect to stdin for reading input
620 if (infilestr
!= 0 && strcmp(infilestr
, "-")) {
621 infile
= fopen(infilestr
, "rb");
623 UnicodeString
str1(infilestr
, "");
624 str1
.append((UChar32
) 0);
625 UnicodeString
str2(strerror(errno
), "");
626 str2
.append((UChar32
) 0);
628 u_wmsg(stderr
, "cantOpenInputF", str1
.getBuffer(), str2
.getBuffer());
634 #ifdef USE_FILENO_BINARY_MODE
635 if (setmode(fileno(stdin
), O_BINARY
) == -1) {
637 u_wmsg(stderr
, "cantSetInBinMode");
644 fprintf(stderr
, "%s:\n", infilestr
);
647 #if !UCONFIG_NO_TRANSLITERATION
648 // Create transliterator as needed.
650 if (translit
!= NULL
&& *translit
) {
652 UnicodeString
str(translit
), pestr
;
654 /* Create from rules or by ID as needed. */
658 if (uprv_strchr(translit
, ':') || uprv_strchr(translit
, '>') || uprv_strchr(translit
, '<') || uprv_strchr(translit
, '>')) {
659 t
= Transliterator::createFromRules("Uconv", str
, UTRANS_FORWARD
, parse
, err
);
661 t
= Transliterator::createInstance(translit
, UTRANS_FORWARD
, err
);
664 if (U_FAILURE(err
)) {
665 str
.append((UChar32
) 0);
668 if (parse
.line
>= 0) {
669 UChar linebuf
[20], offsetbuf
[20];
670 uprv_itou(linebuf
, 20, parse
.line
, 10, 0);
671 uprv_itou(offsetbuf
, 20, parse
.offset
, 10, 0);
672 u_wmsg(stderr
, "cantCreateTranslitParseErr", str
.getTerminatedBuffer(),
673 u_wmsg_errorName(err
), linebuf
, offsetbuf
);
675 u_wmsg(stderr
, "cantCreateTranslit", str
.getTerminatedBuffer(),
676 u_wmsg_errorName(err
));
690 // Create codepage converter. If the codepage or its aliases weren't
691 // available, it returns NULL and a failure code. We also set the
692 // callbacks, and return errors in the same way.
694 convfrom
= ucnv_open(fromcpage
, &err
);
695 if (U_FAILURE(err
)) {
696 UnicodeString
str(fromcpage
, "");
698 u_wmsg(stderr
, "cantOpenFromCodeset", str
.getTerminatedBuffer(),
699 u_wmsg_errorName(err
));
702 ucnv_setToUCallBack(convfrom
, toucallback
, touctxt
, 0, 0, &err
);
703 if (U_FAILURE(err
)) {
705 u_wmsg(stderr
, "cantSetCallback", u_wmsg_errorName(err
));
709 convto
= ucnv_open(tocpage
, &err
);
710 if (U_FAILURE(err
)) {
711 UnicodeString
str(tocpage
, "");
713 u_wmsg(stderr
, "cantOpenToCodeset", str
.getTerminatedBuffer(),
714 u_wmsg_errorName(err
));
717 ucnv_setFromUCallBack(convto
, fromucallback
, fromuctxt
, 0, 0, &err
);
718 if (U_FAILURE(err
)) {
720 u_wmsg(stderr
, "cantSetCallback", u_wmsg_errorName(err
));
723 ucnv_setFallback(convto
, fallback
);
725 UBool willexit
, fromSawEndOfBytes
, toSawEndOfUnicode
;
728 // OK, we can convert now.
735 // input file offset at the beginning of the next buffer
738 rd
= fread(buf
, 1, bufsz
, infile
);
739 if (ferror(infile
) != 0) {
740 UnicodeString
str(strerror(errno
));
742 u_wmsg(stderr
, "cantRead", str
.getTerminatedBuffer());
746 // Convert the read buffer into the new encoding via Unicode.
747 // After the call 'unibufp' will be placed behind the last
748 // character that was converted in the 'unibuf'.
749 // Also the 'cbufp' is positioned behind the last converted
751 // At the last conversion in the file, flush should be set to
752 // true so that we get all characters converted.
754 // The converter must be flushed at the end of conversion so
755 // that characters on hold also will be written.
758 flush
= (UBool
)(rd
!= bufsz
);
760 // convert until the input is consumed
762 // remember the start of the current byte-to-Unicode conversion
765 unibuf
= unibufp
= u
.getBuffer((int32_t)bufsz
);
767 // Use bufsz instead of u.getCapacity() for the targetLimit
768 // so that we don't overflow fromoffsets[].
769 ucnv_toUnicode(convfrom
, &unibufp
, unibuf
+ bufsz
, &cbufp
,
770 buf
+ rd
, useOffsets
? fromoffsets
: NULL
, flush
, &err
);
772 ulen
= (int32_t)(unibufp
- unibuf
);
773 u
.releaseBuffer(U_SUCCESS(err
) ? ulen
: 0);
775 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
776 // converting all of the input bytes.
777 // It works like this because ucnv_toUnicode() returns only under the
778 // following conditions:
779 // - an error occurred during conversion (an error code is set)
780 // - the target buffer is filled (the error code indicates an overflow)
781 // - the source is consumed
782 // That is, if the error code does not indicate a failure,
783 // not even an overflow, then the source must be consumed entirely.
784 fromSawEndOfBytes
= (UBool
)U_SUCCESS(err
);
786 if (err
== U_BUFFER_OVERFLOW_ERROR
) {
788 } else if (U_FAILURE(err
)) {
789 char pos
[32], errorBytes
[32];
790 int8_t i
, length
, errorLength
;
792 UErrorCode localError
= U_ZERO_ERROR
;
793 errorLength
= (int8_t)sizeof(errorBytes
);
794 ucnv_getInvalidChars(convfrom
, errorBytes
, &errorLength
, &localError
);
795 if (U_FAILURE(localError
) || errorLength
== 0) {
799 // print the input file offset of the start of the error bytes:
800 // input file offset of the current byte buffer +
801 // length of the just consumed bytes -
802 // length of the error bytes
804 (int8_t)sprintf(pos
, "%d",
805 (int)(infoffset
+ (cbufp
- buf
) - errorLength
));
807 // output the bytes that caused the error
809 for (i
= 0; i
< errorLength
; ++i
) {
811 str
.append((UChar
)uSP
);
813 str
.append(nibbleToHex((uint8_t)errorBytes
[i
] >> 4));
814 str
.append(nibbleToHex((uint8_t)errorBytes
[i
]));
818 u_wmsg(stderr
, "problemCvtToU",
819 UnicodeString(pos
, length
, "").getTerminatedBuffer(),
820 str
.getTerminatedBuffer(),
821 u_wmsg_errorName(err
));
824 err
= U_ZERO_ERROR
; /* reset the error for the rest of the conversion. */
827 // Replaced a check for whether the input was consumed by
828 // looping until it is; message key "premEndInput" now obsolete.
834 // remove a U+FEFF Unicode signature character if requested
836 if (u
.charAt(0) == uSig
) {
839 // account for the removed UChar and offset
843 // remove an offset from fromoffsets[] as well
844 // to keep the array parallel with the UChars
845 memmove(fromoffsets
, fromoffsets
+ 1, ulen
* 4);
852 #if !UCONFIG_NO_TRANSLITERATION
853 // Transliterate/transform if needed.
855 // For transformation, we use chunking code -
856 // collect Unicode input until, for example, an end-of-line,
857 // then transform and output-convert that and continue collecting.
858 // This makes the transformation result independent of the buffer size
859 // while avoiding the slower keyboard mode.
860 // The end-of-chunk characters are completely included in the
861 // transformed string in case they are to be transformed themselves.
867 chunkLimit
= getChunkLimit(chunk
, u
);
868 if (chunkLimit
< 0 && flush
&& fromSawEndOfBytes
) {
869 // use all of the rest at the end of the text
870 chunkLimit
= u
.length();
872 if (chunkLimit
>= 0) {
873 // complete the chunk and transform it
874 chunk
.append(u
, 0, chunkLimit
);
875 u
.remove(0, chunkLimit
);
876 t
->transliterate(chunk
);
878 // append the transformation result to the result and empty the chunk
882 // continue collecting the chunk
886 } while (!u
.isEmpty());
893 // add a U+FEFF Unicode signature character if requested
894 // and possible/necessary
896 if (u
.charAt(0) != uSig
&& cnvSigType(convto
) == CNV_WITH_FEFF
) {
897 u
.insert(0, (UChar
)uSig
);
900 // insert a pseudo-offset into fromoffsets[] as well
901 // to keep the array parallel with the UChars
902 memmove(fromoffsets
+ 1, fromoffsets
, ulen
* 4);
906 // account for the additional UChar and offset
912 // Convert the Unicode buffer into the destination codepage
913 // Again 'bufp' will be placed behind the last converted character
914 // And 'unibufbp' will be placed behind the last converted Unicode character
915 // At the last conversion flush should be set to true to ensure that
916 // all characters left get converted
918 unibuf
= unibufbp
= u
.getBuffer();
923 // Use fromSawEndOfBytes in addition to the flush flag -
924 // it indicates whether the intermediate Unicode string
925 // contains the very last UChars for the very last input bytes.
926 ucnv_fromUnicode(convto
, &bufp
, outbuf
+ bufsz
,
929 NULL
, (UBool
)(flush
&& fromSawEndOfBytes
), &err
);
931 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
932 // converting all of the intermediate UChars.
933 // See comment for fromSawEndOfBytes.
934 toSawEndOfUnicode
= (UBool
)U_SUCCESS(err
);
936 if (err
== U_BUFFER_OVERFLOW_ERROR
) {
938 } else if (U_FAILURE(err
)) {
939 UChar errorUChars
[4];
943 int8_t i
, length
, errorLength
;
945 UErrorCode localError
= U_ZERO_ERROR
;
946 errorLength
= (int8_t)LENGTHOF(errorUChars
);
947 ucnv_getInvalidUChars(convto
, errorUChars
, &errorLength
, &localError
);
948 if (U_FAILURE(localError
) || errorLength
== 0) {
949 // need at least 1 so that we don't access beyond the length of fromoffsets[]
956 // Unicode buffer offset of the start of the error UChars
957 ferroffset
= (int32_t)((unibufbp
- unibuf
) - errorLength
);
958 if (ferroffset
< 0) {
959 // approximation - the character started in the previous Unicode buffer
963 // get the corresponding byte offset out of fromoffsets[]
964 // go back if the offset is not known for some of the UChars
967 fromoffset
= fromoffsets
[ferroffset
];
968 } while (fromoffset
< 0 && --ferroffset
>= 0);
970 // total input file offset =
971 // input file offset of the current byte buffer +
972 // byte buffer offset of where the current Unicode buffer is converted from +
973 // fromoffsets[Unicode offset]
974 ferroffset
= infoffset
+ (prevbufp
- buf
) + fromoffset
;
975 errtag
= "problemCvtFromU";
977 // Do not use fromoffsets if (t != NULL) because the Unicode text may
978 // be different from what the offsets refer to.
980 // output file offset
981 ferroffset
= (int32_t)(outfoffset
+ (bufp
- outbuf
));
982 errtag
= "problemCvtFromUOut";
985 length
= (int8_t)sprintf(pos
, "%u", (int)ferroffset
);
987 // output the code points that caused the error
989 for (i
= 0; i
< errorLength
;) {
991 str
.append((UChar
)uSP
);
993 U16_NEXT(errorUChars
, i
, errorLength
, c
);
995 str
.append(nibbleToHex((uint8_t)(c
>> 20)));
998 str
.append(nibbleToHex((uint8_t)(c
>> 16)));
1000 str
.append(nibbleToHex((uint8_t)(c
>> 12)));
1001 str
.append(nibbleToHex((uint8_t)(c
>> 8)));
1002 str
.append(nibbleToHex((uint8_t)(c
>> 4)));
1003 str
.append(nibbleToHex((uint8_t)c
));
1007 u_wmsg(stderr
, errtag
,
1008 UnicodeString(pos
, length
, "").getTerminatedBuffer(),
1009 str
.getTerminatedBuffer(),
1010 u_wmsg_errorName(err
));
1011 u_wmsg(stderr
, "errorUnicode", str
.getTerminatedBuffer());
1014 err
= U_ZERO_ERROR
; /* reset the error for the rest of the conversion. */
1017 // Replaced a check for whether the intermediate Unicode characters were all consumed by
1018 // looping until they are; message key "premEnd" now obsolete.
1020 // Finally, write the converted buffer to the output file
1021 size_t outlen
= (size_t) (bufp
- outbuf
);
1022 outfoffset
+= (int32_t)(wr
= fwrite(outbuf
, 1, outlen
, outfile
));
1024 UnicodeString
str(strerror(errno
));
1026 u_wmsg(stderr
, "cantWrite", str
.getTerminatedBuffer());
1033 } while (!toSawEndOfUnicode
);
1034 } while (!fromSawEndOfBytes
);
1035 } while (!flush
); // Stop when we have flushed the
1036 // converters (this means that it's
1037 // the end of output)
1047 ucnv_close(convfrom
);
1050 #if !UCONFIG_NO_TRANSLITERATION
1054 if (infile
!= stdin
) {
1061 static void usage(const char *pname
, int ecode
) {
1064 UErrorCode err
= U_ZERO_ERROR
;
1065 FILE *fp
= ecode
? stderr
: stdout
;
1070 ures_getStringByKey(gBundle
, ecode
? "lcUsageWord" : "ucUsageWord",
1072 UnicodeString
upname(pname
, (int32_t)(uprv_strlen(pname
) + 1));
1073 UnicodeString
mname(msg
, msgLen
+ 1);
1075 res
= u_wmsg(fp
, "usage", mname
.getBuffer(), upname
.getBuffer());
1080 if (!u_wmsg(fp
, "help")) {
1081 /* Now dump callbacks and finish. */
1084 sizeof(transcode_callbacks
) / sizeof(*transcode_callbacks
);
1085 for (i
= 0; i
< count
; ++i
) {
1086 fprintf(fp
, " %s", transcode_callbacks
[i
].name
);
1096 main(int argc
, char **argv
)
1101 size_t bufsz
= DEFAULT_BUFSZ
;
1103 const char *fromcpage
= 0;
1104 const char *tocpage
= 0;
1105 const char *translit
= 0;
1106 const char *outfilestr
= 0;
1107 UBool fallback
= FALSE
;
1109 UConverterFromUCallback fromucallback
= UCNV_FROM_U_CALLBACK_STOP
;
1110 const void *fromuctxt
= 0;
1111 UConverterToUCallback toucallback
= UCNV_TO_U_CALLBACK_STOP
;
1112 const void *touctxt
= 0;
1114 char **iter
, **remainArgv
, **remainArgvLimit
;
1115 char **end
= argv
+ argc
;
1119 UBool printConvs
= FALSE
, printCanon
= FALSE
, printTranslits
= FALSE
;
1120 const char *printName
= 0;
1122 UBool verbose
= FALSE
;
1123 UErrorCode status
= U_ZERO_ERROR
;
1127 /* Initialize ICU */
1129 if (U_FAILURE(status
)) {
1130 fprintf(stderr
, "%s: can not initialize ICU. status = %s\n",
1131 argv
[0], u_errorName(status
));
1135 // Get and prettify pname.
1136 pname
= uprv_strrchr(*argv
, U_FILE_SEP_CHAR
);
1137 #if U_PLATFORM_USES_ONLY_WIN32_API
1139 pname
= uprv_strrchr(*argv
, '/');
1148 // First, get the arguments from command-line
1149 // to know the codepages to convert between
1151 remainArgv
= remainArgvLimit
= argv
+ 1;
1152 for (iter
= argv
+ 1; iter
!= end
; iter
++) {
1153 // Check for from charset
1154 if (strcmp("-f", *iter
) == 0 || !strcmp("--from-code", *iter
)) {
1160 } else if (strcmp("-t", *iter
) == 0 || !strcmp("--to-code", *iter
)) {
1166 } else if (strcmp("-x", *iter
) == 0) {
1172 } else if (!strcmp("--fallback", *iter
)) {
1174 } else if (!strcmp("--no-fallback", *iter
)) {
1176 } else if (strcmp("-b", *iter
) == 0 || !strcmp("--block-size", *iter
)) {
1179 bufsz
= atoi(*iter
);
1180 if ((int) bufsz
<= 0) {
1182 UnicodeString
str(*iter
);
1184 u_wmsg(stderr
, "badBlockSize", str
.getTerminatedBuffer());
1190 } else if (strcmp("-l", *iter
) == 0 || !strcmp("--list", *iter
)) {
1191 if (printTranslits
) {
1195 } else if (strcmp("--default-code", *iter
) == 0) {
1196 if (printTranslits
) {
1199 printName
= ucnv_getDefaultName();
1200 } else if (strcmp("--list-code", *iter
) == 0) {
1201 if (printTranslits
) {
1207 UErrorCode e
= U_ZERO_ERROR
;
1208 printName
= ucnv_getAlias(*iter
, 0, &e
);
1209 if (U_FAILURE(e
) || !printName
) {
1210 UnicodeString
str(*iter
);
1212 u_wmsg(stderr
, "noSuchCodeset", str
.getTerminatedBuffer());
1217 } else if (strcmp("--canon", *iter
) == 0) {
1219 } else if (strcmp("-L", *iter
) == 0
1220 || !strcmp("--list-transliterators", *iter
)) {
1224 printTranslits
= TRUE
;
1225 } else if (strcmp("-h", *iter
) == 0 || !strcmp("-?", *iter
)
1226 || !strcmp("--help", *iter
)) {
1228 } else if (!strcmp("-c", *iter
)) {
1229 fromucallback
= UCNV_FROM_U_CALLBACK_SKIP
;
1230 } else if (!strcmp("--to-callback", *iter
)) {
1233 const struct callback_ent
*cbe
= findCallback(*iter
);
1235 fromucallback
= cbe
->fromu
;
1236 fromuctxt
= cbe
->fromuctxt
;
1238 UnicodeString
str(*iter
);
1240 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1246 } else if (!strcmp("--from-callback", *iter
)) {
1249 const struct callback_ent
*cbe
= findCallback(*iter
);
1251 toucallback
= cbe
->tou
;
1252 touctxt
= cbe
->touctxt
;
1254 UnicodeString
str(*iter
);
1256 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1262 } else if (!strcmp("-i", *iter
)) {
1263 toucallback
= UCNV_TO_U_CALLBACK_SKIP
;
1264 } else if (!strcmp("--callback", *iter
)) {
1267 const struct callback_ent
*cbe
= findCallback(*iter
);
1269 fromucallback
= cbe
->fromu
;
1270 fromuctxt
= cbe
->fromuctxt
;
1271 toucallback
= cbe
->tou
;
1272 touctxt
= cbe
->touctxt
;
1274 UnicodeString
str(*iter
);
1276 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1282 } else if (!strcmp("-s", *iter
) || !strcmp("--silent", *iter
)) {
1284 } else if (!strcmp("-v", *iter
) || !strcmp("--verbose", *iter
)) {
1286 } else if (!strcmp("-V", *iter
) || !strcmp("--version", *iter
)) {
1287 printf("%s v2.1 ICU " U_ICU_VERSION
"\n", pname
);
1289 } else if (!strcmp("-o", *iter
) || !strcmp("--output", *iter
)) {
1291 if (iter
!= end
&& !outfilestr
) {
1296 } else if (0 == strcmp("--add-signature", *iter
)) {
1298 } else if (0 == strcmp("--remove-signature", *iter
)) {
1300 } else if (**iter
== '-' && (*iter
)[1]) {
1303 // move a non-option up in argv[]
1304 *remainArgvLimit
++ = *iter
;
1308 if (printConvs
|| printName
) {
1309 return printConverters(pname
, printName
, printCanon
) ? 2 : 0;
1310 } else if (printTranslits
) {
1311 return printTransliterators(printCanon
) ? 3 : 0;
1314 if (!fromcpage
|| !uprv_strcmp(fromcpage
, "-")) {
1315 fromcpage
= ucnv_getDefaultName();
1317 if (!tocpage
|| !uprv_strcmp(tocpage
, "-")) {
1318 tocpage
= ucnv_getDefaultName();
1321 // Open the correct output file or connect to stdout for writing output
1322 if (outfilestr
!= 0 && strcmp(outfilestr
, "-")) {
1323 outfile
= fopen(outfilestr
, "wb");
1325 UnicodeString
str1(outfilestr
, "");
1326 UnicodeString
str2(strerror(errno
), "");
1328 u_wmsg(stderr
, "cantCreateOutputF",
1329 str1
.getBuffer(), str2
.getBuffer());
1335 #ifdef USE_FILENO_BINARY_MODE
1336 if (setmode(fileno(outfile
), O_BINARY
) == -1) {
1337 u_wmsg(stderr
, "cantSetOutBinMode");
1343 /* Loop again on the arguments to find all the input files, and
1346 cf
.setBufferSize(bufsz
);
1348 if(remainArgv
< remainArgvLimit
) {
1349 for (iter
= remainArgv
; iter
!= remainArgvLimit
; iter
++) {
1350 if (!cf
.convertFile(
1351 pname
, fromcpage
, toucallback
, touctxt
, tocpage
,
1352 fromucallback
, fromuctxt
, fallback
, translit
, *iter
,
1359 if (!cf
.convertFile(
1360 pname
, fromcpage
, toucallback
, touctxt
, tocpage
,
1361 fromucallback
, fromuctxt
, fallback
, translit
, 0,
1370 #if !UCONFIG_NO_LEGACY_CONVERSION
1373 fprintf(stderr
, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n");
1377 if (outfile
!= stdout
) {
1386 * Hey, Emacs, please set the following:
1389 * indent-tabs-mode: nil