1 /*****************************************************************************
3 * Copyright (C) 1999-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
6 ******************************************************************************/
9 * uconv(1): an iconv(1)-like converter using ICU.
11 * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se>
12 * contributed in 1999.
14 * Conversion to the C conversion API and many improvements by
15 * Yves Arrouye <yves@realnames.com>, current maintainer.
17 * Markus Scherer maintainer from 2003.
18 * See source code repository history for changes.
21 #include <unicode/utypes.h>
22 #include <unicode/putil.h>
23 #include <unicode/ucnv.h>
24 #include <unicode/uenum.h>
25 #include <unicode/unistr.h>
26 #include <unicode/translit.h>
27 #include <unicode/uset.h>
28 #include <unicode/uclean.h>
39 #include "unicode/uwmsg.h"
43 #if (defined(U_WINDOWS) || defined(U_CYGWIN)) && !defined(__STRICT_ANSI__)
46 #if defined(U_WINDOWS)
47 #define USE_FILENO_BINARY_MODE 1
48 /* Windows likes to rename Unix-like functions */
50 #define fileno _fileno
53 #define setmode _setmode
56 #define O_BINARY _O_BINARY
62 /* below from the README */
63 #include "unicode/utypes.h"
64 #include "unicode/udata.h"
65 U_CFUNC
char uconvmsg_dat
[];
68 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
70 #define DEFAULT_BUFSZ 4096
71 #define UCONVMSG "uconvmsg"
73 static UResourceBundle
*gBundle
= 0; /* Bundle containing messages. */
76 * Initialize the message bundle so that message strings can be fetched
81 static void initMsg(const char *pname
) {
85 char dataPath
[2048]; /* XXX Sloppy: should be PATH_MAX. */
86 UErrorCode err
= U_ZERO_ERROR
;
90 /* Set up our static data - if any */
92 udata_setAppData(UCONVMSG
, (const void*) uconvmsg_dat
, &err
);
94 fprintf(stderr
, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
95 pname
, u_errorName(err
));
96 err
= U_ZERO_ERROR
; /* It may still fail */
101 gBundle
= u_wmsg_setPath(UCONVMSG
, &err
);
102 if (U_FAILURE(err
)) {
104 "%s: warning: couldn't open bundle %s: %s\n",
105 pname
, UCONVMSG
, u_errorName(err
));
108 "%s: setAppData was called, internal data %s failed to load\n",
113 /* that was try #1, try again with a path */
114 uprv_strcpy(dataPath
, u_getDataDirectory());
115 uprv_strcat(dataPath
, U_FILE_SEP_STRING
);
116 uprv_strcat(dataPath
, UCONVMSG
);
118 gBundle
= u_wmsg_setPath(dataPath
, &err
);
119 if (U_FAILURE(err
)) {
121 "%s: warning: still couldn't open bundle %s: %s\n",
122 pname
, dataPath
, u_errorName(err
));
123 fprintf(stderr
, "%s: warning: messages will not be displayed\n", pname
);
129 /* Mapping of callback names to the callbacks passed to the converter
132 static struct callback_ent
{
134 UConverterFromUCallback fromu
;
135 const void *fromuctxt
;
136 UConverterToUCallback tou
;
138 } transcode_callbacks
[] = {
140 UCNV_FROM_U_CALLBACK_SUBSTITUTE
, 0,
141 UCNV_TO_U_CALLBACK_SUBSTITUTE
, 0 },
143 UCNV_FROM_U_CALLBACK_SKIP
, 0,
144 UCNV_TO_U_CALLBACK_SKIP
, 0 },
146 UCNV_FROM_U_CALLBACK_STOP
, 0,
147 UCNV_TO_U_CALLBACK_STOP
, 0 },
149 UCNV_FROM_U_CALLBACK_ESCAPE
, 0,
150 UCNV_TO_U_CALLBACK_ESCAPE
, 0},
152 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_ICU
,
153 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_ICU
},
155 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_JAVA
,
156 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_JAVA
},
158 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_C
,
159 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_C
},
161 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
,
162 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
},
164 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
,
165 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
},
167 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_DEC
,
168 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_DEC
},
169 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_UNICODE
,
170 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_UNICODE
}
173 /* Return a pointer to a callback record given its name. */
175 static const struct callback_ent
*findCallback(const char *name
) {
177 sizeof(transcode_callbacks
) / sizeof(*transcode_callbacks
);
179 /* We'll do a linear search, there aren't many of them and bsearch()
180 may not be that portable. */
182 for (i
= 0; i
< count
; ++i
) {
183 if (!uprv_stricmp(name
, transcode_callbacks
[i
].name
)) {
184 return &transcode_callbacks
[i
];
191 /* Print converter information. If lookfor is set, only that converter will
192 be printed; otherwise all converters will be printed. If canon is
193 nonzero, tags and aliases for each converter are printed too, in the format
194 expected for convrtrs.txt(5). */
196 static int printConverters(const char *pname
, const char *lookfor
,
199 UErrorCode err
= U_ZERO_ERROR
;
204 /* If there is a specified name, just handle that now. */
208 printf("%s\n", lookfor
);
211 /* Because we are printing a canonical name, we need the
212 true converter name. We've done that already except for
213 the default name (because we want to print the exact
214 name one would get when calling ucnv_getDefaultName()
215 in non-canon mode). But since we do not know at this
216 point if we have the default name or something else, we
217 need to normalize again to the canonical converter
220 const char *truename
= ucnv_getAlias(lookfor
, 0, &err
);
221 if (U_SUCCESS(err
)) {
229 /* Print converter names. We come here for one of two reasons: we
230 are printing all the names (lookfor was null), or we have a
231 single converter to print but in canon mode, hence we need to
232 get to it in order to print everything. */
234 num
= ucnv_countAvailable();
237 u_wmsg(stderr
, "cantGetNames");
241 num
= 1; /* We know where we want to be. */
244 num_stds
= ucnv_countStandards();
245 stds
= (const char **) uprv_malloc(num_stds
* sizeof(*stds
));
247 u_wmsg(stderr
, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR
));
255 for (s
= 0; s
< num_stds
; ++s
) {
256 stds
[s
] = ucnv_getStandard(s
, &err
);
258 printf("%s ", stds
[s
]);
260 if (U_FAILURE(err
)) {
261 u_wmsg(stderr
, "cantGetTag", u_wmsg_errorName(err
));
270 for (int32_t i
= 0; i
< num
; i
++) {
272 uint16_t num_aliases
;
274 /* Set the name either to what we are looking for, or
275 to the current converter name. */
280 name
= ucnv_getAvailableName(i
);
283 /* Get all the aliases associated to the name. */
286 num_aliases
= ucnv_countAliases(name
, &err
);
287 if (U_FAILURE(err
)) {
290 UnicodeString
str(name
, "");
292 u_wmsg(stderr
, "cantGetAliases", str
.getTerminatedBuffer(),
293 u_wmsg_errorName(err
));
298 /* Write all the aliases and their tags. */
300 for (a
= 0; a
< num_aliases
; ++a
) {
301 const char *alias
= ucnv_getAlias(name
, a
, &err
);
303 if (U_FAILURE(err
)) {
304 UnicodeString
str(name
, "");
306 u_wmsg(stderr
, "cantGetAliases", str
.getTerminatedBuffer(),
307 u_wmsg_errorName(err
));
311 /* Print the current alias so that it looks right. */
312 printf("%s%s%s", (canon
? (a
== 0? "" : "\t" ) : "") ,
316 /* Look (slowly, linear searching) for a tag. */
319 /* -1 to skip the last standard */
320 for (s
= t
= 0; s
< num_stds
-1; ++s
) {
321 UEnumeration
*nameEnum
= ucnv_openStandardNames(name
, stds
[s
], &err
);
322 if (U_SUCCESS(err
)) {
323 /* List the standard tags */
324 const char *standardName
;
325 UBool isFirst
= TRUE
;
326 UErrorCode enumError
= U_ZERO_ERROR
;
327 while ((standardName
= uenum_next(nameEnum
, NULL
, &enumError
))) {
328 /* See if this alias is supported by this standard. */
329 if (!strcmp(standardName
, alias
)) {
334 /* Print a * after the default standard name */
335 printf(" %s%s", stds
[s
], (isFirst
? "*" : ""));
345 /* Terminate this entry. */
352 /* Terminate this entry. */
359 /* Free temporary data. */
371 /* Print all available transliterators. If canon is nonzero, print
372 one transliterator per line. */
374 static int printTransliterators(UBool canon
)
376 #if UCONFIG_NO_TRANSLITERATION
377 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
380 int32_t numtrans
= utrans_countAvailableIDs(), i
;
382 char *buf
= (char *) uprv_malloc(buflen
);
385 char sepchar
= canon
? '\n' : ' ';
389 buflen
= sizeof(staticbuf
);
392 for (i
= 0; i
< numtrans
; ++i
) {
393 int32_t len
= utrans_getAvailableID(i
, buf
, buflen
);
394 if (len
>= buflen
- 1) {
395 if (buf
!= staticbuf
) {
400 buf
= (char *) uprv_realloc(buf
, buflen
);
403 buflen
= sizeof(staticbuf
);
406 utrans_getAvailableID(i
, buf
, buflen
);
408 uprv_strcpy(buf
+ buflen
- 4, "..."); /* Truncate the name. */
413 if (i
< numtrans
- 1) {
418 /* Add a terminating newline if needed. */
420 if (sepchar
!= '\n') {
424 /* Free temporary data. */
426 if (buf
!= staticbuf
) {
438 uCR
= 0xd, // carriage return
439 uLF
= 0xa, // line feed
440 uNL
= 0x85, // newline
441 uLS
= 0x2028, // line separator
442 uPS
= 0x2029, // paragraph separator
443 uSig
= 0xfeff // signature/BOM character
446 static inline int32_t
447 getChunkLimit(const UnicodeString
&prev
, const UnicodeString
&s
) {
449 // CR, LF, CRLF, NL, LS, PS
450 // for paragraph ends (see UAX #13/Unicode 4)
451 // and include it in the chunk
452 // all of these characters are on the BMP
453 // do not include FF or VT in case they are part of a paragraph
454 // (important for bidi contexts)
455 static const UChar paraEnds
[] = {
456 0xd, 0xa, 0x85, 0x2028, 0x2029
459 iCR
, iLF
, iNL
, iLS
, iPS
, iCount
462 // first, see if there is a CRLF split between prev and s
463 if (prev
.endsWith(paraEnds
+ iCR
, 1)) {
464 if (s
.startsWith(paraEnds
+ iLF
, 1)) {
465 return 1; // split CRLF, include the LF
466 } else if (!s
.isEmpty()) {
467 return 0; // complete the last chunk
469 return -1; // wait for actual further contents to arrive
473 const UChar
*u
= s
.getBuffer(), *limit
= u
+ s
.length();
479 ((c
< uSP
) && (c
== uCR
|| c
== uLF
)) ||
486 return -1; // LF may be in the next chunk
487 } else if (*u
== uLF
) {
488 ++u
; // include the LF in this chunk
491 return (int32_t)(u
- s
.getBuffer());
495 return -1; // continue collecting the chunk
499 CNV_NO_FEFF
, // cannot convert the U+FEFF Unicode signature character (BOM)
500 CNV_WITH_FEFF
, // can convert the U+FEFF signature character
501 CNV_ADDS_FEFF
// automatically adds/detects the U+FEFF signature character
505 nibbleToHex(uint8_t n
) {
510 (UChar
)((0x61 - 10) + n
);
513 // check the converter's Unicode signature properties;
514 // the fromUnicode side of the converter must be in its initial state
515 // and will be reset again if it was used
517 cnvSigType(UConverter
*cnv
) {
521 // test if the output charset can convert U+FEFF
522 USet
*set
= uset_open(1, 0);
524 ucnv_getUnicodeSet(cnv
, set
, UCNV_ROUNDTRIP_SET
, &err
);
525 if (U_SUCCESS(err
) && uset_contains(set
, uSig
)) {
526 result
= CNV_WITH_FEFF
;
528 result
= CNV_NO_FEFF
; // an error occurred or U+FEFF cannot be converted
532 if (result
== CNV_WITH_FEFF
) {
533 // test if the output charset emits a signature anyway
534 const UChar a
[1] = { 0x61 }; // "a"
543 ucnv_fromUnicode(cnv
,
544 &out
, buffer
+ sizeof(buffer
),
547 ucnv_resetFromUnicode(cnv
);
549 if (NULL
!= ucnv_detectUnicodeSignature(buffer
, (int32_t)(out
- buffer
), NULL
, &err
) &&
552 result
= CNV_ADDS_FEFF
;
562 buf(NULL
), outbuf(NULL
), fromoffsets(NULL
),
563 bufsz(0), signature(0) {}
566 setBufferSize(size_t bufferSize
) {
569 buf
= new char[2 * bufsz
];
570 outbuf
= buf
+ bufsz
;
572 // +1 for an added U+FEFF in the intermediate Unicode buffer
573 fromoffsets
= new int32_t[bufsz
+ 1];
578 delete [] fromoffsets
;
581 UBool
convertFile(const char *pname
,
582 const char *fromcpage
,
583 UConverterToUCallback toucallback
,
586 UConverterFromUCallback fromucallback
,
587 const void *fromuctxt
,
589 const char *translit
,
590 const char *infilestr
,
591 FILE * outfile
, int verbose
);
593 friend int main(int argc
, char **argv
);
596 int32_t *fromoffsets
;
599 int8_t signature
; // add (1) or remove (-1) a U+FEFF Unicode signature character
602 // Convert a file from one encoding to another
604 ConvertFile::convertFile(const char *pname
,
605 const char *fromcpage
,
606 UConverterToUCallback toucallback
,
609 UConverterFromUCallback fromucallback
,
610 const void *fromuctxt
,
612 const char *translit
,
613 const char *infilestr
,
614 FILE * outfile
, int verbose
)
618 UConverter
*convfrom
= 0;
619 UConverter
*convto
= 0;
620 UErrorCode err
= U_ZERO_ERROR
;
622 const char *cbufp
, *prevbufp
;
625 uint32_t infoffset
= 0, outfoffset
= 0; /* Where we are in the file, for error reporting. */
627 const UChar
*unibuf
, *unibufbp
;
632 #if !UCONFIG_NO_TRANSLITERATION
633 Transliterator
*t
= 0; // Transliterator acting on Unicode data.
634 UnicodeString chunk
; // One chunk of the text being collected for transformation.
636 UnicodeString u
; // String to do the transliteration.
639 // use conversion offsets for error messages
640 // unless a transliterator is used -
641 // a text transformation will reorder characters in unpredictable ways
642 UBool useOffsets
= TRUE
;
644 // Open the correct input file or connect to stdin for reading input
646 if (infilestr
!= 0 && strcmp(infilestr
, "-")) {
647 infile
= fopen(infilestr
, "rb");
649 UnicodeString
str1(infilestr
, "");
650 str1
.append((UChar32
) 0);
651 UnicodeString
str2(strerror(errno
), "");
652 str2
.append((UChar32
) 0);
654 u_wmsg(stderr
, "cantOpenInputF", str1
.getBuffer(), str2
.getBuffer());
660 #ifdef USE_FILENO_BINARY_MODE
661 if (setmode(fileno(stdin
), O_BINARY
) == -1) {
663 u_wmsg(stderr
, "cantSetInBinMode");
670 fprintf(stderr
, "%s:\n", infilestr
);
673 #if !UCONFIG_NO_TRANSLITERATION
674 // Create transliterator as needed.
676 if (translit
!= NULL
&& *translit
) {
678 UnicodeString
str(translit
), pestr
;
680 /* Create from rules or by ID as needed. */
684 if (uprv_strchr(translit
, ':') || uprv_strchr(translit
, '>') || uprv_strchr(translit
, '<') || uprv_strchr(translit
, '>')) {
685 t
= Transliterator::createFromRules("Uconv", str
, UTRANS_FORWARD
, parse
, err
);
687 t
= Transliterator::createInstance(translit
, UTRANS_FORWARD
, err
);
690 if (U_FAILURE(err
)) {
691 str
.append((UChar32
) 0);
694 if (parse
.line
>= 0) {
695 UChar linebuf
[20], offsetbuf
[20];
696 uprv_itou(linebuf
, 20, parse
.line
, 10, 0);
697 uprv_itou(offsetbuf
, 20, parse
.offset
, 10, 0);
698 u_wmsg(stderr
, "cantCreateTranslitParseErr", str
.getTerminatedBuffer(),
699 u_wmsg_errorName(err
), linebuf
, offsetbuf
);
701 u_wmsg(stderr
, "cantCreateTranslit", str
.getTerminatedBuffer(),
702 u_wmsg_errorName(err
));
716 // Create codepage converter. If the codepage or its aliases weren't
717 // available, it returns NULL and a failure code. We also set the
718 // callbacks, and return errors in the same way.
720 convfrom
= ucnv_open(fromcpage
, &err
);
721 if (U_FAILURE(err
)) {
722 UnicodeString
str(fromcpage
, "");
724 u_wmsg(stderr
, "cantOpenFromCodeset", str
.getTerminatedBuffer(),
725 u_wmsg_errorName(err
));
728 ucnv_setToUCallBack(convfrom
, toucallback
, touctxt
, 0, 0, &err
);
729 if (U_FAILURE(err
)) {
731 u_wmsg(stderr
, "cantSetCallback", u_wmsg_errorName(err
));
735 convto
= ucnv_open(tocpage
, &err
);
736 if (U_FAILURE(err
)) {
737 UnicodeString
str(tocpage
, "");
739 u_wmsg(stderr
, "cantOpenToCodeset", str
.getTerminatedBuffer(),
740 u_wmsg_errorName(err
));
743 ucnv_setFromUCallBack(convto
, fromucallback
, fromuctxt
, 0, 0, &err
);
744 if (U_FAILURE(err
)) {
746 u_wmsg(stderr
, "cantSetCallback", u_wmsg_errorName(err
));
749 ucnv_setFallback(convto
, fallback
);
751 UBool willexit
, fromSawEndOfBytes
, toSawEndOfUnicode
;
754 // OK, we can convert now.
761 // input file offset at the beginning of the next buffer
764 rd
= fread(buf
, 1, bufsz
, infile
);
765 if (ferror(infile
) != 0) {
766 UnicodeString
str(strerror(errno
));
768 u_wmsg(stderr
, "cantRead", str
.getTerminatedBuffer());
772 // Convert the read buffer into the new encoding via Unicode.
773 // After the call 'unibufp' will be placed behind the last
774 // character that was converted in the 'unibuf'.
775 // Also the 'cbufp' is positioned behind the last converted
777 // At the last conversion in the file, flush should be set to
778 // true so that we get all characters converted.
780 // The converter must be flushed at the end of conversion so
781 // that characters on hold also will be written.
784 flush
= (UBool
)(rd
!= bufsz
);
786 // convert until the input is consumed
788 // remember the start of the current byte-to-Unicode conversion
791 unibuf
= unibufp
= u
.getBuffer((int32_t)bufsz
);
793 // Use bufsz instead of u.getCapacity() for the targetLimit
794 // so that we don't overflow fromoffsets[].
795 ucnv_toUnicode(convfrom
, &unibufp
, unibuf
+ bufsz
, &cbufp
,
796 buf
+ rd
, useOffsets
? fromoffsets
: NULL
, flush
, &err
);
798 ulen
= (int32_t)(unibufp
- unibuf
);
799 u
.releaseBuffer(U_SUCCESS(err
) ? ulen
: 0);
801 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
802 // converting all of the input bytes.
803 // It works like this because ucnv_toUnicode() returns only under the
804 // following conditions:
805 // - an error occurred during conversion (an error code is set)
806 // - the target buffer is filled (the error code indicates an overflow)
807 // - the source is consumed
808 // That is, if the error code does not indicate a failure,
809 // not even an overflow, then the source must be consumed entirely.
810 fromSawEndOfBytes
= (UBool
)U_SUCCESS(err
);
812 if (err
== U_BUFFER_OVERFLOW_ERROR
) {
814 } else if (U_FAILURE(err
)) {
815 char pos
[32], errorBytes
[32];
816 int8_t i
, length
, errorLength
;
818 UErrorCode localError
= U_ZERO_ERROR
;
819 errorLength
= (int8_t)sizeof(errorBytes
);
820 ucnv_getInvalidChars(convfrom
, errorBytes
, &errorLength
, &localError
);
821 if (U_FAILURE(localError
) || errorLength
== 0) {
825 // print the input file offset of the start of the error bytes:
826 // input file offset of the current byte buffer +
827 // length of the just consumed bytes -
828 // length of the error bytes
830 (int8_t)sprintf(pos
, "%d",
831 (int)(infoffset
+ (cbufp
- buf
) - errorLength
));
833 // output the bytes that caused the error
835 for (i
= 0; i
< errorLength
; ++i
) {
837 str
.append((UChar
)uSP
);
839 str
.append(nibbleToHex((uint8_t)errorBytes
[i
] >> 4));
840 str
.append(nibbleToHex((uint8_t)errorBytes
[i
]));
844 u_wmsg(stderr
, "problemCvtToU",
845 UnicodeString(pos
, length
, "").getTerminatedBuffer(),
846 str
.getTerminatedBuffer(),
847 u_wmsg_errorName(err
));
850 err
= U_ZERO_ERROR
; /* reset the error for the rest of the conversion. */
853 // Replaced a check for whether the input was consumed by
854 // looping until it is; message key "premEndInput" now obsolete.
860 // remove a U+FEFF Unicode signature character if requested
862 if (u
.charAt(0) == uSig
) {
865 // account for the removed UChar and offset
869 // remove an offset from fromoffsets[] as well
870 // to keep the array parallel with the UChars
871 memmove(fromoffsets
, fromoffsets
+ 1, ulen
* 4);
878 #if !UCONFIG_NO_TRANSLITERATION
879 // Transliterate/transform if needed.
881 // For transformation, we use chunking code -
882 // collect Unicode input until, for example, an end-of-line,
883 // then transform and output-convert that and continue collecting.
884 // This makes the transformation result independent of the buffer size
885 // while avoiding the slower keyboard mode.
886 // The end-of-chunk characters are completely included in the
887 // transformed string in case they are to be transformed themselves.
893 chunkLimit
= getChunkLimit(chunk
, u
);
894 if (chunkLimit
< 0 && flush
&& fromSawEndOfBytes
) {
895 // use all of the rest at the end of the text
896 chunkLimit
= u
.length();
898 if (chunkLimit
>= 0) {
899 // complete the chunk and transform it
900 chunk
.append(u
, 0, chunkLimit
);
901 u
.remove(0, chunkLimit
);
902 t
->transliterate(chunk
);
904 // append the transformation result to the result and empty the chunk
908 // continue collecting the chunk
912 } while (!u
.isEmpty());
919 // add a U+FEFF Unicode signature character if requested
920 // and possible/necessary
922 if (u
.charAt(0) != uSig
&& cnvSigType(convto
) == CNV_WITH_FEFF
) {
923 u
.insert(0, (UChar
)uSig
);
926 // insert a pseudo-offset into fromoffsets[] as well
927 // to keep the array parallel with the UChars
928 memmove(fromoffsets
+ 1, fromoffsets
, ulen
* 4);
932 // account for the additional UChar and offset
938 // Convert the Unicode buffer into the destination codepage.
939 // Again, 'bufp' will be placed behind the last converted character,
940 // and 'unibufbp' will be placed behind the last converted Unicode character.
941 // At the last conversion, flush should be set to true to ensure that
942 // all characters left get converted.
944 unibuf
= unibufbp
= u
.getBuffer();
949 // Use fromSawEndOfBytes in addition to the flush flag -
950 // it indicates whether the intermediate Unicode string
951 // contains the very last UChars for the very last input bytes.
952 ucnv_fromUnicode(convto
, &bufp
, outbuf
+ bufsz
,
955 NULL
, (UBool
)(flush
&& fromSawEndOfBytes
), &err
);
957 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
958 // converting all of the intermediate UChars.
959 // See comment for fromSawEndOfBytes.
960 toSawEndOfUnicode
= (UBool
)U_SUCCESS(err
);
962 if (err
== U_BUFFER_OVERFLOW_ERROR
) {
964 } else if (U_FAILURE(err
)) {
965 UChar errorUChars
[4];
969 int8_t i
, length
, errorLength
;
971 UErrorCode localError
= U_ZERO_ERROR
;
972 errorLength
= (int8_t)LENGTHOF(errorUChars
);
973 ucnv_getInvalidUChars(convto
, errorUChars
, &errorLength
, &localError
);
974 if (U_FAILURE(localError
) || errorLength
== 0) {
975 // need at least 1 so that we don't access beyond the length of fromoffsets[]
982 // Unicode buffer offset of the start of the error UChars
983 ferroffset
= (int32_t)((unibufbp
- unibuf
) - errorLength
);
984 if (ferroffset
< 0) {
985 // approximation - the character started in the previous Unicode buffer
989 // get the corresponding byte offset out of fromoffsets[]
990 // go back if the offset is not known for some of the UChars
993 fromoffset
= fromoffsets
[ferroffset
];
994 } while (fromoffset
< 0 && --ferroffset
>= 0);
996 // total input file offset =
997 // input file offset of the current byte buffer +
998 // byte buffer offset of where the current Unicode buffer is converted from +
999 // fromoffsets[Unicode offset]
1000 ferroffset
= infoffset
+ (prevbufp
- buf
) + fromoffset
;
1001 errtag
= "problemCvtFromU";
1003 // Do not use fromoffsets if (t != NULL) because the Unicode text may
1004 // be different from what the offsets refer to.
1006 // output file offset
1007 ferroffset
= (int32_t)(outfoffset
+ (bufp
- outbuf
));
1008 errtag
= "problemCvtFromUOut";
1011 length
= (int8_t)sprintf(pos
, "%u", (int)ferroffset
);
1013 // output the code points that caused the error
1015 for (i
= 0; i
< errorLength
;) {
1017 str
.append((UChar
)uSP
);
1019 U16_NEXT(errorUChars
, i
, errorLength
, c
);
1020 if (c
>= 0x100000) {
1021 str
.append(nibbleToHex((uint8_t)(c
>> 20)));
1024 str
.append(nibbleToHex((uint8_t)(c
>> 16)));
1026 str
.append(nibbleToHex((uint8_t)(c
>> 12)));
1027 str
.append(nibbleToHex((uint8_t)(c
>> 8)));
1028 str
.append(nibbleToHex((uint8_t)(c
>> 4)));
1029 str
.append(nibbleToHex((uint8_t)c
));
1033 u_wmsg(stderr
, errtag
,
1034 UnicodeString(pos
, length
, "").getTerminatedBuffer(),
1035 str
.getTerminatedBuffer(),
1036 u_wmsg_errorName(err
));
1037 u_wmsg(stderr
, "errorUnicode", str
.getTerminatedBuffer());
1040 err
= U_ZERO_ERROR
; /* reset the error for the rest of the conversion. */
1043 // Replaced a check for whether the intermediate Unicode characters were all consumed by
1044 // looping until they are; message key "premEnd" now obsolete.
1046 // Finally, write the converted buffer to the output file
1047 size_t outlen
= (size_t) (bufp
- outbuf
);
1048 outfoffset
+= (int32_t)(wr
= fwrite(outbuf
, 1, outlen
, outfile
));
1050 UnicodeString
str(strerror(errno
));
1052 u_wmsg(stderr
, "cantWrite", str
.getTerminatedBuffer());
1059 } while (!toSawEndOfUnicode
);
1060 } while (!fromSawEndOfBytes
);
1061 } while (!flush
); // Stop when we have flushed the
1062 // converters (this means that it's
1063 // the end of output)
1073 ucnv_close(convfrom
);
1076 #if !UCONFIG_NO_TRANSLITERATION
1080 if (infile
!= stdin
) {
1087 static void usage(const char *pname
, int ecode
) {
1090 UErrorCode err
= U_ZERO_ERROR
;
1091 FILE *fp
= ecode
? stderr
: stdout
;
1096 ures_getStringByKey(gBundle
, ecode
? "lcUsageWord" : "ucUsageWord",
1098 UnicodeString
upname(pname
, (int32_t)(uprv_strlen(pname
) + 1));
1099 UnicodeString
mname(msg
, msgLen
+ 1);
1101 res
= u_wmsg(fp
, "usage", mname
.getBuffer(), upname
.getBuffer());
1106 if (!u_wmsg(fp
, "help")) {
1107 /* Now dump callbacks and finish. */
1110 sizeof(transcode_callbacks
) / sizeof(*transcode_callbacks
);
1111 for (i
= 0; i
< count
; ++i
) {
1112 fprintf(fp
, " %s", transcode_callbacks
[i
].name
);
1122 main(int argc
, char **argv
)
1127 size_t bufsz
= DEFAULT_BUFSZ
;
1129 const char *fromcpage
= 0;
1130 const char *tocpage
= 0;
1131 const char *translit
= 0;
1132 const char *outfilestr
= 0;
1133 UBool fallback
= FALSE
;
1135 UConverterFromUCallback fromucallback
= UCNV_FROM_U_CALLBACK_STOP
;
1136 const void *fromuctxt
= 0;
1137 UConverterToUCallback toucallback
= UCNV_TO_U_CALLBACK_STOP
;
1138 const void *touctxt
= 0;
1140 char **iter
, **remainArgv
, **remainArgvLimit
;
1141 char **end
= argv
+ argc
;
1145 UBool printConvs
= FALSE
, printCanon
= FALSE
, printTranslits
= FALSE
;
1146 const char *printName
= 0;
1148 UBool verbose
= FALSE
;
1149 UErrorCode status
= U_ZERO_ERROR
;
1153 /* Initialize ICU */
1155 if (U_FAILURE(status
)) {
1156 fprintf(stderr
, "%s: can not initialize ICU. status = %s\n",
1157 argv
[0], u_errorName(status
));
1161 // Get and prettify pname.
1162 pname
= uprv_strrchr(*argv
, U_FILE_SEP_CHAR
);
1165 pname
= uprv_strrchr(*argv
, '/');
1174 // First, get the arguments from command-line
1175 // to know the codepages to convert between
1177 remainArgv
= remainArgvLimit
= argv
+ 1;
1178 for (iter
= argv
+ 1; iter
!= end
; iter
++) {
1179 // Check for from charset
1180 if (strcmp("-f", *iter
) == 0 || !strcmp("--from-code", *iter
)) {
1186 } else if (strcmp("-t", *iter
) == 0 || !strcmp("--to-code", *iter
)) {
1192 } else if (strcmp("-x", *iter
) == 0) {
1198 } else if (!strcmp("--fallback", *iter
)) {
1200 } else if (!strcmp("--no-fallback", *iter
)) {
1202 } else if (strcmp("-b", *iter
) == 0 || !strcmp("--block-size", *iter
)) {
1205 bufsz
= atoi(*iter
);
1206 if ((int) bufsz
<= 0) {
1208 UnicodeString
str(*iter
);
1210 u_wmsg(stderr
, "badBlockSize", str
.getTerminatedBuffer());
1216 } else if (strcmp("-l", *iter
) == 0 || !strcmp("--list", *iter
)) {
1217 if (printTranslits
) {
1221 } else if (strcmp("--default-code", *iter
) == 0) {
1222 if (printTranslits
) {
1225 printName
= ucnv_getDefaultName();
1226 } else if (strcmp("--list-code", *iter
) == 0) {
1227 if (printTranslits
) {
1233 UErrorCode e
= U_ZERO_ERROR
;
1234 printName
= ucnv_getAlias(*iter
, 0, &e
);
1235 if (U_FAILURE(e
) || !printName
) {
1236 UnicodeString
str(*iter
);
1238 u_wmsg(stderr
, "noSuchCodeset", str
.getTerminatedBuffer());
1243 } else if (strcmp("--canon", *iter
) == 0) {
1245 } else if (strcmp("-L", *iter
) == 0
1246 || !strcmp("--list-transliterators", *iter
)) {
1250 printTranslits
= TRUE
;
1251 } else if (strcmp("-h", *iter
) == 0 || !strcmp("-?", *iter
)
1252 || !strcmp("--help", *iter
)) {
1254 } else if (!strcmp("-c", *iter
)) {
1255 fromucallback
= UCNV_FROM_U_CALLBACK_SKIP
;
1256 } else if (!strcmp("--to-callback", *iter
)) {
1259 const struct callback_ent
*cbe
= findCallback(*iter
);
1261 fromucallback
= cbe
->fromu
;
1262 fromuctxt
= cbe
->fromuctxt
;
1264 UnicodeString
str(*iter
);
1266 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1272 } else if (!strcmp("--from-callback", *iter
)) {
1275 const struct callback_ent
*cbe
= findCallback(*iter
);
1277 toucallback
= cbe
->tou
;
1278 touctxt
= cbe
->touctxt
;
1280 UnicodeString
str(*iter
);
1282 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1288 } else if (!strcmp("-i", *iter
)) {
1289 toucallback
= UCNV_TO_U_CALLBACK_SKIP
;
1290 } else if (!strcmp("--callback", *iter
)) {
1293 const struct callback_ent
*cbe
= findCallback(*iter
);
1295 fromucallback
= cbe
->fromu
;
1296 fromuctxt
= cbe
->fromuctxt
;
1297 toucallback
= cbe
->tou
;
1298 touctxt
= cbe
->touctxt
;
1300 UnicodeString
str(*iter
);
1302 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1308 } else if (!strcmp("-s", *iter
) || !strcmp("--silent", *iter
)) {
1310 } else if (!strcmp("-v", *iter
) || !strcmp("--verbose", *iter
)) {
1312 } else if (!strcmp("-V", *iter
) || !strcmp("--version", *iter
)) {
1313 printf("%s v2.1 ICU " U_ICU_VERSION
"\n", pname
);
1315 } else if (!strcmp("-o", *iter
) || !strcmp("--output", *iter
)) {
1317 if (iter
!= end
&& !outfilestr
) {
1322 } else if (0 == strcmp("--add-signature", *iter
)) {
1324 } else if (0 == strcmp("--remove-signature", *iter
)) {
1326 } else if (**iter
== '-' && (*iter
)[1]) {
1329 // move a non-option up in argv[]
1330 *remainArgvLimit
++ = *iter
;
1334 if (printConvs
|| printName
) {
1335 return printConverters(pname
, printName
, printCanon
) ? 2 : 0;
1336 } else if (printTranslits
) {
1337 return printTransliterators(printCanon
) ? 3 : 0;
1340 if (!fromcpage
|| !uprv_strcmp(fromcpage
, "-")) {
1341 fromcpage
= ucnv_getDefaultName();
1343 if (!tocpage
|| !uprv_strcmp(tocpage
, "-")) {
1344 tocpage
= ucnv_getDefaultName();
1347 // Open the correct output file or connect to stdout for writing output
1348 if (outfilestr
!= 0 && strcmp(outfilestr
, "-")) {
1349 outfile
= fopen(outfilestr
, "wb");
1351 UnicodeString
str1(outfilestr
, "");
1352 UnicodeString
str2(strerror(errno
), "");
1354 u_wmsg(stderr
, "cantCreateOutputF",
1355 str1
.getBuffer(), str2
.getBuffer());
1361 #ifdef USE_FILENO_BINARY_MODE
1362 if (setmode(fileno(outfile
), O_BINARY
) == -1) {
1363 u_wmsg(stderr
, "cantSetOutBinMode");
1369 /* Loop again on the arguments to find all the input files, and
1372 cf
.setBufferSize(bufsz
);
1374 if(remainArgv
< remainArgvLimit
) {
1375 for (iter
= remainArgv
; iter
!= remainArgvLimit
; iter
++) {
1376 if (!cf
.convertFile(
1377 pname
, fromcpage
, toucallback
, touctxt
, tocpage
,
1378 fromucallback
, fromuctxt
, fallback
, translit
, *iter
,
1385 if (!cf
.convertFile(
1386 pname
, fromcpage
, toucallback
, touctxt
, tocpage
,
1387 fromucallback
, fromuctxt
, fallback
, translit
, 0,
1399 if (outfile
!= stdout
) {
1408 * Hey, Emacs, please set the following:
1411 * indent-tabs-mode: nil