1 /*****************************************************************************
3 * Copyright (C) 1999-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
6 ******************************************************************************/
9 * uconv(1): an iconv(1)-like converter using ICU.
11 * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se>
12 * contributed in 1999.
14 * Conversion to the C conversion API and many improvements by
15 * Yves Arrouye <yves@realnames.com>, current maintainer.
17 * Markus Scherer maintainer from 2003.
18 * See source code repository history for changes.
21 #include <unicode/utypes.h>
22 #include <unicode/putil.h>
23 #include <unicode/ucnv.h>
24 #include <unicode/uenum.h>
25 #include <unicode/unistr.h>
26 #include <unicode/translit.h>
27 #include <unicode/uset.h>
28 #include <unicode/uclean.h>
29 #include <unicode/utf16.h>
40 #include "unicode/uwmsg.h"
44 #if U_PLATFORM_USES_ONLY_WIN32_API && !defined(__STRICT_ANSI__)
47 #if U_PLATFORM_USES_ONLY_WIN32_API
48 #define USE_FILENO_BINARY_MODE 1
49 /* Windows likes to rename Unix-like functions */
51 #define fileno _fileno
54 #define setmode _setmode
57 #define O_BINARY _O_BINARY
63 /* below from the README */
64 #include "unicode/utypes.h"
65 #include "unicode/udata.h"
66 U_CFUNC
char uconvmsg_dat
[];
69 #define DEFAULT_BUFSZ 4096
70 #define UCONVMSG "uconvmsg"
72 static UResourceBundle
*gBundle
= 0; /* Bundle containing messages. */
75 * Initialize the message bundle so that message strings can be fetched
80 static void initMsg(const char *pname
) {
84 char dataPath
[2048]; /* XXX Sloppy: should be PATH_MAX. */
85 UErrorCode err
= U_ZERO_ERROR
;
89 /* Set up our static data - if any */
90 #if defined(UCONVMSG_LINK) && U_PLATFORM != U_PF_OS390 /* On z/OS, this is failing. */
91 udata_setAppData(UCONVMSG
, (const void*) uconvmsg_dat
, &err
);
93 fprintf(stderr
, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
94 pname
, u_errorName(err
));
95 err
= U_ZERO_ERROR
; /* It may still fail */
100 gBundle
= u_wmsg_setPath(UCONVMSG
, &err
);
101 if (U_FAILURE(err
)) {
103 "%s: warning: couldn't open bundle %s: %s\n",
104 pname
, UCONVMSG
, u_errorName(err
));
107 "%s: setAppData was called, internal data %s failed to load\n",
112 /* that was try #1, try again with a path */
113 uprv_strcpy(dataPath
, u_getDataDirectory());
114 uprv_strcat(dataPath
, U_FILE_SEP_STRING
);
115 uprv_strcat(dataPath
, UCONVMSG
);
117 gBundle
= u_wmsg_setPath(dataPath
, &err
);
118 if (U_FAILURE(err
)) {
120 "%s: warning: still couldn't open bundle %s: %s\n",
121 pname
, dataPath
, u_errorName(err
));
122 fprintf(stderr
, "%s: warning: messages will not be displayed\n", pname
);
128 /* Mapping of callback names to the callbacks passed to the converter
131 static struct callback_ent
{
133 UConverterFromUCallback fromu
;
134 const void *fromuctxt
;
135 UConverterToUCallback tou
;
137 } transcode_callbacks
[] = {
139 UCNV_FROM_U_CALLBACK_SUBSTITUTE
, 0,
140 UCNV_TO_U_CALLBACK_SUBSTITUTE
, 0 },
142 UCNV_FROM_U_CALLBACK_SKIP
, 0,
143 UCNV_TO_U_CALLBACK_SKIP
, 0 },
145 UCNV_FROM_U_CALLBACK_STOP
, 0,
146 UCNV_TO_U_CALLBACK_STOP
, 0 },
148 UCNV_FROM_U_CALLBACK_ESCAPE
, 0,
149 UCNV_TO_U_CALLBACK_ESCAPE
, 0},
151 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_ICU
,
152 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_ICU
},
154 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_JAVA
,
155 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_JAVA
},
157 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_C
,
158 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_C
},
160 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
,
161 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
},
163 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
,
164 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
},
166 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_DEC
,
167 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_DEC
},
168 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_UNICODE
,
169 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_UNICODE
}
172 /* Return a pointer to a callback record given its name. */
174 static const struct callback_ent
*findCallback(const char *name
) {
176 UPRV_LENGTHOF(transcode_callbacks
);
178 /* We'll do a linear search, there aren't many of them and bsearch()
179 may not be that portable. */
181 for (i
= 0; i
< count
; ++i
) {
182 if (!uprv_stricmp(name
, transcode_callbacks
[i
].name
)) {
183 return &transcode_callbacks
[i
];
190 /* Print converter information. If lookfor is set, only that converter will
191 be printed; otherwise all converters will be printed. If canon is
192 nonzero, tags and aliases for each converter are printed too, in the format
193 expected for convrtrs.txt(5). */
195 static int printConverters(const char *pname
, const char *lookfor
,
198 UErrorCode err
= U_ZERO_ERROR
;
203 /* If there is a specified name, just handle that now. */
207 printf("%s\n", lookfor
);
210 /* Because we are printing a canonical name, we need the
211 true converter name. We've done that already except for
212 the default name (because we want to print the exact
213 name one would get when calling ucnv_getDefaultName()
214 in non-canon mode). But since we do not know at this
215 point if we have the default name or something else, we
216 need to normalize again to the canonical converter
219 const char *truename
= ucnv_getAlias(lookfor
, 0, &err
);
220 if (U_SUCCESS(err
)) {
228 /* Print converter names. We come here for one of two reasons: we
229 are printing all the names (lookfor was null), or we have a
230 single converter to print but in canon mode, hence we need to
231 get to it in order to print everything. */
233 num
= ucnv_countAvailable();
236 u_wmsg(stderr
, "cantGetNames");
240 num
= 1; /* We know where we want to be. */
243 num_stds
= ucnv_countStandards();
244 stds
= (const char **) uprv_malloc(num_stds
* sizeof(*stds
));
246 u_wmsg(stderr
, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR
));
254 for (s
= 0; s
< num_stds
; ++s
) {
255 stds
[s
] = ucnv_getStandard(s
, &err
);
257 printf("%s ", stds
[s
]);
259 if (U_FAILURE(err
)) {
260 u_wmsg(stderr
, "cantGetTag", u_wmsg_errorName(err
));
269 for (int32_t i
= 0; i
< num
; i
++) {
271 uint16_t num_aliases
;
273 /* Set the name either to what we are looking for, or
274 to the current converter name. */
279 name
= ucnv_getAvailableName(i
);
282 /* Get all the aliases associated to the name. */
285 num_aliases
= ucnv_countAliases(name
, &err
);
286 if (U_FAILURE(err
)) {
289 UnicodeString
str(name
, "");
291 u_wmsg(stderr
, "cantGetAliases", str
.getTerminatedBuffer(),
292 u_wmsg_errorName(err
));
297 /* Write all the aliases and their tags. */
299 for (a
= 0; a
< num_aliases
; ++a
) {
300 const char *alias
= ucnv_getAlias(name
, a
, &err
);
302 if (U_FAILURE(err
)) {
303 UnicodeString
str(name
, "");
305 u_wmsg(stderr
, "cantGetAliases", str
.getTerminatedBuffer(),
306 u_wmsg_errorName(err
));
310 /* Print the current alias so that it looks right. */
311 printf("%s%s%s", (canon
? (a
== 0? "" : "\t" ) : "") ,
315 /* Look (slowly, linear searching) for a tag. */
318 /* -1 to skip the last standard */
319 for (s
= t
= 0; s
< num_stds
-1; ++s
) {
320 UEnumeration
*nameEnum
= ucnv_openStandardNames(name
, stds
[s
], &err
);
321 if (U_SUCCESS(err
)) {
322 /* List the standard tags */
323 const char *standardName
;
324 UBool isFirst
= TRUE
;
325 UErrorCode enumError
= U_ZERO_ERROR
;
326 while ((standardName
= uenum_next(nameEnum
, NULL
, &enumError
))) {
327 /* See if this alias is supported by this standard. */
328 if (!strcmp(standardName
, alias
)) {
333 /* Print a * after the default standard name */
334 printf(" %s%s", stds
[s
], (isFirst
? "*" : ""));
344 /* Terminate this entry. */
351 /* Terminate this entry. */
358 /* Free temporary data. */
370 /* Print all available transliterators. If canon is nonzero, print
371 one transliterator per line. */
373 static int printTransliterators(UBool canon
)
375 #if UCONFIG_NO_TRANSLITERATION
376 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
379 UErrorCode status
= U_ZERO_ERROR
;
380 UEnumeration
*ids
= utrans_openIDs(&status
);
381 int32_t i
, numtrans
= uenum_count(ids
, &status
);
383 char sepchar
= canon
? '\n' : ' ';
385 for (i
= 0; U_SUCCESS(status
)&& (i
< numtrans
); ++i
) {
387 const char *nextTrans
= uenum_next(ids
, &len
, &status
);
389 printf("%s", nextTrans
);
390 if (i
< numtrans
- 1) {
397 /* Add a terminating newline if needed. */
399 if (sepchar
!= '\n') {
411 uCR
= 0xd, // carriage return
412 uLF
= 0xa, // line feed
413 uNL
= 0x85, // newline
414 uLS
= 0x2028, // line separator
415 uPS
= 0x2029, // paragraph separator
416 uSig
= 0xfeff // signature/BOM character
419 static inline int32_t
420 getChunkLimit(const UnicodeString
&prev
, const UnicodeString
&s
) {
422 // CR, LF, CRLF, NL, LS, PS
423 // for paragraph ends (see UAX #13/Unicode 4)
424 // and include it in the chunk
425 // all of these characters are on the BMP
426 // do not include FF or VT in case they are part of a paragraph
427 // (important for bidi contexts)
428 static const UChar paraEnds
[] = {
429 0xd, 0xa, 0x85, 0x2028, 0x2029
432 iCR
, iLF
, iNL
, iLS
, iPS
, iCount
435 // first, see if there is a CRLF split between prev and s
436 if (prev
.endsWith(paraEnds
+ iCR
, 1)) {
437 if (s
.startsWith(paraEnds
+ iLF
, 1)) {
438 return 1; // split CRLF, include the LF
439 } else if (!s
.isEmpty()) {
440 return 0; // complete the last chunk
442 return -1; // wait for actual further contents to arrive
446 const UChar
*u
= s
.getBuffer(), *limit
= u
+ s
.length();
452 ((c
< uSP
) && (c
== uCR
|| c
== uLF
)) ||
459 return -1; // LF may be in the next chunk
460 } else if (*u
== uLF
) {
461 ++u
; // include the LF in this chunk
464 return (int32_t)(u
- s
.getBuffer());
468 return -1; // continue collecting the chunk
472 CNV_NO_FEFF
, // cannot convert the U+FEFF Unicode signature character (BOM)
473 CNV_WITH_FEFF
, // can convert the U+FEFF signature character
474 CNV_ADDS_FEFF
// automatically adds/detects the U+FEFF signature character
478 nibbleToHex(uint8_t n
) {
483 (UChar
)((0x61 - 10) + n
);
486 // check the converter's Unicode signature properties;
487 // the fromUnicode side of the converter must be in its initial state
488 // and will be reset again if it was used
490 cnvSigType(UConverter
*cnv
) {
494 // test if the output charset can convert U+FEFF
495 USet
*set
= uset_open(1, 0);
497 ucnv_getUnicodeSet(cnv
, set
, UCNV_ROUNDTRIP_SET
, &err
);
498 if (U_SUCCESS(err
) && uset_contains(set
, uSig
)) {
499 result
= CNV_WITH_FEFF
;
501 result
= CNV_NO_FEFF
; // an error occurred or U+FEFF cannot be converted
505 if (result
== CNV_WITH_FEFF
) {
506 // test if the output charset emits a signature anyway
507 const UChar a
[1] = { 0x61 }; // "a"
516 ucnv_fromUnicode(cnv
,
517 &out
, buffer
+ sizeof(buffer
),
520 ucnv_resetFromUnicode(cnv
);
522 if (NULL
!= ucnv_detectUnicodeSignature(buffer
, (int32_t)(out
- buffer
), NULL
, &err
) &&
525 result
= CNV_ADDS_FEFF
;
535 buf(NULL
), outbuf(NULL
), fromoffsets(NULL
),
536 bufsz(0), signature(0) {}
539 setBufferSize(size_t bufferSize
) {
542 buf
= new char[2 * bufsz
];
543 outbuf
= buf
+ bufsz
;
545 // +1 for an added U+FEFF in the intermediate Unicode buffer
546 fromoffsets
= new int32_t[bufsz
+ 1];
551 delete [] fromoffsets
;
554 UBool
convertFile(const char *pname
,
555 const char *fromcpage
,
556 UConverterToUCallback toucallback
,
559 UConverterFromUCallback fromucallback
,
560 const void *fromuctxt
,
562 const char *translit
,
563 const char *infilestr
,
564 FILE * outfile
, int verbose
);
566 friend int main(int argc
, char **argv
);
569 int32_t *fromoffsets
;
572 int8_t signature
; // add (1) or remove (-1) a U+FEFF Unicode signature character
575 // Convert a file from one encoding to another
577 ConvertFile::convertFile(const char *pname
,
578 const char *fromcpage
,
579 UConverterToUCallback toucallback
,
582 UConverterFromUCallback fromucallback
,
583 const void *fromuctxt
,
585 const char *translit
,
586 const char *infilestr
,
587 FILE * outfile
, int verbose
)
591 UConverter
*convfrom
= 0;
592 UConverter
*convto
= 0;
593 UErrorCode err
= U_ZERO_ERROR
;
595 UBool closeFile
= FALSE
;
596 const char *cbufp
, *prevbufp
;
599 uint32_t infoffset
= 0, outfoffset
= 0; /* Where we are in the file, for error reporting. */
601 const UChar
*unibuf
, *unibufbp
;
606 #if !UCONFIG_NO_TRANSLITERATION
607 Transliterator
*t
= 0; // Transliterator acting on Unicode data.
608 UnicodeString chunk
; // One chunk of the text being collected for transformation.
610 UnicodeString u
; // String to do the transliteration.
613 // use conversion offsets for error messages
614 // unless a transliterator is used -
615 // a text transformation will reorder characters in unpredictable ways
616 UBool useOffsets
= TRUE
;
618 // Open the correct input file or connect to stdin for reading input
620 if (infilestr
!= 0 && strcmp(infilestr
, "-")) {
621 infile
= fopen(infilestr
, "rb");
623 UnicodeString
str1(infilestr
, "");
624 str1
.append((UChar32
) 0);
625 UnicodeString
str2(strerror(errno
), "");
626 str2
.append((UChar32
) 0);
628 u_wmsg(stderr
, "cantOpenInputF", str1
.getBuffer(), str2
.getBuffer());
635 #ifdef USE_FILENO_BINARY_MODE
636 if (setmode(fileno(stdin
), O_BINARY
) == -1) {
638 u_wmsg(stderr
, "cantSetInBinMode");
645 fprintf(stderr
, "%s:\n", infilestr
);
648 #if !UCONFIG_NO_TRANSLITERATION
649 // Create transliterator as needed.
651 if (translit
!= NULL
&& *translit
) {
653 UnicodeString
str(translit
), pestr
;
655 /* Create from rules or by ID as needed. */
659 if (uprv_strchr(translit
, ':') || uprv_strchr(translit
, '>') || uprv_strchr(translit
, '<') || uprv_strchr(translit
, '>')) {
660 t
= Transliterator::createFromRules(UNICODE_STRING_SIMPLE("Uconv"), str
, UTRANS_FORWARD
, parse
, err
);
662 t
= Transliterator::createInstance(UnicodeString(translit
, -1, US_INV
), UTRANS_FORWARD
, err
);
665 if (U_FAILURE(err
)) {
666 str
.append((UChar32
) 0);
669 if (parse
.line
>= 0) {
670 UChar linebuf
[20], offsetbuf
[20];
671 uprv_itou(linebuf
, 20, parse
.line
, 10, 0);
672 uprv_itou(offsetbuf
, 20, parse
.offset
, 10, 0);
673 u_wmsg(stderr
, "cantCreateTranslitParseErr", str
.getTerminatedBuffer(),
674 u_wmsg_errorName(err
), linebuf
, offsetbuf
);
676 u_wmsg(stderr
, "cantCreateTranslit", str
.getTerminatedBuffer(),
677 u_wmsg_errorName(err
));
691 // Create codepage converter. If the codepage or its aliases weren't
692 // available, it returns NULL and a failure code. We also set the
693 // callbacks, and return errors in the same way.
695 convfrom
= ucnv_open(fromcpage
, &err
);
696 if (U_FAILURE(err
)) {
697 UnicodeString
str(fromcpage
, "");
699 u_wmsg(stderr
, "cantOpenFromCodeset", str
.getTerminatedBuffer(),
700 u_wmsg_errorName(err
));
703 ucnv_setToUCallBack(convfrom
, toucallback
, touctxt
, 0, 0, &err
);
704 if (U_FAILURE(err
)) {
706 u_wmsg(stderr
, "cantSetCallback", u_wmsg_errorName(err
));
710 convto
= ucnv_open(tocpage
, &err
);
711 if (U_FAILURE(err
)) {
712 UnicodeString
str(tocpage
, "");
714 u_wmsg(stderr
, "cantOpenToCodeset", str
.getTerminatedBuffer(),
715 u_wmsg_errorName(err
));
718 ucnv_setFromUCallBack(convto
, fromucallback
, fromuctxt
, 0, 0, &err
);
719 if (U_FAILURE(err
)) {
721 u_wmsg(stderr
, "cantSetCallback", u_wmsg_errorName(err
));
724 ucnv_setFallback(convto
, fallback
);
726 UBool willexit
, fromSawEndOfBytes
, toSawEndOfUnicode
;
729 // OK, we can convert now.
736 // input file offset at the beginning of the next buffer
739 rd
= fread(buf
, 1, bufsz
, infile
);
740 if (ferror(infile
) != 0) {
741 UnicodeString
str(strerror(errno
));
743 u_wmsg(stderr
, "cantRead", str
.getTerminatedBuffer());
747 // Convert the read buffer into the new encoding via Unicode.
748 // After the call 'unibufp' will be placed behind the last
749 // character that was converted in the 'unibuf'.
750 // Also the 'cbufp' is positioned behind the last converted
752 // At the last conversion in the file, flush should be set to
753 // true so that we get all characters converted.
755 // The converter must be flushed at the end of conversion so
756 // that characters on hold also will be written.
759 flush
= (UBool
)(rd
!= bufsz
);
761 // convert until the input is consumed
763 // remember the start of the current byte-to-Unicode conversion
766 unibuf
= unibufp
= u
.getBuffer((int32_t)bufsz
);
768 // Use bufsz instead of u.getCapacity() for the targetLimit
769 // so that we don't overflow fromoffsets[].
770 ucnv_toUnicode(convfrom
, &unibufp
, unibuf
+ bufsz
, &cbufp
,
771 buf
+ rd
, useOffsets
? fromoffsets
: NULL
, flush
, &err
);
773 ulen
= (int32_t)(unibufp
- unibuf
);
774 u
.releaseBuffer(U_SUCCESS(err
) ? ulen
: 0);
776 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
777 // converting all of the input bytes.
778 // It works like this because ucnv_toUnicode() returns only under the
779 // following conditions:
780 // - an error occurred during conversion (an error code is set)
781 // - the target buffer is filled (the error code indicates an overflow)
782 // - the source is consumed
783 // That is, if the error code does not indicate a failure,
784 // not even an overflow, then the source must be consumed entirely.
785 fromSawEndOfBytes
= (UBool
)U_SUCCESS(err
);
787 if (err
== U_BUFFER_OVERFLOW_ERROR
) {
789 } else if (U_FAILURE(err
)) {
790 char pos
[32], errorBytes
[32];
791 int8_t i
, length
, errorLength
;
793 UErrorCode localError
= U_ZERO_ERROR
;
794 errorLength
= (int8_t)sizeof(errorBytes
);
795 ucnv_getInvalidChars(convfrom
, errorBytes
, &errorLength
, &localError
);
796 if (U_FAILURE(localError
) || errorLength
== 0) {
800 // print the input file offset of the start of the error bytes:
801 // input file offset of the current byte buffer +
802 // length of the just consumed bytes -
803 // length of the error bytes
805 (int8_t)sprintf(pos
, "%d",
806 (int)(infoffset
+ (cbufp
- buf
) - errorLength
));
808 // output the bytes that caused the error
810 for (i
= 0; i
< errorLength
; ++i
) {
812 str
.append((UChar
)uSP
);
814 str
.append(nibbleToHex((uint8_t)errorBytes
[i
] >> 4));
815 str
.append(nibbleToHex((uint8_t)errorBytes
[i
]));
819 u_wmsg(stderr
, "problemCvtToU",
820 UnicodeString(pos
, length
, "").getTerminatedBuffer(),
821 str
.getTerminatedBuffer(),
822 u_wmsg_errorName(err
));
825 err
= U_ZERO_ERROR
; /* reset the error for the rest of the conversion. */
828 // Replaced a check for whether the input was consumed by
829 // looping until it is; message key "premEndInput" now obsolete.
835 // remove a U+FEFF Unicode signature character if requested
837 if (u
.charAt(0) == uSig
) {
840 // account for the removed UChar and offset
844 // remove an offset from fromoffsets[] as well
845 // to keep the array parallel with the UChars
846 memmove(fromoffsets
, fromoffsets
+ 1, ulen
* 4);
853 #if !UCONFIG_NO_TRANSLITERATION
854 // Transliterate/transform if needed.
856 // For transformation, we use chunking code -
857 // collect Unicode input until, for example, an end-of-line,
858 // then transform and output-convert that and continue collecting.
859 // This makes the transformation result independent of the buffer size
860 // while avoiding the slower keyboard mode.
861 // The end-of-chunk characters are completely included in the
862 // transformed string in case they are to be transformed themselves.
868 chunkLimit
= getChunkLimit(chunk
, u
);
869 if (chunkLimit
< 0 && flush
&& fromSawEndOfBytes
) {
870 // use all of the rest at the end of the text
871 chunkLimit
= u
.length();
873 if (chunkLimit
>= 0) {
874 // complete the chunk and transform it
875 chunk
.append(u
, 0, chunkLimit
);
876 u
.remove(0, chunkLimit
);
877 t
->transliterate(chunk
);
879 // append the transformation result to the result and empty the chunk
883 // continue collecting the chunk
887 } while (!u
.isEmpty());
894 // add a U+FEFF Unicode signature character if requested
895 // and possible/necessary
897 if (u
.charAt(0) != uSig
&& cnvSigType(convto
) == CNV_WITH_FEFF
) {
898 u
.insert(0, (UChar
)uSig
);
901 // insert a pseudo-offset into fromoffsets[] as well
902 // to keep the array parallel with the UChars
903 memmove(fromoffsets
+ 1, fromoffsets
, ulen
* 4);
907 // account for the additional UChar and offset
913 // Convert the Unicode buffer into the destination codepage.
914 // Again, 'bufp' will be placed behind the last converted character,
915 // and 'unibufbp' will be placed behind the last converted Unicode character.
916 // At the last conversion, flush should be set to true to ensure that
917 // all remaining characters get converted.
919 unibuf
= unibufbp
= u
.getBuffer();
924 // Use fromSawEndOfBytes in addition to the flush flag -
925 // it indicates whether the intermediate Unicode string
926 // contains the very last UChars for the very last input bytes.
927 ucnv_fromUnicode(convto
, &bufp
, outbuf
+ bufsz
,
930 NULL
, (UBool
)(flush
&& fromSawEndOfBytes
), &err
);
932 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
933 // converting all of the intermediate UChars.
934 // See comment for fromSawEndOfBytes.
935 toSawEndOfUnicode
= (UBool
)U_SUCCESS(err
);
937 if (err
== U_BUFFER_OVERFLOW_ERROR
) {
939 } else if (U_FAILURE(err
)) {
940 UChar errorUChars
[4];
944 int8_t i
, length
, errorLength
;
946 UErrorCode localError
= U_ZERO_ERROR
;
947 errorLength
= UPRV_LENGTHOF(errorUChars
);
948 ucnv_getInvalidUChars(convto
, errorUChars
, &errorLength
, &localError
);
949 if (U_FAILURE(localError
) || errorLength
== 0) {
950 // need at least 1 so that we don't access beyond the length of fromoffsets[]
957 // Unicode buffer offset of the start of the error UChars
958 ferroffset
= (int32_t)((unibufbp
- unibuf
) - errorLength
);
959 if (ferroffset
< 0) {
960 // approximation - the character started in the previous Unicode buffer
964 // get the corresponding byte offset out of fromoffsets[]
965 // go back if the offset is not known for some of the UChars
968 fromoffset
= fromoffsets
[ferroffset
];
969 } while (fromoffset
< 0 && --ferroffset
>= 0);
971 // total input file offset =
972 // input file offset of the current byte buffer +
973 // byte buffer offset of where the current Unicode buffer is converted from +
974 // fromoffsets[Unicode offset]
975 ferroffset
= infoffset
+ (prevbufp
- buf
) + fromoffset
;
976 errtag
= "problemCvtFromU";
978 // Do not use fromoffsets if (t != NULL) because the Unicode text may
979 // be different from what the offsets refer to.
981 // output file offset
982 ferroffset
= (int32_t)(outfoffset
+ (bufp
- outbuf
));
983 errtag
= "problemCvtFromUOut";
986 length
= (int8_t)sprintf(pos
, "%u", (int)ferroffset
);
988 // output the code points that caused the error
990 for (i
= 0; i
< errorLength
;) {
992 str
.append((UChar
)uSP
);
994 U16_NEXT(errorUChars
, i
, errorLength
, c
);
996 str
.append(nibbleToHex((uint8_t)(c
>> 20)));
999 str
.append(nibbleToHex((uint8_t)(c
>> 16)));
1001 str
.append(nibbleToHex((uint8_t)(c
>> 12)));
1002 str
.append(nibbleToHex((uint8_t)(c
>> 8)));
1003 str
.append(nibbleToHex((uint8_t)(c
>> 4)));
1004 str
.append(nibbleToHex((uint8_t)c
));
1008 u_wmsg(stderr
, errtag
,
1009 UnicodeString(pos
, length
, "").getTerminatedBuffer(),
1010 str
.getTerminatedBuffer(),
1011 u_wmsg_errorName(err
));
1012 u_wmsg(stderr
, "errorUnicode", str
.getTerminatedBuffer());
1015 err
= U_ZERO_ERROR
; /* reset the error for the rest of the conversion. */
1018 // Replaced a check for whether the intermediate Unicode characters were all consumed by
1019 // looping until they are; message key "premEnd" now obsolete.
1021 // Finally, write the converted buffer to the output file
1022 size_t outlen
= (size_t) (bufp
- outbuf
);
1023 outfoffset
+= (int32_t)(wr
= fwrite(outbuf
, 1, outlen
, outfile
));
1025 UnicodeString
str(strerror(errno
));
1027 u_wmsg(stderr
, "cantWrite", str
.getTerminatedBuffer());
1034 } while (!toSawEndOfUnicode
);
1035 } while (!fromSawEndOfBytes
);
1036 } while (!flush
); // Stop when we have flushed the
1037 // converters (this means that it's
1038 // the end of output)
1048 ucnv_close(convfrom
);
1051 #if !UCONFIG_NO_TRANSLITERATION
1062 static void usage(const char *pname
, int ecode
) {
1065 UErrorCode err
= U_ZERO_ERROR
;
1066 FILE *fp
= ecode
? stderr
: stdout
;
1071 ures_getStringByKey(gBundle
, ecode
? "lcUsageWord" : "ucUsageWord",
1073 UnicodeString
upname(pname
, (int32_t)(uprv_strlen(pname
) + 1));
1074 UnicodeString
mname(msg
, msgLen
+ 1);
1076 res
= u_wmsg(fp
, "usage", mname
.getBuffer(), upname
.getBuffer());
1081 if (!u_wmsg(fp
, "help")) {
1082 /* Now dump callbacks and finish. */
1085 UPRV_LENGTHOF(transcode_callbacks
);
1086 for (i
= 0; i
< count
; ++i
) {
1087 fprintf(fp
, " %s", transcode_callbacks
[i
].name
);
1097 main(int argc
, char **argv
)
1102 size_t bufsz
= DEFAULT_BUFSZ
;
1104 const char *fromcpage
= 0;
1105 const char *tocpage
= 0;
1106 const char *translit
= 0;
1107 const char *outfilestr
= 0;
1108 UBool fallback
= FALSE
;
1110 UConverterFromUCallback fromucallback
= UCNV_FROM_U_CALLBACK_STOP
;
1111 const void *fromuctxt
= 0;
1112 UConverterToUCallback toucallback
= UCNV_TO_U_CALLBACK_STOP
;
1113 const void *touctxt
= 0;
1115 char **iter
, **remainArgv
, **remainArgvLimit
;
1116 char **end
= argv
+ argc
;
1120 UBool printConvs
= FALSE
, printCanon
= FALSE
, printTranslits
= FALSE
;
1121 const char *printName
= 0;
1123 UBool verbose
= FALSE
;
1124 UErrorCode status
= U_ZERO_ERROR
;
1128 /* Initialize ICU */
1130 if (U_FAILURE(status
)) {
1131 fprintf(stderr
, "%s: can not initialize ICU. status = %s\n",
1132 argv
[0], u_errorName(status
));
1136 // Get and prettify pname.
1137 pname
= uprv_strrchr(*argv
, U_FILE_SEP_CHAR
);
1138 #if U_PLATFORM_USES_ONLY_WIN32_API
1140 pname
= uprv_strrchr(*argv
, '/');
1149 // First, get the arguments from command-line
1150 // to know the codepages to convert between
1152 remainArgv
= remainArgvLimit
= argv
+ 1;
1153 for (iter
= argv
+ 1; iter
!= end
; iter
++) {
1154 // Check for from charset
1155 if (strcmp("-f", *iter
) == 0 || !strcmp("--from-code", *iter
)) {
1161 } else if (strcmp("-t", *iter
) == 0 || !strcmp("--to-code", *iter
)) {
1167 } else if (strcmp("-x", *iter
) == 0) {
1173 } else if (!strcmp("--fallback", *iter
)) {
1175 } else if (!strcmp("--no-fallback", *iter
)) {
1177 } else if (strcmp("-b", *iter
) == 0 || !strcmp("--block-size", *iter
)) {
1180 bufsz
= atoi(*iter
);
1181 if ((int) bufsz
<= 0) {
1183 UnicodeString
str(*iter
);
1185 u_wmsg(stderr
, "badBlockSize", str
.getTerminatedBuffer());
1191 } else if (strcmp("-l", *iter
) == 0 || !strcmp("--list", *iter
)) {
1192 if (printTranslits
) {
1196 } else if (strcmp("--default-code", *iter
) == 0) {
1197 if (printTranslits
) {
1200 printName
= ucnv_getDefaultName();
1201 } else if (strcmp("--list-code", *iter
) == 0) {
1202 if (printTranslits
) {
1208 UErrorCode e
= U_ZERO_ERROR
;
1209 printName
= ucnv_getAlias(*iter
, 0, &e
);
1210 if (U_FAILURE(e
) || !printName
) {
1211 UnicodeString
str(*iter
);
1213 u_wmsg(stderr
, "noSuchCodeset", str
.getTerminatedBuffer());
1218 } else if (strcmp("--canon", *iter
) == 0) {
1220 } else if (strcmp("-L", *iter
) == 0
1221 || !strcmp("--list-transliterators", *iter
)) {
1225 printTranslits
= TRUE
;
1226 } else if (strcmp("-h", *iter
) == 0 || !strcmp("-?", *iter
)
1227 || !strcmp("--help", *iter
)) {
1229 } else if (!strcmp("-c", *iter
)) {
1230 fromucallback
= UCNV_FROM_U_CALLBACK_SKIP
;
1231 } else if (!strcmp("--to-callback", *iter
)) {
1234 const struct callback_ent
*cbe
= findCallback(*iter
);
1236 fromucallback
= cbe
->fromu
;
1237 fromuctxt
= cbe
->fromuctxt
;
1239 UnicodeString
str(*iter
);
1241 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1247 } else if (!strcmp("--from-callback", *iter
)) {
1250 const struct callback_ent
*cbe
= findCallback(*iter
);
1252 toucallback
= cbe
->tou
;
1253 touctxt
= cbe
->touctxt
;
1255 UnicodeString
str(*iter
);
1257 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1263 } else if (!strcmp("-i", *iter
)) {
1264 toucallback
= UCNV_TO_U_CALLBACK_SKIP
;
1265 } else if (!strcmp("--callback", *iter
)) {
1268 const struct callback_ent
*cbe
= findCallback(*iter
);
1270 fromucallback
= cbe
->fromu
;
1271 fromuctxt
= cbe
->fromuctxt
;
1272 toucallback
= cbe
->tou
;
1273 touctxt
= cbe
->touctxt
;
1275 UnicodeString
str(*iter
);
1277 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1283 } else if (!strcmp("-s", *iter
) || !strcmp("--silent", *iter
)) {
1285 } else if (!strcmp("-v", *iter
) || !strcmp("--verbose", *iter
)) {
1287 } else if (!strcmp("-V", *iter
) || !strcmp("--version", *iter
)) {
1288 printf("%s v2.1 ICU " U_ICU_VERSION
"\n", pname
);
1290 } else if (!strcmp("-o", *iter
) || !strcmp("--output", *iter
)) {
1292 if (iter
!= end
&& !outfilestr
) {
1297 } else if (0 == strcmp("--add-signature", *iter
)) {
1299 } else if (0 == strcmp("--remove-signature", *iter
)) {
1301 } else if (**iter
== '-' && (*iter
)[1]) {
1304 // move a non-option up in argv[]
1305 *remainArgvLimit
++ = *iter
;
1309 if (printConvs
|| printName
) {
1310 return printConverters(pname
, printName
, printCanon
) ? 2 : 0;
1311 } else if (printTranslits
) {
1312 return printTransliterators(printCanon
) ? 3 : 0;
1315 if (!fromcpage
|| !uprv_strcmp(fromcpage
, "-")) {
1316 fromcpage
= ucnv_getDefaultName();
1318 if (!tocpage
|| !uprv_strcmp(tocpage
, "-")) {
1319 tocpage
= ucnv_getDefaultName();
1322 // Open the correct output file or connect to stdout for writing output
1323 if (outfilestr
!= 0 && strcmp(outfilestr
, "-")) {
1324 outfile
= fopen(outfilestr
, "wb");
1326 UnicodeString
str1(outfilestr
, "");
1327 UnicodeString
str2(strerror(errno
), "");
1329 u_wmsg(stderr
, "cantCreateOutputF",
1330 str1
.getBuffer(), str2
.getBuffer());
1336 #ifdef USE_FILENO_BINARY_MODE
1337 if (setmode(fileno(outfile
), O_BINARY
) == -1) {
1338 u_wmsg(stderr
, "cantSetOutBinMode");
1344 /* Loop again on the arguments to find all the input files, and
1347 cf
.setBufferSize(bufsz
);
1349 if(remainArgv
< remainArgvLimit
) {
1350 for (iter
= remainArgv
; iter
!= remainArgvLimit
; iter
++) {
1351 if (!cf
.convertFile(
1352 pname
, fromcpage
, toucallback
, touctxt
, tocpage
,
1353 fromucallback
, fromuctxt
, fallback
, translit
, *iter
,
1360 if (!cf
.convertFile(
1361 pname
, fromcpage
, toucallback
, touctxt
, tocpage
,
1362 fromucallback
, fromuctxt
, fallback
, translit
, 0,
1371 #if !UCONFIG_NO_LEGACY_CONVERSION
1374 fprintf(stderr
, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n");
1378 if (outfile
!= stdout
) {
1389 * Hey, Emacs, please set the following:
1392 * indent-tabs-mode: nil