1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*****************************************************************************
5 * Copyright (C) 1999-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
8 ******************************************************************************/
11 * uconv(1): an iconv(1)-like converter using ICU.
13 * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se>
14 * contributed in 1999.
16 * Conversion to the C conversion API and many improvements by
17 * Yves Arrouye <yves@realnames.com>, current maintainer.
19 * Markus Scherer maintainer from 2003.
20 * See source code repository history for changes.
23 #include <unicode/utypes.h>
24 #include <unicode/putil.h>
25 #include <unicode/ucnv.h>
26 #include <unicode/uenum.h>
27 #include <unicode/unistr.h>
28 #include <unicode/translit.h>
29 #include <unicode/uset.h>
30 #include <unicode/uclean.h>
31 #include <unicode/utf16.h>
42 #include "unicode/uwmsg.h"
46 #if U_PLATFORM_USES_ONLY_WIN32_API && !defined(__STRICT_ANSI__)
49 #if U_PLATFORM_USES_ONLY_WIN32_API
50 #define USE_FILENO_BINARY_MODE 1
51 /* Windows likes to rename Unix-like functions */
53 #define fileno _fileno
56 #define setmode _setmode
59 #define O_BINARY _O_BINARY
65 /* below from the README */
66 #include "unicode/utypes.h"
67 #include "unicode/udata.h"
68 U_CFUNC
char uconvmsg_dat
[];
71 #define DEFAULT_BUFSZ 4096
72 #define UCONVMSG "uconvmsg"
74 static UResourceBundle
*gBundle
= 0; /* Bundle containing messages. */
77 * Initialize the message bundle so that message strings can be fetched
82 static void initMsg(const char *pname
) {
86 char dataPath
[2048]; /* XXX Sloppy: should be PATH_MAX. */
87 UErrorCode err
= U_ZERO_ERROR
;
91 /* Set up our static data - if any */
92 #if defined(UCONVMSG_LINK) && U_PLATFORM != U_PF_OS390 /* On z/OS, this is failing. */
93 udata_setAppData(UCONVMSG
, (const void*) uconvmsg_dat
, &err
);
95 fprintf(stderr
, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
96 pname
, u_errorName(err
));
97 err
= U_ZERO_ERROR
; /* It may still fail */
102 gBundle
= u_wmsg_setPath(UCONVMSG
, &err
);
103 if (U_FAILURE(err
)) {
105 "%s: warning: couldn't open bundle %s: %s\n",
106 pname
, UCONVMSG
, u_errorName(err
));
109 "%s: setAppData was called, internal data %s failed to load\n",
114 /* that was try #1, try again with a path */
115 uprv_strcpy(dataPath
, u_getDataDirectory());
116 uprv_strcat(dataPath
, U_FILE_SEP_STRING
);
117 uprv_strcat(dataPath
, UCONVMSG
);
119 gBundle
= u_wmsg_setPath(dataPath
, &err
);
120 if (U_FAILURE(err
)) {
122 "%s: warning: still couldn't open bundle %s: %s\n",
123 pname
, dataPath
, u_errorName(err
));
124 fprintf(stderr
, "%s: warning: messages will not be displayed\n", pname
);
130 /* Mapping of callback names to the callbacks passed to the converter
133 static struct callback_ent
{
135 UConverterFromUCallback fromu
;
136 const void *fromuctxt
;
137 UConverterToUCallback tou
;
139 } transcode_callbacks
[] = {
141 UCNV_FROM_U_CALLBACK_SUBSTITUTE
, 0,
142 UCNV_TO_U_CALLBACK_SUBSTITUTE
, 0 },
144 UCNV_FROM_U_CALLBACK_SKIP
, 0,
145 UCNV_TO_U_CALLBACK_SKIP
, 0 },
147 UCNV_FROM_U_CALLBACK_STOP
, 0,
148 UCNV_TO_U_CALLBACK_STOP
, 0 },
150 UCNV_FROM_U_CALLBACK_ESCAPE
, 0,
151 UCNV_TO_U_CALLBACK_ESCAPE
, 0},
153 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_ICU
,
154 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_ICU
},
156 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_JAVA
,
157 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_JAVA
},
159 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_C
,
160 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_C
},
162 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
,
163 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
},
165 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
,
166 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
},
168 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_DEC
,
169 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_DEC
},
170 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_UNICODE
,
171 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_UNICODE
}
174 /* Return a pointer to a callback record given its name. */
176 static const struct callback_ent
*findCallback(const char *name
) {
178 UPRV_LENGTHOF(transcode_callbacks
);
180 /* We'll do a linear search; there aren't many of them, and bsearch()
181 may not be that portable. */
183 for (i
= 0; i
< count
; ++i
) {
184 if (!uprv_stricmp(name
, transcode_callbacks
[i
].name
)) {
185 return &transcode_callbacks
[i
];
192 /* Print converter information. If lookfor is set, only that converter will
193 be printed, otherwise all converters will be printed. If canon is
194 nonzero, tags and aliases for each converter are printed too, in the format
195 expected for convrtrs.txt(5). */
197 static int printConverters(const char *pname
, const char *lookfor
,
200 UErrorCode err
= U_ZERO_ERROR
;
205 /* If there is a specified name, just handle that now. */
209 printf("%s\n", lookfor
);
212 /* Because we are printing a canonical name, we need the
213 true converter name. We've done that already except for
214 the default name (because we want to print the exact
215 name one would get when calling ucnv_getDefaultName()
216 in non-canon mode). But since we do not know at this
217 point if we have the default name or something else, we
218 need to normalize again to the canonical converter
221 const char *truename
= ucnv_getAlias(lookfor
, 0, &err
);
222 if (U_SUCCESS(err
)) {
230 /* Print converter names. We come here for one of two reasons: we
231 are printing all the names (lookfor was null), or we have a
232 single converter to print but in canon mode, hence we need to
233 get to it in order to print everything. */
235 num
= ucnv_countAvailable();
238 u_wmsg(stderr
, "cantGetNames");
242 num
= 1; /* We know where we want to be. */
245 num_stds
= ucnv_countStandards();
246 stds
= (const char **) uprv_malloc(num_stds
* sizeof(*stds
));
248 u_wmsg(stderr
, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR
));
256 for (s
= 0; s
< num_stds
; ++s
) {
257 stds
[s
] = ucnv_getStandard(s
, &err
);
259 printf("%s ", stds
[s
]);
261 if (U_FAILURE(err
)) {
262 u_wmsg(stderr
, "cantGetTag", u_wmsg_errorName(err
));
271 for (int32_t i
= 0; i
< num
; i
++) {
273 uint16_t num_aliases
;
275 /* Set the name either to what we are looking for, or
276 to the current converter name. */
281 name
= ucnv_getAvailableName(i
);
284 /* Get all the aliases associated with the name. */
287 num_aliases
= ucnv_countAliases(name
, &err
);
288 if (U_FAILURE(err
)) {
291 UnicodeString
str(name
, "");
293 u_wmsg(stderr
, "cantGetAliases", str
.getTerminatedBuffer(),
294 u_wmsg_errorName(err
));
299 /* Write all the aliases and their tags. */
301 for (a
= 0; a
< num_aliases
; ++a
) {
302 const char *alias
= ucnv_getAlias(name
, a
, &err
);
304 if (U_FAILURE(err
)) {
305 UnicodeString
str(name
, "");
307 u_wmsg(stderr
, "cantGetAliases", str
.getTerminatedBuffer(),
308 u_wmsg_errorName(err
));
312 /* Print the current alias so that it looks right. */
313 printf("%s%s%s", (canon
? (a
== 0? "" : "\t" ) : "") ,
317 /* Look (slowly, linear searching) for a tag. */
320 /* -1 to skip the last standard */
321 for (s
= t
= 0; s
< num_stds
-1; ++s
) {
322 UEnumeration
*nameEnum
= ucnv_openStandardNames(name
, stds
[s
], &err
);
323 if (U_SUCCESS(err
)) {
324 /* List the standard tags */
325 const char *standardName
;
326 UBool isFirst
= TRUE
;
327 UErrorCode enumError
= U_ZERO_ERROR
;
328 while ((standardName
= uenum_next(nameEnum
, NULL
, &enumError
))) {
329 /* See if this alias is supported by this standard. */
330 if (!strcmp(standardName
, alias
)) {
335 /* Print a * after the default standard name */
336 printf(" %s%s", stds
[s
], (isFirst
? "*" : ""));
346 /* Terminate this entry. */
353 /* Terminate this entry. */
360 /* Free temporary data. */
372 /* Print all available transliterators. If canon is nonzero, print
373 one transliterator per line. */
375 static int printTransliterators(UBool canon
)
377 #if UCONFIG_NO_TRANSLITERATION
378 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
381 UErrorCode status
= U_ZERO_ERROR
;
382 UEnumeration
*ids
= utrans_openIDs(&status
);
383 int32_t i
, numtrans
= uenum_count(ids
, &status
);
385 char sepchar
= canon
? '\n' : ' ';
387 for (i
= 0; U_SUCCESS(status
)&& (i
< numtrans
); ++i
) {
389 const char *nextTrans
= uenum_next(ids
, &len
, &status
);
391 printf("%s", nextTrans
);
392 if (i
< numtrans
- 1) {
399 /* Add a terminating newline if needed. */
401 if (sepchar
!= '\n') {
413 uCR
= 0xd, // carriage return
414 uLF
= 0xa, // line feed
415 uNL
= 0x85, // newline
416 uLS
= 0x2028, // line separator
417 uPS
= 0x2029, // paragraph separator
418 uSig
= 0xfeff // signature/BOM character
421 static inline int32_t
422 getChunkLimit(const UnicodeString
&prev
, const UnicodeString
&s
) {
424 // CR, LF, CRLF, NL, LS, PS
425 // for paragraph ends (see UAX #13/Unicode 4)
426 // and include it in the chunk
427 // all of these characters are on the BMP
428 // do not include FF or VT in case they are part of a paragraph
429 // (important for bidi contexts)
430 static const UChar paraEnds
[] = {
431 0xd, 0xa, 0x85, 0x2028, 0x2029
434 iCR
, iLF
, iNL
, iLS
, iPS
, iCount
437 // first, see if there is a CRLF split between prev and s
438 if (prev
.endsWith(paraEnds
+ iCR
, 1)) {
439 if (s
.startsWith(paraEnds
+ iLF
, 1)) {
440 return 1; // split CRLF, include the LF
441 } else if (!s
.isEmpty()) {
442 return 0; // complete the last chunk
444 return -1; // wait for actual further contents to arrive
448 const UChar
*u
= s
.getBuffer(), *limit
= u
+ s
.length();
454 ((c
< uSP
) && (c
== uCR
|| c
== uLF
)) ||
461 return -1; // LF may be in the next chunk
462 } else if (*u
== uLF
) {
463 ++u
; // include the LF in this chunk
466 return (int32_t)(u
- s
.getBuffer());
470 return -1; // continue collecting the chunk
474 CNV_NO_FEFF
, // cannot convert the U+FEFF Unicode signature character (BOM)
475 CNV_WITH_FEFF
, // can convert the U+FEFF signature character
476 CNV_ADDS_FEFF
// automatically adds/detects the U+FEFF signature character
480 nibbleToHex(uint8_t n
) {
485 (UChar
)((0x61 - 10) + n
);
488 // check the converter's Unicode signature properties;
489 // the fromUnicode side of the converter must be in its initial state
490 // and will be reset again if it was used
492 cnvSigType(UConverter
*cnv
) {
496 // test if the output charset can convert U+FEFF
497 USet
*set
= uset_open(1, 0);
499 ucnv_getUnicodeSet(cnv
, set
, UCNV_ROUNDTRIP_SET
, &err
);
500 if (U_SUCCESS(err
) && uset_contains(set
, uSig
)) {
501 result
= CNV_WITH_FEFF
;
503 result
= CNV_NO_FEFF
; // an error occurred or U+FEFF cannot be converted
507 if (result
== CNV_WITH_FEFF
) {
508 // test if the output charset emits a signature anyway
509 const UChar a
[1] = { 0x61 }; // "a"
518 ucnv_fromUnicode(cnv
,
519 &out
, buffer
+ sizeof(buffer
),
522 ucnv_resetFromUnicode(cnv
);
524 if (NULL
!= ucnv_detectUnicodeSignature(buffer
, (int32_t)(out
- buffer
), NULL
, &err
) &&
527 result
= CNV_ADDS_FEFF
;
537 buf(NULL
), outbuf(NULL
), fromoffsets(NULL
),
538 bufsz(0), signature(0) {}
541 setBufferSize(size_t bufferSize
) {
544 buf
= new char[2 * bufsz
];
545 outbuf
= buf
+ bufsz
;
547 // +1 for an added U+FEFF in the intermediate Unicode buffer
548 fromoffsets
= new int32_t[bufsz
+ 1];
553 delete [] fromoffsets
;
556 UBool
convertFile(const char *pname
,
557 const char *fromcpage
,
558 UConverterToUCallback toucallback
,
561 UConverterFromUCallback fromucallback
,
562 const void *fromuctxt
,
564 const char *translit
,
565 const char *infilestr
,
566 FILE * outfile
, int verbose
);
568 friend int main(int argc
, char **argv
);
571 int32_t *fromoffsets
;
574 int8_t signature
; // add (1) or remove (-1) a U+FEFF Unicode signature character
577 // Convert a file from one encoding to another
579 ConvertFile::convertFile(const char *pname
,
580 const char *fromcpage
,
581 UConverterToUCallback toucallback
,
584 UConverterFromUCallback fromucallback
,
585 const void *fromuctxt
,
587 const char *translit
,
588 const char *infilestr
,
589 FILE * outfile
, int verbose
)
593 UConverter
*convfrom
= 0;
594 UConverter
*convto
= 0;
595 UErrorCode err
= U_ZERO_ERROR
;
597 UBool closeFile
= FALSE
;
598 const char *cbufp
, *prevbufp
;
601 uint32_t infoffset
= 0, outfoffset
= 0; /* Where we are in the file, for error reporting. */
603 const UChar
*unibuf
, *unibufbp
;
608 #if !UCONFIG_NO_TRANSLITERATION
609 Transliterator
*t
= 0; // Transliterator acting on Unicode data.
610 UnicodeString chunk
; // One chunk of the text being collected for transformation.
612 UnicodeString u
; // String to do the transliteration.
615 // use conversion offsets for error messages
616 // unless a transliterator is used -
617 // a text transformation will reorder characters in unpredictable ways
618 UBool useOffsets
= TRUE
;
620 // Open the correct input file or connect to stdin for reading input
622 if (infilestr
!= 0 && strcmp(infilestr
, "-")) {
623 infile
= fopen(infilestr
, "rb");
625 UnicodeString
str1(infilestr
, "");
626 str1
.append((UChar32
) 0);
627 UnicodeString
str2(strerror(errno
), "");
628 str2
.append((UChar32
) 0);
630 u_wmsg(stderr
, "cantOpenInputF", str1
.getBuffer(), str2
.getBuffer());
637 #ifdef USE_FILENO_BINARY_MODE
638 if (setmode(fileno(stdin
), O_BINARY
) == -1) {
640 u_wmsg(stderr
, "cantSetInBinMode");
647 fprintf(stderr
, "%s:\n", infilestr
);
650 #if !UCONFIG_NO_TRANSLITERATION
651 // Create transliterator as needed.
653 if (translit
!= NULL
&& *translit
) {
655 UnicodeString
str(translit
), pestr
;
657 /* Create from rules or by ID as needed. */
661 if (uprv_strchr(translit
, ':') || uprv_strchr(translit
, '>') || uprv_strchr(translit
, '<') || uprv_strchr(translit
, '>')) {
662 t
= Transliterator::createFromRules(UNICODE_STRING_SIMPLE("Uconv"), str
, UTRANS_FORWARD
, parse
, err
);
664 t
= Transliterator::createInstance(UnicodeString(translit
, -1, US_INV
), UTRANS_FORWARD
, err
);
667 if (U_FAILURE(err
)) {
668 str
.append((UChar32
) 0);
671 if (parse
.line
>= 0) {
672 UChar linebuf
[20], offsetbuf
[20];
673 uprv_itou(linebuf
, 20, parse
.line
, 10, 0);
674 uprv_itou(offsetbuf
, 20, parse
.offset
, 10, 0);
675 u_wmsg(stderr
, "cantCreateTranslitParseErr", str
.getTerminatedBuffer(),
676 u_wmsg_errorName(err
), linebuf
, offsetbuf
);
678 u_wmsg(stderr
, "cantCreateTranslit", str
.getTerminatedBuffer(),
679 u_wmsg_errorName(err
));
693 // Create codepage converter. If the codepage or its aliases weren't
694 // available, it returns NULL and a failure code. We also set the
695 // callbacks, and return errors in the same way.
697 convfrom
= ucnv_open(fromcpage
, &err
);
698 if (U_FAILURE(err
)) {
699 UnicodeString
str(fromcpage
, "");
701 u_wmsg(stderr
, "cantOpenFromCodeset", str
.getTerminatedBuffer(),
702 u_wmsg_errorName(err
));
705 ucnv_setToUCallBack(convfrom
, toucallback
, touctxt
, 0, 0, &err
);
706 if (U_FAILURE(err
)) {
708 u_wmsg(stderr
, "cantSetCallback", u_wmsg_errorName(err
));
712 convto
= ucnv_open(tocpage
, &err
);
713 if (U_FAILURE(err
)) {
714 UnicodeString
str(tocpage
, "");
716 u_wmsg(stderr
, "cantOpenToCodeset", str
.getTerminatedBuffer(),
717 u_wmsg_errorName(err
));
720 ucnv_setFromUCallBack(convto
, fromucallback
, fromuctxt
, 0, 0, &err
);
721 if (U_FAILURE(err
)) {
723 u_wmsg(stderr
, "cantSetCallback", u_wmsg_errorName(err
));
726 ucnv_setFallback(convto
, fallback
);
728 UBool willexit
, fromSawEndOfBytes
, toSawEndOfUnicode
;
731 // OK, we can convert now.
738 // input file offset at the beginning of the next buffer
741 rd
= fread(buf
, 1, bufsz
, infile
);
742 if (ferror(infile
) != 0) {
743 UnicodeString
str(strerror(errno
));
745 u_wmsg(stderr
, "cantRead", str
.getTerminatedBuffer());
749 // Convert the read buffer into the new encoding via Unicode.
750 // After the call 'unibufp' will be placed behind the last
751 // character that was converted in the 'unibuf'.
752 // Also the 'cbufp' is positioned behind the last converted input byte.
754 // At the last conversion in the file, flush should be set to
755 // true so that we get all characters converted.
757 // The converter must be flushed at the end of conversion so
758 // that characters on hold also will be written.
761 flush
= (UBool
)(rd
!= bufsz
);
763 // convert until the input is consumed
765 // remember the start of the current byte-to-Unicode conversion
768 unibuf
= unibufp
= u
.getBuffer((int32_t)bufsz
);
770 // Use bufsz instead of u.getCapacity() for the targetLimit
771 // so that we don't overflow fromoffsets[].
772 ucnv_toUnicode(convfrom
, &unibufp
, unibuf
+ bufsz
, &cbufp
,
773 buf
+ rd
, useOffsets
? fromoffsets
: NULL
, flush
, &err
);
775 ulen
= (int32_t)(unibufp
- unibuf
);
776 u
.releaseBuffer(U_SUCCESS(err
) ? ulen
: 0);
778 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
779 // converting all of the input bytes.
780 // It works like this because ucnv_toUnicode() returns only under the
781 // following conditions:
782 // - an error occurred during conversion (an error code is set)
783 // - the target buffer is filled (the error code indicates an overflow)
784 // - the source is consumed
785 // That is, if the error code does not indicate a failure,
786 // not even an overflow, then the source must be consumed entirely.
787 fromSawEndOfBytes
= (UBool
)U_SUCCESS(err
);
789 if (err
== U_BUFFER_OVERFLOW_ERROR
) {
791 } else if (U_FAILURE(err
)) {
792 char pos
[32], errorBytes
[32];
793 int8_t i
, length
, errorLength
;
795 UErrorCode localError
= U_ZERO_ERROR
;
796 errorLength
= (int8_t)sizeof(errorBytes
);
797 ucnv_getInvalidChars(convfrom
, errorBytes
, &errorLength
, &localError
);
798 if (U_FAILURE(localError
) || errorLength
== 0) {
802 // print the input file offset of the start of the error bytes:
803 // input file offset of the current byte buffer +
804 // length of the just consumed bytes -
805 // length of the error bytes
807 (int8_t)sprintf(pos
, "%d",
808 (int)(infoffset
+ (cbufp
- buf
) - errorLength
));
810 // output the bytes that caused the error
812 for (i
= 0; i
< errorLength
; ++i
) {
814 str
.append((UChar
)uSP
);
816 str
.append(nibbleToHex((uint8_t)errorBytes
[i
] >> 4));
817 str
.append(nibbleToHex((uint8_t)errorBytes
[i
]));
821 u_wmsg(stderr
, "problemCvtToU",
822 UnicodeString(pos
, length
, "").getTerminatedBuffer(),
823 str
.getTerminatedBuffer(),
824 u_wmsg_errorName(err
));
827 err
= U_ZERO_ERROR
; /* reset the error for the rest of the conversion. */
830 // Replaced a check for whether the input was consumed by
831 // looping until it is; message key "premEndInput" now obsolete.
837 // remove a U+FEFF Unicode signature character if requested
839 if (u
.charAt(0) == uSig
) {
842 // account for the removed UChar and offset
846 // remove an offset from fromoffsets[] as well
847 // to keep the array parallel with the UChars
848 memmove(fromoffsets
, fromoffsets
+ 1, ulen
* 4);
855 #if !UCONFIG_NO_TRANSLITERATION
856 // Transliterate/transform if needed.
858 // For transformation, we use chunking code -
859 // collect Unicode input until, for example, an end-of-line,
860 // then transform and output-convert that and continue collecting.
861 // This makes the transformation result independent of the buffer size
862 // while avoiding the slower keyboard mode.
863 // The end-of-chunk characters are completely included in the
864 // transformed string in case they are to be transformed themselves.
870 chunkLimit
= getChunkLimit(chunk
, u
);
871 if (chunkLimit
< 0 && flush
&& fromSawEndOfBytes
) {
872 // use all of the rest at the end of the text
873 chunkLimit
= u
.length();
875 if (chunkLimit
>= 0) {
876 // complete the chunk and transform it
877 chunk
.append(u
, 0, chunkLimit
);
878 u
.remove(0, chunkLimit
);
879 t
->transliterate(chunk
);
881 // append the transformation result to the result and empty the chunk
885 // continue collecting the chunk
889 } while (!u
.isEmpty());
896 // add a U+FEFF Unicode signature character if requested
897 // and possible/necessary
899 if (u
.charAt(0) != uSig
&& cnvSigType(convto
) == CNV_WITH_FEFF
) {
900 u
.insert(0, (UChar
)uSig
);
903 // insert a pseudo-offset into fromoffsets[] as well
904 // to keep the array parallel with the UChars
905 memmove(fromoffsets
+ 1, fromoffsets
, ulen
* 4);
909 // account for the additional UChar and offset
915 // Convert the Unicode buffer into the destination codepage
916 // Again 'bufp' will be placed behind the last converted character
917 // And 'unibufbp' will be placed behind the last converted Unicode character
918 // At the last conversion flush should be set to true to ensure that
919 // all characters left get converted
921 unibuf
= unibufbp
= u
.getBuffer();
926 // Use fromSawEndOfBytes in addition to the flush flag -
927 // it indicates whether the intermediate Unicode string
928 // contains the very last UChars for the very last input bytes.
929 ucnv_fromUnicode(convto
, &bufp
, outbuf
+ bufsz
,
932 NULL
, (UBool
)(flush
&& fromSawEndOfBytes
), &err
);
934 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
935 // converting all of the intermediate UChars.
936 // See comment for fromSawEndOfBytes.
937 toSawEndOfUnicode
= (UBool
)U_SUCCESS(err
);
939 if (err
== U_BUFFER_OVERFLOW_ERROR
) {
941 } else if (U_FAILURE(err
)) {
942 UChar errorUChars
[4];
946 int8_t i
, length
, errorLength
;
948 UErrorCode localError
= U_ZERO_ERROR
;
949 errorLength
= UPRV_LENGTHOF(errorUChars
);
950 ucnv_getInvalidUChars(convto
, errorUChars
, &errorLength
, &localError
);
951 if (U_FAILURE(localError
) || errorLength
== 0) {
952 // need at least 1 so that we don't access beyond the length of fromoffsets[]
959 // Unicode buffer offset of the start of the error UChars
960 ferroffset
= (int32_t)((unibufbp
- unibuf
) - errorLength
);
961 if (ferroffset
< 0) {
962 // approximation - the character started in the previous Unicode buffer
966 // get the corresponding byte offset out of fromoffsets[]
967 // go back if the offset is not known for some of the UChars
970 fromoffset
= fromoffsets
[ferroffset
];
971 } while (fromoffset
< 0 && --ferroffset
>= 0);
973 // total input file offset =
974 // input file offset of the current byte buffer +
975 // byte buffer offset of where the current Unicode buffer is converted from +
976 // fromoffsets[Unicode offset]
977 ferroffset
= infoffset
+ (prevbufp
- buf
) + fromoffset
;
978 errtag
= "problemCvtFromU";
980 // Do not use fromoffsets if (t != NULL) because the Unicode text may
981 // be different from what the offsets refer to.
983 // output file offset
984 ferroffset
= (int32_t)(outfoffset
+ (bufp
- outbuf
));
985 errtag
= "problemCvtFromUOut";
988 length
= (int8_t)sprintf(pos
, "%u", (int)ferroffset
);
990 // output the code points that caused the error
992 for (i
= 0; i
< errorLength
;) {
994 str
.append((UChar
)uSP
);
996 U16_NEXT(errorUChars
, i
, errorLength
, c
);
998 str
.append(nibbleToHex((uint8_t)(c
>> 20)));
1001 str
.append(nibbleToHex((uint8_t)(c
>> 16)));
1003 str
.append(nibbleToHex((uint8_t)(c
>> 12)));
1004 str
.append(nibbleToHex((uint8_t)(c
>> 8)));
1005 str
.append(nibbleToHex((uint8_t)(c
>> 4)));
1006 str
.append(nibbleToHex((uint8_t)c
));
1010 u_wmsg(stderr
, errtag
,
1011 UnicodeString(pos
, length
, "").getTerminatedBuffer(),
1012 str
.getTerminatedBuffer(),
1013 u_wmsg_errorName(err
));
1014 u_wmsg(stderr
, "errorUnicode", str
.getTerminatedBuffer());
1017 err
= U_ZERO_ERROR
; /* reset the error for the rest of the conversion. */
1020 // Replaced a check for whether the intermediate Unicode characters were all consumed by
1021 // looping until they are; message key "premEnd" now obsolete.
1023 // Finally, write the converted buffer to the output file
1024 size_t outlen
= (size_t) (bufp
- outbuf
);
1025 outfoffset
+= (int32_t)(wr
= fwrite(outbuf
, 1, outlen
, outfile
));
1027 UnicodeString
str(strerror(errno
));
1029 u_wmsg(stderr
, "cantWrite", str
.getTerminatedBuffer());
1036 } while (!toSawEndOfUnicode
);
1037 } while (!fromSawEndOfBytes
);
1038 } while (!flush
); // Stop when we have flushed the
1039 // converters (this means that it's
1040 // the end of output)
1050 ucnv_close(convfrom
);
1053 #if !UCONFIG_NO_TRANSLITERATION
1064 static void usage(const char *pname
, int ecode
) {
1067 UErrorCode err
= U_ZERO_ERROR
;
1068 FILE *fp
= ecode
? stderr
: stdout
;
1073 ures_getStringByKey(gBundle
, ecode
? "lcUsageWord" : "ucUsageWord",
1075 UnicodeString
upname(pname
, (int32_t)(uprv_strlen(pname
) + 1));
1076 UnicodeString
mname(msg
, msgLen
+ 1);
1078 res
= u_wmsg(fp
, "usage", mname
.getBuffer(), upname
.getBuffer());
1083 if (!u_wmsg(fp
, "help")) {
1084 /* Now dump callbacks and finish. */
1087 UPRV_LENGTHOF(transcode_callbacks
);
1088 for (i
= 0; i
< count
; ++i
) {
1089 fprintf(fp
, " %s", transcode_callbacks
[i
].name
);
1099 main(int argc
, char **argv
)
1104 size_t bufsz
= DEFAULT_BUFSZ
;
1106 const char *fromcpage
= 0;
1107 const char *tocpage
= 0;
1108 const char *translit
= 0;
1109 const char *outfilestr
= 0;
1110 UBool fallback
= FALSE
;
1112 UConverterFromUCallback fromucallback
= UCNV_FROM_U_CALLBACK_STOP
;
1113 const void *fromuctxt
= 0;
1114 UConverterToUCallback toucallback
= UCNV_TO_U_CALLBACK_STOP
;
1115 const void *touctxt
= 0;
1117 char **iter
, **remainArgv
, **remainArgvLimit
;
1118 char **end
= argv
+ argc
;
1122 UBool printConvs
= FALSE
, printCanon
= FALSE
, printTranslits
= FALSE
;
1123 const char *printName
= 0;
1125 UBool verbose
= FALSE
;
1126 UErrorCode status
= U_ZERO_ERROR
;
1130 /* Initialize ICU */
1132 if (U_FAILURE(status
)) {
1133 fprintf(stderr
, "%s: can not initialize ICU. status = %s\n",
1134 argv
[0], u_errorName(status
));
1138 // Get and prettify pname.
1139 pname
= uprv_strrchr(*argv
, U_FILE_SEP_CHAR
);
1140 #if U_PLATFORM_USES_ONLY_WIN32_API
1142 pname
= uprv_strrchr(*argv
, '/');
1151 // First, get the arguments from command-line
1152 // to know the codepages to convert between
1154 remainArgv
= remainArgvLimit
= argv
+ 1;
1155 for (iter
= argv
+ 1; iter
!= end
; iter
++) {
1156 // Check for from charset
1157 if (strcmp("-f", *iter
) == 0 || !strcmp("--from-code", *iter
)) {
1163 } else if (strcmp("-t", *iter
) == 0 || !strcmp("--to-code", *iter
)) {
1169 } else if (strcmp("-x", *iter
) == 0) {
1175 } else if (!strcmp("--fallback", *iter
)) {
1177 } else if (!strcmp("--no-fallback", *iter
)) {
1179 } else if (strcmp("-b", *iter
) == 0 || !strcmp("--block-size", *iter
)) {
1182 bufsz
= atoi(*iter
);
1183 if ((int) bufsz
<= 0) {
1185 UnicodeString
str(*iter
);
1187 u_wmsg(stderr
, "badBlockSize", str
.getTerminatedBuffer());
1193 } else if (strcmp("-l", *iter
) == 0 || !strcmp("--list", *iter
)) {
1194 if (printTranslits
) {
1198 } else if (strcmp("--default-code", *iter
) == 0) {
1199 if (printTranslits
) {
1202 printName
= ucnv_getDefaultName();
1203 } else if (strcmp("--list-code", *iter
) == 0) {
1204 if (printTranslits
) {
1210 UErrorCode e
= U_ZERO_ERROR
;
1211 printName
= ucnv_getAlias(*iter
, 0, &e
);
1212 if (U_FAILURE(e
) || !printName
) {
1213 UnicodeString
str(*iter
);
1215 u_wmsg(stderr
, "noSuchCodeset", str
.getTerminatedBuffer());
1220 } else if (strcmp("--canon", *iter
) == 0) {
1222 } else if (strcmp("-L", *iter
) == 0
1223 || !strcmp("--list-transliterators", *iter
)) {
1227 printTranslits
= TRUE
;
1228 } else if (strcmp("-h", *iter
) == 0 || !strcmp("-?", *iter
)
1229 || !strcmp("--help", *iter
)) {
1231 } else if (!strcmp("-c", *iter
)) {
1232 fromucallback
= UCNV_FROM_U_CALLBACK_SKIP
;
1233 } else if (!strcmp("--to-callback", *iter
)) {
1236 const struct callback_ent
*cbe
= findCallback(*iter
);
1238 fromucallback
= cbe
->fromu
;
1239 fromuctxt
= cbe
->fromuctxt
;
1241 UnicodeString
str(*iter
);
1243 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1249 } else if (!strcmp("--from-callback", *iter
)) {
1252 const struct callback_ent
*cbe
= findCallback(*iter
);
1254 toucallback
= cbe
->tou
;
1255 touctxt
= cbe
->touctxt
;
1257 UnicodeString
str(*iter
);
1259 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1265 } else if (!strcmp("-i", *iter
)) {
1266 toucallback
= UCNV_TO_U_CALLBACK_SKIP
;
1267 } else if (!strcmp("--callback", *iter
)) {
1270 const struct callback_ent
*cbe
= findCallback(*iter
);
1272 fromucallback
= cbe
->fromu
;
1273 fromuctxt
= cbe
->fromuctxt
;
1274 toucallback
= cbe
->tou
;
1275 touctxt
= cbe
->touctxt
;
1277 UnicodeString
str(*iter
);
1279 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1285 } else if (!strcmp("-s", *iter
) || !strcmp("--silent", *iter
)) {
1287 } else if (!strcmp("-v", *iter
) || !strcmp("--verbose", *iter
)) {
1289 } else if (!strcmp("-V", *iter
) || !strcmp("--version", *iter
)) {
1290 printf("%s v2.1 ICU " U_ICU_VERSION
"\n", pname
);
1292 } else if (!strcmp("-o", *iter
) || !strcmp("--output", *iter
)) {
1294 if (iter
!= end
&& !outfilestr
) {
1299 } else if (0 == strcmp("--add-signature", *iter
)) {
1301 } else if (0 == strcmp("--remove-signature", *iter
)) {
1303 } else if (**iter
== '-' && (*iter
)[1]) {
1306 // move a non-option up in argv[]
1307 *remainArgvLimit
++ = *iter
;
1311 if (printConvs
|| printName
) {
1312 return printConverters(pname
, printName
, printCanon
) ? 2 : 0;
1313 } else if (printTranslits
) {
1314 return printTransliterators(printCanon
) ? 3 : 0;
1317 if (!fromcpage
|| !uprv_strcmp(fromcpage
, "-")) {
1318 fromcpage
= ucnv_getDefaultName();
1320 if (!tocpage
|| !uprv_strcmp(tocpage
, "-")) {
1321 tocpage
= ucnv_getDefaultName();
1324 // Open the correct output file or connect to stdout for writing output
1325 if (outfilestr
!= 0 && strcmp(outfilestr
, "-")) {
1326 outfile
= fopen(outfilestr
, "wb");
1328 UnicodeString
str1(outfilestr
, "");
1329 UnicodeString
str2(strerror(errno
), "");
1331 u_wmsg(stderr
, "cantCreateOutputF",
1332 str1
.getBuffer(), str2
.getBuffer());
1338 #ifdef USE_FILENO_BINARY_MODE
1339 if (setmode(fileno(outfile
), O_BINARY
) == -1) {
1340 u_wmsg(stderr
, "cantSetOutBinMode");
1346 /* Loop again on the arguments to find all the input files, and
1349 cf
.setBufferSize(bufsz
);
1351 if(remainArgv
< remainArgvLimit
) {
1352 for (iter
= remainArgv
; iter
!= remainArgvLimit
; iter
++) {
1353 if (!cf
.convertFile(
1354 pname
, fromcpage
, toucallback
, touctxt
, tocpage
,
1355 fromucallback
, fromuctxt
, fallback
, translit
, *iter
,
1362 if (!cf
.convertFile(
1363 pname
, fromcpage
, toucallback
, touctxt
, tocpage
,
1364 fromucallback
, fromuctxt
, fallback
, translit
, 0,
1373 #if !UCONFIG_NO_LEGACY_CONVERSION
1376 fprintf(stderr
, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n");
1380 if (outfile
!= stdout
) {
1391 * Hey, Emacs, please set the following:
1394 * indent-tabs-mode: nil