1 /*****************************************************************************
3 * Copyright (C) 1999-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
6 ******************************************************************************/
9 * uconv(1): an iconv(1)-like converter using ICU.
11 * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se>
12 * contributed in 1999.
14 * Conversion to the C conversion API and many improvements by
15 * Yves Arrouye <yves@realnames.com>, current maintainer.
17 * Markus Scherer maintainer from 2003.
18 * See source code repository history for changes.
21 #include <unicode/utypes.h>
22 #include <unicode/putil.h>
23 #include <unicode/ucnv.h>
24 #include <unicode/uenum.h>
25 #include <unicode/unistr.h>
26 #include <unicode/translit.h>
27 #include <unicode/uset.h>
28 #include <unicode/uclean.h>
39 #include "unicode/uwmsg.h"
41 #if (defined(U_WINDOWS) || defined(U_CYGWIN)) && !defined(__STRICT_ANSI__)
44 #if defined(U_WINDOWS)
45 #define USE_FILENO_BINARY_MODE 1
46 /* Windows likes to rename Unix-like functions */
48 #define fileno _fileno
51 #define setmode _setmode
54 #define O_BINARY _O_BINARY
60 /* below from the README */
61 #include "unicode/utypes.h"
62 #include "unicode/udata.h"
63 U_CFUNC
char uconvmsg_dat
[];
66 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
68 #define DEFAULT_BUFSZ 4096
69 #define UCONVMSG "uconvmsg"
71 static UResourceBundle
*gBundle
= 0; /* Bundle containing messages. */
74 * Initialize the message bundle so that message strings can be fetched
// initMsg: registers the built-in uconvmsg data with udata_setAppData() and
// opens the message bundle into gBundle with u_wmsg_setPath().
// When the first open fails, a second attempt uses a path assembled from
// u_getDataDirectory() + U_FILE_SEP_STRING + UCONVMSG.
// pname is only used as the program-name prefix of the warnings sent to stderr.
79 static void initMsg(const char *pname
) {
83 char dataPath
[2048]; /* XXX Sloppy: should be PATH_MAX. */
84 UErrorCode err
= U_ZERO_ERROR
;
88 /* Set up our static data - if any */
90 udata_setAppData(UCONVMSG
, (const void*) uconvmsg_dat
, &err
);
92 fprintf(stderr
, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
93 pname
, u_errorName(err
));
94 err
= U_ZERO_ERROR
; /* It may still fail */
99 gBundle
= u_wmsg_setPath(UCONVMSG
, &err
);
100 if (U_FAILURE(err
)) {
102 "%s: warning: couldn't open bundle %s: %s\n",
103 pname
, UCONVMSG
, u_errorName(err
));
106 "%s: setAppData was called, internal data %s failed to load\n",
111 /* that was try #1, try again with a path */
// NOTE(review): dataPath is filled by uprv_strcpy/uprv_strcat (presumably
// strcpy/strcat equivalents) with no length check against its 2048 bytes.
112 uprv_strcpy(dataPath
, u_getDataDirectory());
113 uprv_strcat(dataPath
, U_FILE_SEP_STRING
);
114 uprv_strcat(dataPath
, UCONVMSG
);
116 gBundle
= u_wmsg_setPath(dataPath
, &err
);
117 if (U_FAILURE(err
)) {
119 "%s: warning: still couldn't open bundle %s: %s\n",
120 pname
, dataPath
, u_errorName(err
));
121 fprintf(stderr
, "%s: warning: messages will not be displayed\n", pname
);
127 /* Mapping of callback names to the callbacks passed to the converter
// transcode_callbacks: table of named callback records.
// Each record pairs a from-Unicode callback and its context (fromu, fromuctxt)
// with a to-Unicode callback and its context (tou, and a context value).
// findCallback() searches this table by name and usage() prints every name.
// NOTE(review): two adjacent records below both carry UCNV_ESCAPE_XML_HEX;
// their names are not visible in this excerpt, presumably they are aliases.
130 static struct callback_ent
{
132 UConverterFromUCallback fromu
;
133 const void *fromuctxt
;
134 UConverterToUCallback tou
;
136 } transcode_callbacks
[] = {
138 UCNV_FROM_U_CALLBACK_SUBSTITUTE
, 0,
139 UCNV_TO_U_CALLBACK_SUBSTITUTE
, 0 },
141 UCNV_FROM_U_CALLBACK_SKIP
, 0,
142 UCNV_TO_U_CALLBACK_SKIP
, 0 },
144 UCNV_FROM_U_CALLBACK_STOP
, 0,
145 UCNV_TO_U_CALLBACK_STOP
, 0 },
147 UCNV_FROM_U_CALLBACK_ESCAPE
, 0,
148 UCNV_TO_U_CALLBACK_ESCAPE
, 0},
150 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_ICU
,
151 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_ICU
},
153 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_JAVA
,
154 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_JAVA
},
156 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_C
,
157 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_C
},
159 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
,
160 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
},
162 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
,
163 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_HEX
},
165 UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_DEC
,
166 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_XML_DEC
},
167 { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_UNICODE
,
168 UCNV_TO_U_CALLBACK_ESCAPE
, UCNV_ESCAPE_UNICODE
}
171 /* Return a pointer to a callback record given its name. */
// The table transcode_callbacks is scanned front to back; the element count is
// derived from sizeof(transcode_callbacks) / sizeof(*transcode_callbacks).
// Names are compared with uprv_stricmp() and the address of the first
// matching record is returned.
// The result for an unknown name is not visible in this excerpt.
173 static const struct callback_ent
*findCallback(const char *name
) {
175 sizeof(transcode_callbacks
) / sizeof(*transcode_callbacks
);
177 /* We'll do a linear search, there aren't many of them and bsearch()
178 may not be that portable. */
180 for (i
= 0; i
< count
; ++i
) {
181 if (!uprv_stricmp(name
, transcode_callbacks
[i
].name
)) {
182 return &transcode_callbacks
[i
];
189 /* Print converter information. If lookfor is set, only that converter will
190 be printed, otherwise all converters will be printed. If canon is non
191 zero, tags and aliases for each converter are printed too, in the format
192 expected for convrtrs.txt(5). */
// Visible flow of the function:
//  - a given lookfor name is printed directly, or first normalized through
//    ucnv_getAlias(lookfor, 0, ...) for canonical output
//  - the standards are read with ucnv_countStandards() and ucnv_getStandard()
//    into the uprv_malloc-ed array stds
//  - for each converter name, every alias from ucnv_getAlias() is printed and
//    the standards are searched with ucnv_openStandardNames() and uenum_next();
//    a matching standard is printed, followed by * when the alias is the first
//    name listed for that standard (isFirst)
// Errors are reported with u_wmsg() on stderr.
// Returns int; main() maps a non-zero result to exit status 2.
194 static int printConverters(const char *pname
, const char *lookfor
,
197 UErrorCode err
= U_ZERO_ERROR
;
202 /* If there is a specified name, just handle that now. */
206 printf("%s\n", lookfor
);
209 /* Because we are printing a canonical name, we need the
210 true converter name. We've done that already except for
211 the default name (because we want to print the exact
212 name one would get when calling ucnv_getDefaultName()
213 in non-canon mode). But since we do not know at this
214 point if we have the default name or something else, we
215 need to normalize again to the canonical converter
218 const char *truename
= ucnv_getAlias(lookfor
, 0, &err
);
219 if (U_SUCCESS(err
)) {
227 /* Print converter names. We come here for one of two reasons: we
228 are printing all the names (lookfor was null), or we have a
229 single converter to print but in canon mode, hence we need to
230 get to it in order to print everything. */
232 num
= ucnv_countAvailable();
235 u_wmsg(stderr
, "cantGetNames");
239 num
= 1; /* We know where we want to be. */
242 num_stds
= ucnv_countStandards();
243 stds
= (const char **) uprv_malloc(num_stds
* sizeof(*stds
));
245 u_wmsg(stderr
, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR
));
253 for (s
= 0; s
< num_stds
; ++s
) {
254 stds
[s
] = ucnv_getStandard(s
, &err
);
256 printf("%s ", stds
[s
]);
258 if (U_FAILURE(err
)) {
259 u_wmsg(stderr
, "cantGetTag", u_wmsg_errorName(err
));
268 for (int32_t i
= 0; i
< num
; i
++) {
270 uint16_t num_aliases
;
272 /* Set the name either to what we are looking for, or
273 to the current converter name. */
278 name
= ucnv_getAvailableName(i
);
281 /* Get all the aliases associated to the name. */
284 num_aliases
= ucnv_countAliases(name
, &err
);
285 if (U_FAILURE(err
)) {
288 UnicodeString
str(name
, "");
290 u_wmsg(stderr
, "cantGetAliases", str
.getTerminatedBuffer(),
291 u_wmsg_errorName(err
));
296 /* Write all the aliases and their tags. */
298 for (a
= 0; a
< num_aliases
; ++a
) {
299 const char *alias
= ucnv_getAlias(name
, a
, &err
);
301 if (U_FAILURE(err
)) {
302 UnicodeString
str(name
, "");
304 u_wmsg(stderr
, "cantGetAliases", str
.getTerminatedBuffer(),
305 u_wmsg_errorName(err
));
309 /* Print the current alias so that it looks right. */
310 printf("%s%s%s", (canon
? (a
== 0? "" : "\t" ) : "") ,
314 /* Look (slowly, linear searching) for a tag. */
317 /* -1 to skip the last standard */
318 for (s
= t
= 0; s
< num_stds
-1; ++s
) {
319 UEnumeration
*nameEnum
= ucnv_openStandardNames(name
, stds
[s
], &err
);
320 if (U_SUCCESS(err
)) {
321 /* List the standard tags */
322 const char *standardName
;
323 UBool isFirst
= TRUE
;
324 UErrorCode enumError
= U_ZERO_ERROR
;
325 while ((standardName
= uenum_next(nameEnum
, NULL
, &enumError
))) {
326 /* See if this alias is supported by this standard. */
327 if (!strcmp(standardName
, alias
)) {
332 /* Print a * after the default standard name */
333 printf(" %s%s", stds
[s
], (isFirst
? "*" : ""));
343 /* Terminate this entry. */
350 /* Terminate this entry. */
357 /* Free temporary data. */
366 /* Print all available transliterators. If canon is non zero, print
367 one transliterator per line. */
// When UCONFIG_NO_TRANSLITERATION is set, only a notice is printed.
// Otherwise the IDs for indices 0 .. utrans_countAvailableIDs() - 1 are fetched
// with utrans_getAvailableID() and separated by sepchar: a newline when canon
// is non-zero, a space otherwise.
// The ID buffer starts as a uprv_malloc-ed block; when an ID does not fit
// (len >= buflen - 1) it is reallocated, and staticbuf serves as the fallback
// buffer, in which case the name is truncated with three dots.
// Returns int; main() maps a non-zero result to exit status 3.
369 static int printTransliterators(UBool canon
)
371 #if UCONFIG_NO_TRANSLITERATION
372 printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
375 int32_t numtrans
= utrans_countAvailableIDs(), i
;
377 char *buf
= (char *) uprv_malloc(buflen
);
380 char sepchar
= canon
? '\n' : ' ';
384 buflen
= sizeof(staticbuf
);
387 for (i
= 0; i
< numtrans
; ++i
) {
388 int32_t len
= utrans_getAvailableID(i
, buf
, buflen
);
389 if (len
>= buflen
- 1) {
390 if (buf
!= staticbuf
) {
// NOTE(review): the uprv_realloc result is assigned straight back to buf; what
// happens to the old block when the call fails is not visible in this excerpt.
395 buf
= (char *) uprv_realloc(buf
, buflen
);
398 buflen
= sizeof(staticbuf
);
401 utrans_getAvailableID(i
, buf
, buflen
);
403 uprv_strcpy(buf
+ buflen
- 4, "..."); /* Truncate the name. */
408 if (i
< numtrans
- 1) {
413 /* Add a terminating newline if needed. */
415 if (sepchar
!= '\n') {
419 /* Free temporary data. */
421 if (buf
!= staticbuf
) {
// Named code point constants used by the chunking and signature code:
// the paragraph-end characters tested in getChunkLimit() and the U+FEFF
// signature character tested in cnvSigType() and convertFile().
433 uCR
= 0xd, // carriage return
434 uLF
= 0xa, // line feed
435 uNL
= 0x85, // newline
436 uLS
= 0x2028, // line separator
437 uPS
= 0x2029, // paragraph separator
438 uSig
= 0xfeff // signature/BOM character
// getChunkLimit: decides where the chunk being collected for transformation
// ends inside s, given the text prev collected so far.
// Return values visible below:
//   1   prev ends with CR and s starts with LF (a CRLF split across buffers)
//   0   prev ends with CR and s is non-empty but does not start with LF
//   -1  more input is needed (no paragraph end yet, or a trailing CR whose LF
//       may still arrive)
//   n   offset just past the paragraph end found in s
441 static inline int32_t
442 getChunkLimit(const UnicodeString
&prev
, const UnicodeString
&s
) {
444 // CR, LF, CRLF, NL, LS, PS
445 // for paragraph ends (see UAX #13/Unicode 4)
446 // and include it in the chunk
447 // all of these characters are on the BMP
448 // do not include FF or VT in case they are part of a paragraph
449 // (important for bidi contexts)
450 static const UChar paraEnds
[] = {
451 0xd, 0xa, 0x85, 0x2028, 0x2029
454 iCR
, iLF
, iNL
, iLS
, iPS
, iCount
457 // first, see if there is a CRLF split between prev and s
458 if (prev
.endsWith(paraEnds
+ iCR
, 1)) {
459 if (s
.startsWith(paraEnds
+ iLF
, 1)) {
460 return 1; // split CRLF, include the LF
461 } else if (!s
.isEmpty()) {
462 return 0; // complete the last chunk
464 return -1; // wait for actual further contents to arrive
468 const UChar
*u
= s
.getBuffer(), *limit
= u
+ s
.length();
474 ((c
< uSP
) && (c
== uCR
|| c
== uLF
)) ||
481 return -1; // LF may be in the next chunk
482 } else if (*u
== uLF
) {
483 ++u
; // include the LF in this chunk
486 return (int32_t)(u
- s
.getBuffer());
490 return -1; // continue collecting the chunk
// Result values of cnvSigType(): how a converter relates to the U+FEFF
// signature character.
494 CNV_NO_FEFF
, // cannot convert the U+FEFF Unicode signature character (BOM)
495 CNV_WITH_FEFF
, // can convert the U+FEFF signature character
496 CNV_ADDS_FEFF
// automatically adds/detects the U+FEFF signature character
// nibbleToHex: turns a 4-bit value into a hexadecimal digit UChar for the
// error messages built in convertFile().
// The visible expression maps values from 10 upward onto lowercase letters
// starting at 0x61; the branch for the digit range is not visible in this
// excerpt.
500 nibbleToHex(uint8_t n
) {
505 (UChar
)((0x61 - 10) + n
);
508 // check the converter's Unicode signature properties;
509 // the fromUnicode side of the converter must be in its initial state
510 // and will be reset again if it was used
// Steps visible below:
//  1. ucnv_getUnicodeSet() with UCNV_ROUNDTRIP_SET tells whether U+FEFF (uSig)
//     can be converted: CNV_WITH_FEFF if so, CNV_NO_FEFF on error or absence.
//  2. For CNV_WITH_FEFF, a single letter a is converted with ucnv_fromUnicode()
//     and the output is checked with ucnv_detectUnicodeSignature(); a detected
//     signature upgrades the result to CNV_ADDS_FEFF.
//     ucnv_resetFromUnicode() restores the converter state afterwards.
512 cnvSigType(UConverter
*cnv
) {
516 // test if the output charset can convert U+FEFF
517 USet
*set
= uset_open(1, 0);
519 ucnv_getUnicodeSet(cnv
, set
, UCNV_ROUNDTRIP_SET
, &err
);
520 if (U_SUCCESS(err
) && uset_contains(set
, uSig
)) {
521 result
= CNV_WITH_FEFF
;
523 result
= CNV_NO_FEFF
; // an error occurred or U+FEFF cannot be converted
527 if (result
== CNV_WITH_FEFF
) {
528 // test if the output charset emits a signature anyway
529 const UChar a
[1] = { 0x61 }; // "a"
538 ucnv_fromUnicode(cnv
,
539 &out
, buffer
+ sizeof(buffer
),
542 ucnv_resetFromUnicode(cnv
);
544 if (NULL
!= ucnv_detectUnicodeSignature(buffer
, (int32_t)(out
- buffer
), NULL
, &err
) &&
547 result
= CNV_ADDS_FEFF
;
// ConvertFile: holds the working buffers shared by all conversions of one run.
//  - the constructor starts with null buffers, bufsz 0 and signature 0
//  - setBufferSize() allocates one char array of 2 * bufsz bytes; buf is its
//    first half and outbuf its second half; fromoffsets gets bufsz + 1 entries
//  - convertFile() performs the conversion of a single input
//  - main() is a friend, so it can reach the members directly
557 buf(NULL
), outbuf(NULL
), fromoffsets(NULL
),
558 bufsz(0), signature(0) {}
561 setBufferSize(size_t bufferSize
) {
564 buf
= new char[2 * bufsz
];
565 outbuf
= buf
+ bufsz
;
567 // +1 for an added U+FEFF in the intermediate Unicode buffer
568 fromoffsets
= new int32_t[bufsz
+ 1];
573 delete [] fromoffsets
;
576 UBool
convertFile(const char *pname
,
577 const char *fromcpage
,
578 UConverterToUCallback toucallback
,
581 UConverterFromUCallback fromucallback
,
582 const void *fromuctxt
,
584 const char *translit
,
585 const char *infilestr
,
586 FILE * outfile
, int verbose
);
588 friend int main(int argc
, char **argv
);
591 int32_t *fromoffsets
;
594 int8_t signature
; // add (1) or remove (-1) a U+FEFF Unicode signature character
597 // Convert a file from one encoding to another
// The input is infilestr, or stdin when infilestr is null or a single dash;
// the result is written to outfile.
// Steps visible below:
//  - open the input with fopen() in rb mode; where USE_FILENO_BINARY_MODE is
//    defined, stdin is switched to binary mode instead
//  - optionally create a Transliterator from translit, from rules or by ID
//  - open convfrom and convto with ucnv_open(), then install the to-Unicode
//    callback, the from-Unicode callback and the fallback setting
//  - loop: fread() into buf, ucnv_toUnicode() into the UnicodeString u,
//    optional removal or insertion of a U+FEFF signature, optional chunk-wise
//    transliteration, ucnv_fromUnicode() into outbuf, fwrite() to outfile
//  - conversion and I/O problems are reported through u_wmsg() on stderr
// The three nested do-while loops end on toSawEndOfUnicode, fromSawEndOfBytes
// and flush respectively.
// Declared to return UBool; the returned value is not visible in this excerpt.
599 ConvertFile::convertFile(const char *pname
,
600 const char *fromcpage
,
601 UConverterToUCallback toucallback
,
604 UConverterFromUCallback fromucallback
,
605 const void *fromuctxt
,
607 const char *translit
,
608 const char *infilestr
,
609 FILE * outfile
, int verbose
)
613 UConverter
*convfrom
= 0;
614 UConverter
*convto
= 0;
615 UErrorCode err
= U_ZERO_ERROR
;
617 const char *cbufp
, *prevbufp
;
620 uint32_t infoffset
= 0, outfoffset
= 0; /* Where we are in the file, for error reporting. */
622 const UChar
*unibuf
, *unibufbp
;
627 #if !UCONFIG_NO_TRANSLITERATION
628 Transliterator
*t
= 0; // Transliterator acting on Unicode data.
629 UnicodeString chunk
; // One chunk of the text being collected for transformation.
631 UnicodeString u
; // String to do the transliteration.
634 // use conversion offsets for error messages
635 // unless a transliterator is used -
636 // a text transformation will reorder characters in unpredictable ways
637 UBool useOffsets
= TRUE
;
639 // Open the correct input file or connect to stdin for reading input
641 if (infilestr
!= 0 && strcmp(infilestr
, "-")) {
642 infile
= fopen(infilestr
, "rb");
644 UnicodeString
str1(infilestr
, "");
645 str1
.append((UChar32
) 0);
646 UnicodeString
str2(strerror(errno
), "");
647 str2
.append((UChar32
) 0);
649 u_wmsg(stderr
, "cantOpenInputF", str1
.getBuffer(), str2
.getBuffer());
655 #ifdef USE_FILENO_BINARY_MODE
656 if (setmode(fileno(stdin
), O_BINARY
) == -1) {
658 u_wmsg(stderr
, "cantSetInBinMode");
665 fprintf(stderr
, "%s:\n", infilestr
);
668 #if !UCONFIG_NO_TRANSLITERATION
669 // Create transliterator as needed.
671 if (translit
!= NULL
&& *translit
) {
673 UnicodeString
str(translit
), pestr
;
675 /* Create from rules or by ID as needed. */
// NOTE(review): the greater-than character is tested twice in the condition
// below; the second test is redundant.
679 if (uprv_strchr(translit
, ':') || uprv_strchr(translit
, '>') || uprv_strchr(translit
, '<') || uprv_strchr(translit
, '>')) {
680 t
= Transliterator::createFromRules("Uconv", str
, UTRANS_FORWARD
, parse
, err
);
682 t
= Transliterator::createInstance(translit
, UTRANS_FORWARD
, err
);
685 if (U_FAILURE(err
)) {
686 str
.append((UChar32
) 0);
689 if (parse
.line
>= 0) {
690 UChar linebuf
[20], offsetbuf
[20];
691 uprv_itou(linebuf
, 20, parse
.line
, 10, 0);
692 uprv_itou(offsetbuf
, 20, parse
.offset
, 10, 0);
693 u_wmsg(stderr
, "cantCreateTranslitParseErr", str
.getTerminatedBuffer(),
694 u_wmsg_errorName(err
), linebuf
, offsetbuf
);
696 u_wmsg(stderr
, "cantCreateTranslit", str
.getTerminatedBuffer(),
697 u_wmsg_errorName(err
));
711 // Create codepage converter. If the codepage or its aliases weren't
712 // available, it returns NULL and a failure code. We also set the
713 // callbacks, and return errors in the same way.
715 convfrom
= ucnv_open(fromcpage
, &err
);
716 if (U_FAILURE(err
)) {
717 UnicodeString
str(fromcpage
, "");
719 u_wmsg(stderr
, "cantOpenFromCodeset", str
.getTerminatedBuffer(),
720 u_wmsg_errorName(err
));
723 ucnv_setToUCallBack(convfrom
, toucallback
, touctxt
, 0, 0, &err
);
724 if (U_FAILURE(err
)) {
726 u_wmsg(stderr
, "cantSetCallback", u_wmsg_errorName(err
));
730 convto
= ucnv_open(tocpage
, &err
);
731 if (U_FAILURE(err
)) {
732 UnicodeString
str(tocpage
, "");
734 u_wmsg(stderr
, "cantOpenToCodeset", str
.getTerminatedBuffer(),
735 u_wmsg_errorName(err
));
738 ucnv_setFromUCallBack(convto
, fromucallback
, fromuctxt
, 0, 0, &err
);
739 if (U_FAILURE(err
)) {
741 u_wmsg(stderr
, "cantSetCallback", u_wmsg_errorName(err
));
744 ucnv_setFallback(convto
, fallback
);
746 UBool willexit
, fromSawEndOfBytes
, toSawEndOfUnicode
;
749 // OK, we can convert now.
756 // input file offset at the beginning of the next buffer
759 rd
= fread(buf
, 1, bufsz
, infile
);
760 if (ferror(infile
) != 0) {
761 UnicodeString
str(strerror(errno
));
763 u_wmsg(stderr
, "cantRead", str
.getTerminatedBuffer());
767 // Convert the read buffer into the new encoding via Unicode.
768 // After the call 'unibufp' will be placed behind the last
769 // character that was converted in the 'unibuf'.
770 // Also the 'cbufp' is positioned behind the last converted
772 // At the last conversion in the file, flush should be set to
773 // true so that we get all characters converted.
775 // The converter must be flushed at the end of conversion so
776 // that characters on hold also will be written.
779 flush
= (UBool
)(rd
!= bufsz
);
781 // convert until the input is consumed
783 // remember the start of the current byte-to-Unicode conversion
786 unibuf
= unibufp
= u
.getBuffer((int32_t)bufsz
);
788 // Use bufsz instead of u.getCapacity() for the targetLimit
789 // so that we don't overflow fromoffsets[].
790 ucnv_toUnicode(convfrom
, &unibufp
, unibuf
+ bufsz
, &cbufp
,
791 buf
+ rd
, useOffsets
? fromoffsets
: NULL
, flush
, &err
);
793 ulen
= (int32_t)(unibufp
- unibuf
);
794 u
.releaseBuffer(U_SUCCESS(err
) ? ulen
: 0);
796 // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
797 // converting all of the input bytes.
798 // It works like this because ucnv_toUnicode() returns only under the
799 // following conditions:
800 // - an error occurred during conversion (an error code is set)
801 // - the target buffer is filled (the error code indicates an overflow)
802 // - the source is consumed
803 // That is, if the error code does not indicate a failure,
804 // not even an overflow, then the source must be consumed entirely.
805 fromSawEndOfBytes
= (UBool
)U_SUCCESS(err
);
807 if (err
== U_BUFFER_OVERFLOW_ERROR
) {
809 } else if (U_FAILURE(err
)) {
810 char pos
[32], errorBytes
[32];
811 int8_t i
, length
, errorLength
;
813 UErrorCode localError
= U_ZERO_ERROR
;
814 errorLength
= (int8_t)sizeof(errorBytes
);
815 ucnv_getInvalidChars(convfrom
, errorBytes
, &errorLength
, &localError
);
816 if (U_FAILURE(localError
) || errorLength
== 0) {
820 // print the input file offset of the start of the error bytes:
821 // input file offset of the current byte buffer +
822 // length of the just consumed bytes -
823 // length of the error bytes
825 (int8_t)sprintf(pos
, "%d",
826 (int)(infoffset
+ (cbufp
- buf
) - errorLength
));
828 // output the bytes that caused the error
830 for (i
= 0; i
< errorLength
; ++i
) {
832 str
.append((UChar
)uSP
);
834 str
.append(nibbleToHex((uint8_t)errorBytes
[i
] >> 4));
835 str
.append(nibbleToHex((uint8_t)errorBytes
[i
]));
839 u_wmsg(stderr
, "problemCvtToU",
840 UnicodeString(pos
, length
, "").getTerminatedBuffer(),
841 str
.getTerminatedBuffer(),
842 u_wmsg_errorName(err
));
845 err
= U_ZERO_ERROR
; /* reset the error for the rest of the conversion. */
848 // Replaced a check for whether the input was consumed by
849 // looping until it is; message key "premEndInput" now obsolete.
855 // remove a U+FEFF Unicode signature character if requested
857 if (u
.charAt(0) == uSig
) {
860 // account for the removed UChar and offset
864 // remove an offset from fromoffsets[] as well
865 // to keep the array parallel with the UChars
// NOTE(review): the byte count ulen * 4 presumably stands for
// ulen * sizeof(int32_t), the element type of fromoffsets.
866 memmove(fromoffsets
, fromoffsets
+ 1, ulen
* 4);
873 #if !UCONFIG_NO_TRANSLITERATION
874 // Transliterate/transform if needed.
876 // For transformation, we use chunking code -
877 // collect Unicode input until, for example, an end-of-line,
878 // then transform and output-convert that and continue collecting.
879 // This makes the transformation result independent of the buffer size
880 // while avoiding the slower keyboard mode.
881 // The end-of-chunk characters are completely included in the
882 // transformed string in case they are to be transformed themselves.
888 chunkLimit
= getChunkLimit(chunk
, u
);
889 if (chunkLimit
< 0 && flush
&& fromSawEndOfBytes
) {
890 // use all of the rest at the end of the text
891 chunkLimit
= u
.length();
893 if (chunkLimit
>= 0) {
894 // complete the chunk and transform it
895 chunk
.append(u
, 0, chunkLimit
);
896 u
.remove(0, chunkLimit
);
897 t
->transliterate(chunk
);
899 // append the transformation result to the result and empty the chunk
903 // continue collecting the chunk
907 } while (!u
.isEmpty());
914 // add a U+FEFF Unicode signature character if requested
915 // and possible/necessary
917 if (u
.charAt(0) != uSig
&& cnvSigType(convto
) == CNV_WITH_FEFF
) {
918 u
.insert(0, (UChar
)uSig
);
921 // insert a pseudo-offset into fromoffsets[] as well
922 // to keep the array parallel with the UChars
923 memmove(fromoffsets
+ 1, fromoffsets
, ulen
* 4);
927 // account for the additional UChar and offset
933 // Convert the Unicode buffer into the destination codepage
934 // Again 'bufp' will be placed behind the last converted character
935 // And 'unibufp' will be placed behind the last converted unicode character
936 // At the last conversion flush should be set to true to ensure that
937 // all characters left get converted
939 unibuf
= unibufbp
= u
.getBuffer();
944 // Use fromSawEndOfBytes in addition to the flush flag -
945 // it indicates whether the intermediate Unicode string
946 // contains the very last UChars for the very last input bytes.
947 ucnv_fromUnicode(convto
, &bufp
, outbuf
+ bufsz
,
950 NULL
, (UBool
)(flush
&& fromSawEndOfBytes
), &err
);
952 // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
953 // converting all of the intermediate UChars.
954 // See comment for fromSawEndOfBytes.
955 toSawEndOfUnicode
= (UBool
)U_SUCCESS(err
);
957 if (err
== U_BUFFER_OVERFLOW_ERROR
) {
959 } else if (U_FAILURE(err
)) {
960 UChar errorUChars
[4];
964 int8_t i
, length
, errorLength
;
966 UErrorCode localError
= U_ZERO_ERROR
;
967 errorLength
= (int8_t)LENGTHOF(errorUChars
);
968 ucnv_getInvalidUChars(convto
, errorUChars
, &errorLength
, &localError
);
969 if (U_FAILURE(localError
) || errorLength
== 0) {
970 // need at least 1 so that we don't access beyond the length of fromoffsets[]
977 // Unicode buffer offset of the start of the error UChars
978 ferroffset
= (int32_t)((unibufbp
- unibuf
) - errorLength
);
979 if (ferroffset
< 0) {
980 // approximation - the character started in the previous Unicode buffer
984 // get the corresponding byte offset out of fromoffsets[]
985 // go back if the offset is not known for some of the UChars
988 fromoffset
= fromoffsets
[ferroffset
];
989 } while (fromoffset
< 0 && --ferroffset
>= 0);
991 // total input file offset =
992 // input file offset of the current byte buffer +
993 // byte buffer offset of where the current Unicode buffer is converted from +
994 // fromoffsets[Unicode offset]
995 ferroffset
= infoffset
+ (prevbufp
- buf
) + fromoffset
;
996 errtag
= "problemCvtFromU";
998 // Do not use fromoffsets if (t != NULL) because the Unicode text may
999 // be different from what the offsets refer to.
1001 // output file offset
1002 ferroffset
= (int32_t)(outfoffset
+ (bufp
- outbuf
));
1003 errtag
= "problemCvtFromUOut";
// NOTE(review): the u conversion specifier below is paired with an (int)
// argument, whereas the to-Unicode error path above uses d for the same cast.
1006 length
= (int8_t)sprintf(pos
, "%u", (int)ferroffset
);
1008 // output the code points that caused the error
1010 for (i
= 0; i
< errorLength
;) {
1012 str
.append((UChar
)uSP
);
1014 U16_NEXT(errorUChars
, i
, errorLength
, c
);
1015 if (c
>= 0x100000) {
1016 str
.append(nibbleToHex((uint8_t)(c
>> 20)));
1019 str
.append(nibbleToHex((uint8_t)(c
>> 16)));
1021 str
.append(nibbleToHex((uint8_t)(c
>> 12)));
1022 str
.append(nibbleToHex((uint8_t)(c
>> 8)));
1023 str
.append(nibbleToHex((uint8_t)(c
>> 4)));
1024 str
.append(nibbleToHex((uint8_t)c
));
1028 u_wmsg(stderr
, errtag
,
1029 UnicodeString(pos
, length
, "").getTerminatedBuffer(),
1030 str
.getTerminatedBuffer(),
1031 u_wmsg_errorName(err
));
1032 u_wmsg(stderr
, "errorUnicode", str
.getTerminatedBuffer());
1035 err
= U_ZERO_ERROR
; /* reset the error for the rest of the conversion. */
1038 // Replaced a check for whether the intermediate Unicode characters were all consumed by
1039 // looping until they are; message key "premEnd" now obsolete.
1041 // Finally, write the converted buffer to the output file
1042 size_t outlen
= (size_t) (bufp
- outbuf
);
1043 outfoffset
+= (int32_t)(wr
= fwrite(outbuf
, 1, outlen
, outfile
));
1045 UnicodeString
str(strerror(errno
));
1047 u_wmsg(stderr
, "cantWrite", str
.getTerminatedBuffer());
1054 } while (!toSawEndOfUnicode
);
1055 } while (!fromSawEndOfBytes
);
1056 } while (!flush
); // Stop when we have flushed the
1057 // converters (this means that it's
1058 // the end of output)
1068 ucnv_close(convfrom
);
1071 #if !UCONFIG_NO_TRANSLITERATION
1075 if (infile
!= stdin
) {
// usage: prints the usage and help messages for the program named pname.
// Output goes to stderr when ecode is non-zero and to stdout otherwise; the
// same test selects the lcUsageWord or ucUsageWord resource from gBundle.
// When the help message is printed successfully (u_wmsg returns zero), the
// names of all entries in transcode_callbacks are appended.
1082 static void usage(const char *pname
, int ecode
) {
1085 UErrorCode err
= U_ZERO_ERROR
;
1086 FILE *fp
= ecode
? stderr
: stdout
;
1091 ures_getStringByKey(gBundle
, ecode
? "lcUsageWord" : "ucUsageWord",
1093 UnicodeString
upname(pname
, (int32_t)(uprv_strlen(pname
) + 1));
1094 UnicodeString
mname(msg
, msgLen
+ 1);
1096 res
= u_wmsg(fp
, "usage", mname
.getBuffer(), upname
.getBuffer());
1101 if (!u_wmsg(fp
, "help")) {
1102 /* Now dump callbacks and finish. */
1105 sizeof(transcode_callbacks
) / sizeof(*transcode_callbacks
);
1106 for (i
= 0; i
< count
; ++i
) {
1107 fprintf(fp
, " %s", transcode_callbacks
[i
].name
);
// main: command-line entry point.
// One pass over argv compares each argument against the option strings below:
// code pages (-f, -t), transliterator (-x), fallback switches, block size (-b),
// listing modes (-l, --default-code, --list-code, --canon, -L), help, callbacks
// (-c, -i, --to-callback, --from-callback, --callback), verbosity, version,
// output file (-o) and signature switches.
// Arguments that are not options are moved to the front of argv, between
// remainArgv and remainArgvLimit.
// Listing modes return early: 2 when printConverters() returns non-zero,
// 3 when printTransliterators() returns non-zero, 0 otherwise.
// For a conversion run, a missing or single-dash code page is replaced by
// ucnv_getDefaultName(), the output file is opened in wb mode unless it is
// absent or a single dash, and cf.convertFile() is called once per remaining
// argument, or once with a null input name when there is none.
1117 main(int argc
, char **argv
)
1122 size_t bufsz
= DEFAULT_BUFSZ
;
1124 const char *fromcpage
= 0;
1125 const char *tocpage
= 0;
1126 const char *translit
= 0;
1127 const char *outfilestr
= 0;
1128 UBool fallback
= FALSE
;
1130 UConverterFromUCallback fromucallback
= UCNV_FROM_U_CALLBACK_STOP
;
1131 const void *fromuctxt
= 0;
1132 UConverterToUCallback toucallback
= UCNV_TO_U_CALLBACK_STOP
;
1133 const void *touctxt
= 0;
1135 char **iter
, **remainArgv
, **remainArgvLimit
;
1136 char **end
= argv
+ argc
;
1140 UBool printConvs
= FALSE
, printCanon
= FALSE
, printTranslits
= FALSE
;
1141 const char *printName
= 0;
1143 UBool verbose
= FALSE
;
1144 UErrorCode status
= U_ZERO_ERROR
;
1148 /* Initialize ICU */
1150 if (U_FAILURE(status
)) {
1151 fprintf(stderr
, "%s: can not initialize ICU. status = %s\n",
1152 argv
[0], u_errorName(status
));
1156 // Get and prettify pname.
1157 pname
= uprv_strrchr(*argv
, U_FILE_SEP_CHAR
);
1160 pname
= uprv_strrchr(*argv
, '/');
1169 // First, get the arguments from command-line
1170 // to know the codepages to convert between
1172 remainArgv
= remainArgvLimit
= argv
+ 1;
1173 for (iter
= argv
+ 1; iter
!= end
; iter
++) {
1174 // Check for from charset
1175 if (strcmp("-f", *iter
) == 0 || !strcmp("--from-code", *iter
)) {
1181 } else if (strcmp("-t", *iter
) == 0 || !strcmp("--to-code", *iter
)) {
1187 } else if (strcmp("-x", *iter
) == 0) {
1193 } else if (!strcmp("--fallback", *iter
)) {
1195 } else if (!strcmp("--no-fallback", *iter
)) {
1197 } else if (strcmp("-b", *iter
) == 0 || !strcmp("--block-size", *iter
)) {
// NOTE(review): atoi() gives no error indication; the check that follows
// rejects results that are zero or negative when viewed as int.
1200 bufsz
= atoi(*iter
);
1201 if ((int) bufsz
<= 0) {
1203 UnicodeString
str(*iter
);
1205 u_wmsg(stderr
, "badBlockSize", str
.getTerminatedBuffer());
1211 } else if (strcmp("-l", *iter
) == 0 || !strcmp("--list", *iter
)) {
1212 if (printTranslits
) {
1216 } else if (strcmp("--default-code", *iter
) == 0) {
1217 if (printTranslits
) {
1220 printName
= ucnv_getDefaultName();
1221 } else if (strcmp("--list-code", *iter
) == 0) {
1222 if (printTranslits
) {
1228 UErrorCode e
= U_ZERO_ERROR
;
1229 printName
= ucnv_getAlias(*iter
, 0, &e
);
1230 if (U_FAILURE(e
) || !printName
) {
1231 UnicodeString
str(*iter
);
1233 u_wmsg(stderr
, "noSuchCodeset", str
.getTerminatedBuffer());
1238 } else if (strcmp("--canon", *iter
) == 0) {
1240 } else if (strcmp("-L", *iter
) == 0
1241 || !strcmp("--list-transliterators", *iter
)) {
1245 printTranslits
= TRUE
;
1246 } else if (strcmp("-h", *iter
) == 0 || !strcmp("-?", *iter
)
1247 || !strcmp("--help", *iter
)) {
1249 } else if (!strcmp("-c", *iter
)) {
1250 fromucallback
= UCNV_FROM_U_CALLBACK_SKIP
;
1251 } else if (!strcmp("--to-callback", *iter
)) {
1254 const struct callback_ent
*cbe
= findCallback(*iter
);
1256 fromucallback
= cbe
->fromu
;
1257 fromuctxt
= cbe
->fromuctxt
;
1259 UnicodeString
str(*iter
);
1261 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1267 } else if (!strcmp("--from-callback", *iter
)) {
1270 const struct callback_ent
*cbe
= findCallback(*iter
);
1272 toucallback
= cbe
->tou
;
1273 touctxt
= cbe
->touctxt
;
1275 UnicodeString
str(*iter
);
1277 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1283 } else if (!strcmp("-i", *iter
)) {
1284 toucallback
= UCNV_TO_U_CALLBACK_SKIP
;
1285 } else if (!strcmp("--callback", *iter
)) {
1288 const struct callback_ent
*cbe
= findCallback(*iter
);
1290 fromucallback
= cbe
->fromu
;
1291 fromuctxt
= cbe
->fromuctxt
;
1292 toucallback
= cbe
->tou
;
1293 touctxt
= cbe
->touctxt
;
1295 UnicodeString
str(*iter
);
1297 u_wmsg(stderr
, "unknownCallback", str
.getTerminatedBuffer());
1303 } else if (!strcmp("-s", *iter
) || !strcmp("--silent", *iter
)) {
1305 } else if (!strcmp("-v", *iter
) || !strcmp("--verbose", *iter
)) {
1307 } else if (!strcmp("-V", *iter
) || !strcmp("--version", *iter
)) {
1308 printf("%s v2.1 ICU " U_ICU_VERSION
"\n", pname
);
1310 } else if (!strcmp("-o", *iter
) || !strcmp("--output", *iter
)) {
1312 if (iter
!= end
&& !outfilestr
) {
1317 } else if (0 == strcmp("--add-signature", *iter
)) {
1319 } else if (0 == strcmp("--remove-signature", *iter
)) {
1321 } else if (**iter
== '-' && (*iter
)[1]) {
1324 // move a non-option up in argv[]
1325 *remainArgvLimit
++ = *iter
;
1329 if (printConvs
|| printName
) {
1330 return printConverters(pname
, printName
, printCanon
) ? 2 : 0;
1331 } else if (printTranslits
) {
1332 return printTransliterators(printCanon
) ? 3 : 0;
1335 if (!fromcpage
|| !uprv_strcmp(fromcpage
, "-")) {
1336 fromcpage
= ucnv_getDefaultName();
1338 if (!tocpage
|| !uprv_strcmp(tocpage
, "-")) {
1339 tocpage
= ucnv_getDefaultName();
1342 // Open the correct output file or connect to stdout for writing output
1343 if (outfilestr
!= 0 && strcmp(outfilestr
, "-")) {
1344 outfile
= fopen(outfilestr
, "wb");
1346 UnicodeString
str1(outfilestr
, "");
1347 UnicodeString
str2(strerror(errno
), "");
1349 u_wmsg(stderr
, "cantCreateOutputF",
1350 str1
.getBuffer(), str2
.getBuffer());
1356 #ifdef USE_FILENO_BINARY_MODE
1357 if (setmode(fileno(outfile
), O_BINARY
) == -1) {
1358 u_wmsg(stderr
, "cantSetOutBinMode");
1364 /* Loop again on the arguments to find all the input files, and
1367 cf
.setBufferSize(bufsz
);
1369 if(remainArgv
< remainArgvLimit
) {
1370 for (iter
= remainArgv
; iter
!= remainArgvLimit
; iter
++) {
1371 if (!cf
.convertFile(
1372 pname
, fromcpage
, toucallback
, touctxt
, tocpage
,
1373 fromucallback
, fromuctxt
, fallback
, translit
, *iter
,
1380 if (!cf
.convertFile(
1381 pname
, fromcpage
, toucallback
, touctxt
, tocpage
,
1382 fromucallback
, fromuctxt
, fallback
, translit
, 0,
1394 if (outfile
!= stdout
) {
1403 * Hey, Emacs, please set the following:
1406 * indent-tabs-mode: nil