/*****************************************************************************
*
-* Copyright (C) 1999-2003, International Business Machines
+* Copyright (C) 1999-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************/
/*
* uconv(1): an iconv(1)-like converter using ICU.
*
- * Original code by Jonas Utterstrรถm <jonas.utterstrom@vittran.norrnod.se>
+ * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se>
* contributed in 1999.
*
* Conversion to the C conversion API and many improvements by
* Yves Arrouye <yves@realnames.com>, current maintainer.
*
+ * Markus Scherer maintainer from 2003.
+ * See source code repository history for changes.
*/
#include <unicode/utypes.h>
+#include <unicode/putil.h>
#include <unicode/ucnv.h>
+#include <unicode/uenum.h>
#include <unicode/unistr.h>
#include <unicode/translit.h>
+#include <unicode/uset.h>
+#include <unicode/uclean.h>
+#include <unicode/utf16.h>
#include <stdio.h>
#include <errno.h>
#include "unicode/uwmsg.h"
-#ifdef WIN32
-#include <string.h>
+U_NAMESPACE_USE
+
+#if U_PLATFORM_USES_ONLY_WIN32_API && !defined(__STRICT_ANSI__)
#include <io.h>
#include <fcntl.h>
+#if U_PLATFORM_USES_ONLY_WIN32_API
+#define USE_FILENO_BINARY_MODE 1
+/* Windows likes to rename Unix-like functions */
+#ifndef fileno
+#define fileno _fileno
+#endif
+#ifndef setmode
+#define setmode _setmode
+#endif
+#ifndef O_BINARY
+#define O_BINARY _O_BINARY
+#endif
+#endif
#endif
#ifdef UCONVMSG_LINK
ps = 1;
/* Set up our static data - if any */
-#ifdef UCONVMSG_LINK
+#if defined(UCONVMSG_LINK) && U_PLATFORM != U_PF_OS390 /* On z/OS, this is failing. */
udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err);
if (U_FAILURE(err)) {
fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
expected for convrters.txt(5). */
static int printConverters(const char *pname, const char *lookfor,
- int canon)
+ UBool canon)
{
UErrorCode err = U_ZERO_ERROR;
int32_t num;
} else {
uint16_t s;
+ if (canon) {
+ printf("{ ");
+ }
for (s = 0; s < num_stds; ++s) {
stds[s] = ucnv_getStandard(s, &err);
+ if (canon) {
+ printf("%s ", stds[s]);
+ }
if (U_FAILURE(err)) {
u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err));
- return -1;
+ goto error_cleanup;
}
}
+ if (canon) {
+ puts("}");
+ }
}
for (int32_t i = 0; i < num; i++) {
if (U_FAILURE(err)) {
printf("%s", name);
- UnicodeString str(name, (int32_t)(uprv_strlen(name) + 1));
+ UnicodeString str(name, "");
putchar('\t');
- u_wmsg(stderr, "cantGetAliases", str.getBuffer(),
+ u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
u_wmsg_errorName(err));
- return -1;
+ goto error_cleanup;
} else {
uint16_t a, s, t;
const char *alias = ucnv_getAlias(name, a, &err);
if (U_FAILURE(err)) {
- UnicodeString str(name, (int32_t)(uprv_strlen(name) + 1));
+ UnicodeString str(name, "");
putchar('\t');
- u_wmsg(stderr, "cantGetAliases", str.getBuffer(),
+ u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
u_wmsg_errorName(err));
- return -1;
+ goto error_cleanup;
}
- printf("%s", alias);
+ /* Print the current alias so that it looks right. */
+ printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") ,
+ alias,
+ (canon ? "" : " "));
/* Look (slowly, linear searching) for a tag. */
if (canon) {
- for (s = t = 0; s < num_stds; ++s) {
- const char *standard =
- ucnv_getStandardName(name, stds[s], &err);
- if (U_SUCCESS(err) && standard) {
- if (!strcmp(standard, alias)) {
- if (!t) {
- printf(" {");
- t = 1;
+ /* -1 to skip the last standard */
+ for (s = t = 0; s < num_stds-1; ++s) {
+ UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err);
+ if (U_SUCCESS(err)) {
+ /* List the standard tags */
+ const char *standardName;
+ UBool isFirst = TRUE;
+ UErrorCode enumError = U_ZERO_ERROR;
+ while ((standardName = uenum_next(nameEnum, NULL, &enumError))) {
+ /* See if this alias is supported by this standard. */
+ if (!strcmp(standardName, alias)) {
+ if (!t) {
+ printf(" {");
+ t = 1;
+ }
+ /* Print a * after the default standard name */
+ printf(" %s%s", stds[s], (isFirst ? "*" : ""));
}
- printf(" %s", stds[s]);
+ isFirst = FALSE;
}
}
}
printf(" }");
}
}
+ /* Terminate this entry. */
+ if (canon) {
+ puts("");
+ }
/* Move on. */
-
- if (a < num_aliases - 1) {
- putchar(a || !canon ? ' ' : '\t');
- }
}
- }
-
- /* Terminate this entry. */
-
- if (canon) {
- putchar('\n');
- } else if (i < num - 1) {
- putchar(' ');
+ /* Terminate this entry. */
+ if (!canon) {
+ puts("");
+ }
}
}
/* Success. */
return 0;
+error_cleanup:
+ uprv_free(stds);
+ return -1;
}
/* Print all available transliterators. If canon is non zero, print
one transliterator per line. */
-static int printTransliterators(int canon)
+static int printTransliterators(UBool canon)
{
#if UCONFIG_NO_TRANSLITERATION
printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
return 1;
#else
- int32_t numtrans = utrans_countAvailableIDs(), i;
- int buflen = 512;
- char *buf = (char *) uprv_malloc(buflen);
- char staticbuf[512];
+ UErrorCode status = U_ZERO_ERROR;
+ UEnumeration *ids = utrans_openIDs(&status);
+ int32_t i, numtrans = uenum_count(ids, &status);
char sepchar = canon ? '\n' : ' ';
- if (!buf) {
- buf = staticbuf;
- buflen = sizeof(staticbuf);
- }
+ for (i = 0; U_SUCCESS(status)&& (i < numtrans); ++i) {
+ int32_t len;
+ const char *nextTrans = uenum_next(ids, &len, &status);
- for (i = 0; i < numtrans; ++i) {
- int32_t len = utrans_getAvailableID(i, buf, buflen);
- if (len >= buflen - 1) {
- if (buf != staticbuf) {
- buflen <<= 1;
- if (buflen < len) {
- buflen = len + 64;
- }
- buf = (char *) uprv_realloc(buf, buflen);
- if (!buf) {
- buf = staticbuf;
- buflen = sizeof(staticbuf);
- }
- }
- utrans_getAvailableID(i, buf, buflen);
- if (len >= buflen) {
- uprv_strcpy(buf + buflen - 4, "..."); /* Truncate the name. */
- }
- }
-
- printf("%s", buf);
+ printf("%s", nextTrans);
if (i < numtrans - 1) {
putchar(sepchar);
}
}
+ uenum_close(ids);
+
/* Add a terminating newline if needed. */
if (sepchar != '\n') {
putchar('\n');
}
- /* Free temporary data. */
-
- if (buf != staticbuf) {
- uprv_free(buf);
- }
-
/* Success. */
return 0;
#endif
}
-/* Return the offset of a byte in its source, given the from and to offsets
- vectors and the byte offset itself. */
+enum {
+ uSP = 0x20, // space
+ uCR = 0xd, // carriage return
+ uLF = 0xa, // line feed
+ uNL = 0x85, // newline
+ uLS = 0x2028, // line separator
+ uPS = 0x2029, // paragraph separator
+ uSig = 0xfeff // signature/BOM character
+};
-static inline int32_t dataOffset(int32_t whereto,
- const int32_t *fromoffsets, int32_t fromsz,
- const int32_t *tooffsets, int32_t tosz) {
- if (whereto >= 0 && whereto < tosz) {
- whereto = tooffsets[whereto];
- if (whereto >= 0 && whereto < fromsz) {
- return fromoffsets[whereto];
+static inline int32_t
+getChunkLimit(const UnicodeString &prev, const UnicodeString &s) {
+ // find one of
+ // CR, LF, CRLF, NL, LS, PS
+ // for paragraph ends (see UAX #13/Unicode 4)
+ // and include it in the chunk
+ // all of these characters are on the BMP
+ // do not include FF or VT in case they are part of a paragraph
+ // (important for bidi contexts)
+ static const UChar paraEnds[] = {
+ 0xd, 0xa, 0x85, 0x2028, 0x2029
+ };
+ enum {
+ iCR, iLF, iNL, iLS, iPS, iCount
+ };
+
+ // first, see if there is a CRLF split between prev and s
+ if (prev.endsWith(paraEnds + iCR, 1)) {
+ if (s.startsWith(paraEnds + iLF, 1)) {
+ return 1; // split CRLF, include the LF
+ } else if (!s.isEmpty()) {
+ return 0; // complete the last chunk
+ } else {
+ return -1; // wait for actual further contents to arrive
}
}
- return 0;
+ const UChar *u = s.getBuffer(), *limit = u + s.length();
+ UChar c;
+
+ while (u < limit) {
+ c = *u++;
+ if (
+ ((c < uSP) && (c == uCR || c == uLF)) ||
+ (c == uNL) ||
+ ((c & uLS) == uLS)
+ ) {
+ if (c == uCR) {
+ // check for CRLF
+ if (u == limit) {
+ return -1; // LF may be in the next chunk
+ } else if (*u == uLF) {
+ ++u; // include the LF in this chunk
+ }
+ }
+ return (int32_t)(u - s.getBuffer());
+ }
+ }
+
+ return -1; // continue collecting the chunk
}
+enum {
+ CNV_NO_FEFF, // cannot convert the U+FEFF Unicode signature character (BOM)
+ CNV_WITH_FEFF, // can convert the U+FEFF signature character
+ CNV_ADDS_FEFF // automatically adds/detects the U+FEFF signature character
+};
+
+static inline UChar
+nibbleToHex(uint8_t n) {
+ n &= 0xf;
+ return
+ n <= 9 ?
+ (UChar)(0x30 + n) :
+ (UChar)((0x61 - 10) + n);
+}
+
+// check the converter's Unicode signature properties;
+// the fromUnicode side of the converter must be in its initial state
+// and will be reset again if it was used
+static int32_t
+cnvSigType(UConverter *cnv) {
+ UErrorCode err;
+ int32_t result;
+
+ // test if the output charset can convert U+FEFF
+ USet *set = uset_open(1, 0);
+ err = U_ZERO_ERROR;
+ ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &err);
+ if (U_SUCCESS(err) && uset_contains(set, uSig)) {
+ result = CNV_WITH_FEFF;
+ } else {
+ result = CNV_NO_FEFF; // an error occurred or U+FEFF cannot be converted
+ }
+ uset_close(set);
+
+ if (result == CNV_WITH_FEFF) {
+ // test if the output charset emits a signature anyway
+ const UChar a[1] = { 0x61 }; // "a"
+ const UChar *in;
+
+ char buffer[20];
+ char *out;
+
+ in = a;
+ out = buffer;
+ err = U_ZERO_ERROR;
+ ucnv_fromUnicode(cnv,
+ &out, buffer + sizeof(buffer),
+ &in, a + 1,
+ NULL, TRUE, &err);
+ ucnv_resetFromUnicode(cnv);
+
+ if (NULL != ucnv_detectUnicodeSignature(buffer, (int32_t)(out - buffer), NULL, &err) &&
+ U_SUCCESS(err)
+ ) {
+ result = CNV_ADDS_FEFF;
+ }
+ }
+
+ return result;
+}
+
+class ConvertFile {
+public:
+ ConvertFile() :
+ buf(NULL), outbuf(NULL), fromoffsets(NULL),
+ bufsz(0), signature(0) {}
+
+ void
+ setBufferSize(size_t bufferSize) {
+ bufsz = bufferSize;
+
+ buf = new char[2 * bufsz];
+ outbuf = buf + bufsz;
+
+ // +1 for an added U+FEFF in the intermediate Unicode buffer
+ fromoffsets = new int32_t[bufsz + 1];
+ }
+
+ ~ConvertFile() {
+ delete [] buf;
+ delete [] fromoffsets;
+ }
+
+ UBool convertFile(const char *pname,
+ const char *fromcpage,
+ UConverterToUCallback toucallback,
+ const void *touctxt,
+ const char *tocpage,
+ UConverterFromUCallback fromucallback,
+ const void *fromuctxt,
+ UBool fallback,
+ const char *translit,
+ const char *infilestr,
+ FILE * outfile, int verbose);
+private:
+ friend int main(int argc, char **argv);
+
+ char *buf, *outbuf;
+ int32_t *fromoffsets;
+
+ size_t bufsz;
+ int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character
+};
+
// Convert a file from one encoding to another
-static UBool convertFile(const char *pname,
+UBool
+ConvertFile::convertFile(const char *pname,
const char *fromcpage,
UConverterToUCallback toucallback,
const void *touctxt,
const char *tocpage,
UConverterFromUCallback fromucallback,
const void *fromuctxt,
- int fallback,
- size_t bufsz,
+ UBool fallback,
const char *translit,
const char *infilestr,
FILE * outfile, int verbose)
UConverter *convto = 0;
UErrorCode err = U_ZERO_ERROR;
UBool flush;
- const char *cbufp;
+ UBool closeFile = FALSE;
+ const char *cbufp, *prevbufp;
char *bufp;
- char *buf = 0;
uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */
- const UChar *unibufbp;
+ const UChar *unibuf, *unibufbp;
UChar *unibufp;
- UChar *unibuf = 0;
- int32_t *fromoffsets = 0, *tooffsets = 0;
- size_t rd, wr, tobufsz;
+ size_t rd, wr;
#if !UCONFIG_NO_TRANSLITERATION
Transliterator *t = 0; // Transliterator acting on Unicode data.
+ UnicodeString chunk; // One chunk of the text being collected for transformation.
#endif
UnicodeString u; // String to do the transliteration.
+ int32_t ulen;
+
+ // use conversion offsets for error messages
+ // unless a transliterator is used -
+ // a text transformation will reorder characters in unpredictable ways
+ UBool useOffsets = TRUE;
// Open the correct input file or connect to stdin for reading input
u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer());
return FALSE;
}
+ closeFile = TRUE;
} else {
infilestr = "-";
infile = stdin;
-#ifdef WIN32
+#ifdef USE_FILENO_BINARY_MODE
if (setmode(fileno(stdin), O_BINARY) == -1) {
initMsg(pname);
u_wmsg(stderr, "cantSetInBinMode");
parse.line = -1;
if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) {
- t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err);
+ t = Transliterator::createFromRules(UNICODE_STRING_SIMPLE("Uconv"), str, UTRANS_FORWARD, parse, err);
} else {
- t = Transliterator::createInstance(translit, UTRANS_FORWARD, err);
+ t = Transliterator::createInstance(UnicodeString(translit, -1, US_INV), UTRANS_FORWARD, err);
}
if (U_FAILURE(err)) {
UChar linebuf[20], offsetbuf[20];
uprv_itou(linebuf, 20, parse.line, 10, 0);
uprv_itou(offsetbuf, 20, parse.offset, 10, 0);
- u_wmsg(stderr, "cantCreateTranslitParseErr", str.getBuffer(),
+ u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(),
u_wmsg_errorName(err), linebuf, offsetbuf);
} else {
- u_wmsg(stderr, "cantCreateTranslit", str.getBuffer(),
+ u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(),
u_wmsg_errorName(err));
}
}
goto error_exit;
}
+
+ useOffsets = FALSE;
}
#endif
convfrom = ucnv_open(fromcpage, &err);
if (U_FAILURE(err)) {
- UnicodeString str(fromcpage, (int32_t)(uprv_strlen(fromcpage) + 1));
+ UnicodeString str(fromcpage, "");
initMsg(pname);
- u_wmsg(stderr, "cantOpenFromCodeset", str.getBuffer(),
+ u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(),
u_wmsg_errorName(err));
goto error_exit;
}
convto = ucnv_open(tocpage, &err);
if (U_FAILURE(err)) {
- UnicodeString str(tocpage, (int32_t)(uprv_strlen(tocpage) + 1));
+ UnicodeString str(tocpage, "");
initMsg(pname);
- u_wmsg(stderr, "cantOpenToCodeset", str.getBuffer(),
+ u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(),
u_wmsg_errorName(err));
goto error_exit;
}
}
ucnv_setFallback(convto, fallback);
- // To ensure that the buffer always is of enough size, we
- // must take the worst case scenario, that is the character in
- // the codepage that uses the most bytes and multiply it against
- // the buffer size.
-
- // use bufsz+1 to allow for additional BOM/signature character (U+FEFF)
- tobufsz = (bufsz+1) * ucnv_getMaxCharSize(convto);
-
- buf = new char[tobufsz];
- unibuf = new UChar[bufsz];
-
- fromoffsets = new int32_t[bufsz];
- tooffsets = new int32_t[tobufsz];
+ UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode;
+ int8_t sig;
// OK, we can convert now.
+ sig = signature;
+ rd = 0;
do {
- char willexit = 0;
+ willexit = FALSE;
+
+ // input file offset at the beginning of the next buffer
+ infoffset += rd;
rd = fread(buf, 1, bufsz, infile);
if (ferror(infile) != 0) {
UnicodeString str(strerror(errno));
- str.append((UChar32) 0);
initMsg(pname);
- u_wmsg(stderr, "cantRead", str.getBuffer());
+ u_wmsg(stderr, "cantRead", str.getTerminatedBuffer());
goto error_exit;
}
- // Convert the read buffer into the new coding
- // After the call 'unibufp' will be placed on the last
+ // Convert the read buffer into the new encoding via Unicode.
+ // After the call 'unibufp' will be placed behind the last
// character that was converted in the 'unibuf'.
- // Also the 'cbufp' is positioned on the last converted
+ // Also the 'cbufp' is positioned behind the last converted
// character.
// At the last conversion in the file, flush should be set to
- // true so that we get all characters converted
+ // true so that we get all characters converted.
//
// The converter must be flushed at the end of conversion so
// that characters on hold also will be written.
- unibufp = unibuf;
cbufp = buf;
- flush = rd != bufsz;
- ucnv_toUnicode(convfrom, &unibufp, unibufp + bufsz, &cbufp,
- cbufp + rd, fromoffsets, flush, &err);
-
- infoffset += (uint32_t)(cbufp - buf);
+ flush = (UBool)(rd != bufsz);
- if (U_FAILURE(err)) {
- char pos[32];
- sprintf(pos, "%u", infoffset - 1);
- UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
- initMsg(pname);
- u_wmsg(stderr, "problemCvtToU", str.getBuffer(), u_wmsg_errorName(err));
- willexit = 1;
- err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
- }
+ // convert until the input is consumed
+ do {
+ // remember the start of the current byte-to-Unicode conversion
+ prevbufp = cbufp;
+
+ unibuf = unibufp = u.getBuffer((int32_t)bufsz);
+
+ // Use bufsz instead of u.getCapacity() for the targetLimit
+ // so that we don't overflow fromoffsets[].
+ ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp,
+ buf + rd, useOffsets ? fromoffsets : NULL, flush, &err);
+
+ ulen = (int32_t)(unibufp - unibuf);
+ u.releaseBuffer(U_SUCCESS(err) ? ulen : 0);
+
+ // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
+ // converting all of the input bytes.
+ // It works like this because ucnv_toUnicode() returns only under the
+ // following conditions:
+ // - an error occurred during conversion (an error code is set)
+ // - the target buffer is filled (the error code indicates an overflow)
+ // - the source is consumed
+ // That is, if the error code does not indicate a failure,
+ // not even an overflow, then the source must be consumed entirely.
+ fromSawEndOfBytes = (UBool)U_SUCCESS(err);
+
+ if (err == U_BUFFER_OVERFLOW_ERROR) {
+ err = U_ZERO_ERROR;
+ } else if (U_FAILURE(err)) {
+ char pos[32], errorBytes[32];
+ int8_t i, length, errorLength;
+
+ UErrorCode localError = U_ZERO_ERROR;
+ errorLength = (int8_t)sizeof(errorBytes);
+ ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError);
+ if (U_FAILURE(localError) || errorLength == 0) {
+ errorLength = 1;
+ }
- // At the last conversion, the converted characters should be
- // equal to number of chars read.
+ // print the input file offset of the start of the error bytes:
+ // input file offset of the current byte buffer +
+ // length of the just consumed bytes -
+ // length of the error bytes
+ length =
+ (int8_t)sprintf(pos, "%d",
+ (int)(infoffset + (cbufp - buf) - errorLength));
+
+ // output the bytes that caused the error
+ UnicodeString str;
+ for (i = 0; i < errorLength; ++i) {
+ if (i > 0) {
+ str.append((UChar)uSP);
+ }
+ str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4));
+ str.append(nibbleToHex((uint8_t)errorBytes[i]));
+ }
- if (flush && !willexit && cbufp != (buf + rd)) {
- char pos[32];
- sprintf(pos, "%u", infoffset);
- UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
- initMsg(pname);
- u_wmsg(stderr, "premEndInput", str.getBuffer());
- willexit = 1;
- }
+ initMsg(pname);
+ u_wmsg(stderr, "problemCvtToU",
+ UnicodeString(pos, length, "").getTerminatedBuffer(),
+ str.getTerminatedBuffer(),
+ u_wmsg_errorName(err));
- // Prepare to transliterate and convert. Transliterate if needed.
+ willexit = TRUE;
+ err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
+ }
-#if !UCONFIG_NO_TRANSLITERATION
- if (t) {
- u.setTo(unibuf, (int32_t)(unibufp - unibuf)); // Copy into string.
- t->transliterate(u);
- } else
-#endif
- {
- u.setTo(unibuf, (int32_t)(unibufp - unibuf), (int32_t)(bufsz)); // Share the buffer.
- }
+ // Replaced a check for whether the input was consumed by
+ // looping until it is; message key "premEndInput" now obsolete.
- int32_t ulen = u.length();
+ if (ulen == 0) {
+ continue;
+ }
- // Convert the Unicode buffer into the destination codepage
- // Again 'bufp' will be placed on the last converted character
- // And 'unibufbp' will be placed on the last converted unicode character
- // At the last conversion flush should be set to true to ensure that
- // all characters left get converted
+ // remove a U+FEFF Unicode signature character if requested
+ if (sig < 0) {
+ if (u.charAt(0) == uSig) {
+ u.remove(0, 1);
- const UChar *unibufu = unibufbp = u.getBuffer();
+ // account for the removed UChar and offset
+ --ulen;
- do {
- int32_t len = ulen > (int32_t)bufsz ? (int32_t)bufsz : ulen;
+ if (useOffsets) {
+ // remove an offset from fromoffsets[] as well
+ // to keep the array parallel with the UChars
+ memmove(fromoffsets, fromoffsets + 1, ulen * 4);
+ }
- bufp = buf;
- unibufp = (UChar *) (unibufbp + len);
+ }
+ sig = 0;
+ }
- ucnv_fromUnicode(convto, &bufp, bufp + tobufsz,
- &unibufbp,
- unibufp,
- tooffsets, flush, &err);
+#if !UCONFIG_NO_TRANSLITERATION
+ // Transliterate/transform if needed.
+
+ // For transformation, we use chunking code -
+ // collect Unicode input until, for example, an end-of-line,
+ // then transform and output-convert that and continue collecting.
+ // This makes the transformation result independent of the buffer size
+ // while avoiding the slower keyboard mode.
+ // The end-of-chunk characters are completely included in the
+ // transformed string in case they are to be transformed themselves.
+ if (t != NULL) {
+ UnicodeString out;
+ int32_t chunkLimit;
+
+ do {
+ chunkLimit = getChunkLimit(chunk, u);
+ if (chunkLimit < 0 && flush && fromSawEndOfBytes) {
+ // use all of the rest at the end of the text
+ chunkLimit = u.length();
+ }
+ if (chunkLimit >= 0) {
+ // complete the chunk and transform it
+ chunk.append(u, 0, chunkLimit);
+ u.remove(0, chunkLimit);
+ t->transliterate(chunk);
+
+ // append the transformation result to the result and empty the chunk
+ out.append(chunk);
+ chunk.remove();
+ } else {
+ // continue collecting the chunk
+ chunk.append(u);
+ break;
+ }
+ } while (!u.isEmpty());
- if (U_FAILURE(err)) {
- const char *errtag;
- char pos[32];
+ u = out;
+ ulen = u.length();
+ }
+#endif
- uint32_t erroffset =
- dataOffset((int32_t)(bufp - buf - 1), fromoffsets, (int32_t)(bufsz), tooffsets, (int32_t)(tobufsz));
- int32_t ferroffset = (int32_t)(infoffset - (unibufp - unibufu) + erroffset);
+ // add a U+FEFF Unicode signature character if requested
+ // and possible/necessary
+ if (sig > 0) {
+ if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) {
+ u.insert(0, (UChar)uSig);
+
+ if (useOffsets) {
+ // insert a pseudo-offset into fromoffsets[] as well
+ // to keep the array parallel with the UChars
+ memmove(fromoffsets + 1, fromoffsets, ulen * 4);
+ fromoffsets[0] = -1;
+ }
- if ((int32_t) ferroffset < 0) {
- ferroffset = (int32_t)(outfoffset + (bufp - buf));
- errtag = "problemCvtFromUOut";
- } else {
- errtag = "problemCvtFromU";
+ // account for the additional UChar and offset
+ ++ulen;
}
- sprintf(pos, "%u", ferroffset);
- UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
- initMsg(pname);
- u_wmsg(stderr, errtag, str.getBuffer(),
- u_wmsg_errorName(err));
- willexit = 1;
+ sig = 0;
}
- // At the last conversion, the converted characters should be equal to number
- // of consumed characters.
- if (flush && !willexit && unibufbp != (unibufu + (size_t) (unibufp - unibufu))) {
- char pos[32];
- sprintf(pos, "%u", infoffset);
- UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
- initMsg(pname);
- u_wmsg(stderr, "premEnd", str.getBuffer());
- willexit = 1;
- }
+ // Convert the Unicode buffer into the destination codepage
+ // Again 'bufp' will be placed behind the last converted character
+ // And 'unibufp' will be placed behind the last converted unicode character
+ // At the last conversion flush should be set to true to ensure that
+ // all characters left get converted
+
+ unibuf = unibufbp = u.getBuffer();
+
+ do {
+ bufp = outbuf;
+
+ // Use fromSawEndOfBytes in addition to the flush flag -
+ // it indicates whether the intermediate Unicode string
+ // contains the very last UChars for the very last input bytes.
+ ucnv_fromUnicode(convto, &bufp, outbuf + bufsz,
+ &unibufbp,
+ unibuf + ulen,
+ NULL, (UBool)(flush && fromSawEndOfBytes), &err);
+
+ // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
+ // converting all of the intermediate UChars.
+ // See comment for fromSawEndOfBytes.
+ toSawEndOfUnicode = (UBool)U_SUCCESS(err);
+
+ if (err == U_BUFFER_OVERFLOW_ERROR) {
+ err = U_ZERO_ERROR;
+ } else if (U_FAILURE(err)) {
+ UChar errorUChars[4];
+ const char *errtag;
+ char pos[32];
+ UChar32 c;
+ int8_t i, length, errorLength;
+
+ UErrorCode localError = U_ZERO_ERROR;
+ errorLength = (int8_t)UPRV_LENGTHOF(errorUChars);
+ ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError);
+ if (U_FAILURE(localError) || errorLength == 0) {
+ // need at least 1 so that we don't access beyond the length of fromoffsets[]
+ errorLength = 1;
+ }
- // Finally, write the converted buffer to the output file
+ int32_t ferroffset;
+ if (useOffsets) {
+ // Unicode buffer offset of the start of the error UChars
+ ferroffset = (int32_t)((unibufbp - unibuf) - errorLength);
+ if (ferroffset < 0) {
+ // approximation - the character started in the previous Unicode buffer
+ ferroffset = 0;
+ }
- rd = (size_t) (bufp - buf);
- outfoffset += (int32_t)(wr = fwrite(buf, 1, rd, outfile));
- if (wr != rd) {
- UnicodeString str(strerror(errno), "");
- initMsg(pname);
- u_wmsg(stderr, "cantWrite", str.getBuffer());
- willexit = 1;
- }
+ // get the corresponding byte offset out of fromoffsets[]
+ // go back if the offset is not known for some of the UChars
+ int32_t fromoffset;
+ do {
+ fromoffset = fromoffsets[ferroffset];
+ } while (fromoffset < 0 && --ferroffset >= 0);
+
+ // total input file offset =
+ // input file offset of the current byte buffer +
+ // byte buffer offset of where the current Unicode buffer is converted from +
+ // fromoffsets[Unicode offset]
+ ferroffset = infoffset + (prevbufp - buf) + fromoffset;
+ errtag = "problemCvtFromU";
+ } else {
+ // Do not use fromoffsets if (t != NULL) because the Unicode text may
+ // be different from what the offsets refer to.
+
+ // output file offset
+ ferroffset = (int32_t)(outfoffset + (bufp - outbuf));
+ errtag = "problemCvtFromUOut";
+ }
- if (willexit) {
- goto error_exit;
- }
- } while ((ulen -= (int32_t)(bufsz)) > 0);
+ length = (int8_t)sprintf(pos, "%u", (int)ferroffset);
+
+ // output the code points that caused the error
+ UnicodeString str;
+ for (i = 0; i < errorLength;) {
+ if (i > 0) {
+ str.append((UChar)uSP);
+ }
+ U16_NEXT(errorUChars, i, errorLength, c);
+ if (c >= 0x100000) {
+ str.append(nibbleToHex((uint8_t)(c >> 20)));
+ }
+ if (c >= 0x10000) {
+ str.append(nibbleToHex((uint8_t)(c >> 16)));
+ }
+ str.append(nibbleToHex((uint8_t)(c >> 12)));
+ str.append(nibbleToHex((uint8_t)(c >> 8)));
+ str.append(nibbleToHex((uint8_t)(c >> 4)));
+ str.append(nibbleToHex((uint8_t)c));
+ }
+
+ initMsg(pname);
+ u_wmsg(stderr, errtag,
+ UnicodeString(pos, length, "").getTerminatedBuffer(),
+ str.getTerminatedBuffer(),
+ u_wmsg_errorName(err));
+ u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer());
+
+ willexit = TRUE;
+ err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
+ }
+
+ // Replaced a check for whether the intermediate Unicode characters were all consumed by
+ // looping until they are; message key "premEnd" now obsolete.
+
+ // Finally, write the converted buffer to the output file
+ size_t outlen = (size_t) (bufp - outbuf);
+ outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile));
+ if (wr != outlen) {
+ UnicodeString str(strerror(errno));
+ initMsg(pname);
+ u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer());
+ willexit = TRUE;
+ }
+
+ if (willexit) {
+ goto error_exit;
+ }
+ } while (!toSawEndOfUnicode);
+ } while (!fromSawEndOfBytes);
} while (!flush); // Stop when we have flushed the
// converters (this means that it's
// the end of output)
normal_exit:
// Cleanup.
- if (convfrom) ucnv_close(convfrom);
- if (convto) ucnv_close(convto);
+ ucnv_close(convfrom);
+ ucnv_close(convto);
#if !UCONFIG_NO_TRANSLITERATION
- if (t) delete t;
+ delete t;
#endif
- if (buf) delete[] buf;
- if (unibuf) delete[] unibuf;
-
- if (fromoffsets) delete[] fromoffsets;
- if (tooffsets) delete[] tooffsets;
-
- if (infile != stdin) {
+ if (closeFile) {
fclose(infile);
}
exit(ecode);
}
-int main(int argc, char **argv)
+extern int
+main(int argc, char **argv)
{
FILE *outfile;
int ret = 0;
- int seenf = 0;
size_t bufsz = DEFAULT_BUFSZ;
const char *tocpage = 0;
const char *translit = 0;
const char *outfilestr = 0;
- int fallback = 0;
+ UBool fallback = FALSE;
UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP;
const void *fromuctxt = 0;
UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP;
const void *touctxt = 0;
- char **iter;
+ char **iter, **remainArgv, **remainArgvLimit;
char **end = argv + argc;
const char *pname;
- int printConvs = 0, printCanon = 0;
+ UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE;
const char *printName = 0;
- int printTranslits = 0;
- int verbose = 0;
+ UBool verbose = FALSE;
+ UErrorCode status = U_ZERO_ERROR;
+
+ ConvertFile cf;
+
+ /* Initialize ICU */
+ u_init(&status);
+ if (U_FAILURE(status)) {
+ fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
+ argv[0], u_errorName(status));
+ exit(1);
+ }
// Get and prettify pname.
pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR);
-#ifdef WIN32
+#if U_PLATFORM_USES_ONLY_WIN32_API
if (!pname) {
pname = uprv_strrchr(*argv, '/');
}
// First, get the arguments from command-line
// to know the codepages to convert between
- // XXX When you add to this loop, you need to add to the similar loop
- // below.
-
+ remainArgv = remainArgvLimit = argv + 1;
for (iter = argv + 1; iter != end; iter++) {
// Check for from charset
if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) {
else
usage(pname, 1);
} else if (!strcmp("--fallback", *iter)) {
- fallback = 1;
+ fallback = TRUE;
} else if (!strcmp("--no-fallback", *iter)) {
- fallback = 0;
+ fallback = FALSE;
} else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
iter++;
if (iter != end) {
initMsg(pname);
UnicodeString str(*iter);
initMsg(pname);
- u_wmsg(stderr, "badBlockSize", str.getBuffer());
+ u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer());
return 3;
}
} else {
if (printTranslits) {
usage(pname, 1);
}
- printConvs = 1;
+ printConvs = TRUE;
} else if (strcmp("--default-code", *iter) == 0) {
if (printTranslits) {
usage(pname, 1);
if (U_FAILURE(e) || !printName) {
UnicodeString str(*iter);
initMsg(pname);
- u_wmsg(stderr, "noSuchCodeset", str.getBuffer());
+ u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer());
return 2;
}
} else
usage(pname, 1);
} else if (strcmp("--canon", *iter) == 0) {
- printCanon = 1;
+ printCanon = TRUE;
} else if (strcmp("-L", *iter) == 0
|| !strcmp("--list-transliterators", *iter)) {
if (printConvs) {
usage(pname, 1);
}
- printTranslits = 1;
+ printTranslits = TRUE;
} else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter)
|| !strcmp("--help", *iter)) {
usage(pname, 0);
} else {
UnicodeString str(*iter);
initMsg(pname);
- u_wmsg(stderr, "unknownCallback", str.getBuffer());
+ u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
return 4;
}
} else {
} else {
UnicodeString str(*iter);
initMsg(pname);
- u_wmsg(stderr, "unknownCallback", str.getBuffer());
+ u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
return 4;
}
} else {
} else {
UnicodeString str(*iter);
initMsg(pname);
- u_wmsg(stderr, "unknownCallback", str.getBuffer());
+ u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
return 4;
}
} else {
usage(pname, 1);
}
} else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
- verbose = 0;
+ verbose = FALSE;
} else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
- verbose = 1;
+ verbose = TRUE;
} else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
- printf("%s v2.0\n", pname);
+ printf("%s v2.1 ICU " U_ICU_VERSION "\n", pname);
return 0;
} else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
++iter;
} else {
usage(pname, 1);
}
+ } else if (0 == strcmp("--add-signature", *iter)) {
+ cf.signature = 1;
+ } else if (0 == strcmp("--remove-signature", *iter)) {
+ cf.signature = -1;
} else if (**iter == '-' && (*iter)[1]) {
usage(pname, 1);
+ } else {
+ // move a non-option up in argv[]
+ *remainArgvLimit++ = *iter;
}
}
} else {
outfilestr = "-";
outfile = stdout;
-#ifdef WIN32
+#ifdef USE_FILENO_BINARY_MODE
if (setmode(fileno(outfile), O_BINARY) == -1) {
u_wmsg(stderr, "cantSetOutBinMode");
exit(-1);
}
/* Loop again on the arguments to find all the input files, and
- convert them. XXX Cheap and sloppy. */
+ convert them. */
- for (iter = argv + 1; iter != end; iter++) {
- if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) {
- iter++;
- } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) {
- iter++;
- } else if (strcmp("-x", *iter) == 0) {
- iter++;
- } else if (!strcmp("--fallback", *iter)) {
- ;
- } else if (!strcmp("--no-fallback", *iter)) {
- ;
- } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
- iter++;
- } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
- ;
- } else if (strcmp("--default-code", *iter) == 0) {
- ;
- } else if (strcmp("--list-code", *iter) == 0) {
- ;
- } else if (strcmp("--canon", *iter) == 0) {
- ;
- } else if (strcmp("-L", *iter) == 0
- || !strcmp("--list-transliterators", *iter)) {
- ;
- } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter)
- || !strcmp("--help", *iter)) {
- ;
- } else if (!strcmp("-c", *iter)) {
- ;
- } else if (!strcmp("--to-callback", *iter)) {
- iter++;
- } else if (!strcmp("--from-callback", *iter)) {
- iter++;
- } else if (!strcmp("-i", *iter)) {
- ;
- } else if (!strcmp("--callback", *iter)) {
- iter++;
- } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
- ;
- } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
- ;
- } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
- ;
- } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
- ++iter;
- } else {
- seenf = 1;
- if (!convertFile
- (pname, fromcpage, toucallback, touctxt, tocpage,
- fromucallback, fromuctxt, fallback, bufsz, translit, *iter,
- outfile, verbose)) {
+ cf.setBufferSize(bufsz);
+
+ if(remainArgv < remainArgvLimit) {
+ for (iter = remainArgv; iter != remainArgvLimit; iter++) {
+ if (!cf.convertFile(
+ pname, fromcpage, toucallback, touctxt, tocpage,
+ fromucallback, fromuctxt, fallback, translit, *iter,
+ outfile, verbose)
+ ) {
goto error_exit;
}
}
- }
-
- if (!seenf) {
- if (!convertFile
- (pname, fromcpage, toucallback, touctxt, tocpage,
- fromucallback, fromuctxt, fallback, bufsz, translit, 0, outfile,
- verbose)) {
+ } else {
+ if (!cf.convertFile(
+ pname, fromcpage, toucallback, touctxt, tocpage,
+ fromucallback, fromuctxt, fallback, translit, 0,
+ outfile, verbose)
+ ) {
goto error_exit;
}
}
goto normal_exit;
error_exit:
+#if !UCONFIG_NO_LEGACY_CONVERSION
ret = 1;
+#else
+ fprintf(stderr, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n");
+#endif
normal_exit:
- if (outfile != stdout)
+ if (outfile != stdout) {
fclose(outfile);
+ }
+
+ u_cleanup();
return ret;
}