ltrPos) ? rtlPos : ltrPos, b2Len, parseError);
- goto CLEANUP;
- }
-
- //satisfy 3
- if( rightToLeft == TRUE &&
- !((firstCharDir == U_RIGHT_TO_LEFT || firstCharDir == U_RIGHT_TO_LEFT_ARABIC) &&
- (direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC))
- ){
- status = U_IDNA_CHECK_BIDI_ERROR;
- uprv_syntaxError(b2, rtlPos, b2Len, parseError);
- return FALSE;
- }
-
- if(b2Len <= destCapacity){
- uprv_memmove(dest,b2, b2Len*U_SIZEOF_UCHAR);
- }
-
-CLEANUP:
- if(b1!=b1Stack){
- uprv_free(b1);
- }
- if(b2!=b2Stack){
- uprv_free(b2);
- }
- return u_terminateUChars(dest, destCapacity, b2Len, &status);
-}
-
-
-UBool StringPrep::isLabelSeparator(UChar32 ch, UErrorCode& status){
- // check error status
- if(U_FAILURE(status)){
- return FALSE;
- }
-
- if(isDataLoaded(status)){
- int32_t result;
- UTRIE_GET16(&idnTrie,ch, result);
- if( (result & 0x07) == UIDNA_LABEL_SEPARATOR){
- return TRUE;
- }
- }
- return FALSE;
-}
-
-U_NAMESPACE_END
-
-#endif /* #if !UCONFIG_NO_IDNA */
diff --git a/icuSources/common/strprep.h b/icuSources/common/strprep.h
deleted file mode 100644
index 5b381a4c..00000000
--- a/icuSources/common/strprep.h
+++ /dev/null
@@ -1,365 +0,0 @@
-/*
- *******************************************************************************
- *
- * Copyright (C) 2003, International Business Machines
- * Corporation and others. All Rights Reserved.
- *
- *******************************************************************************
- * file name: strprep.h
- * encoding: US-ASCII
- * tab size: 8 (not used)
- * indentation:4
- *
- * created on: 2003feb1
- * created by: Ram Viswanadha
- */
-
-#ifndef STRPREP_H
-#define STRPREP_H
-
-#include "unicode/utypes.h"
-
-#if !UCONFIG_NO_IDNA
-
-#include "unicode/uobject.h"
-#include "unicode/uniset.h"
-#include "unicode/parseerr.h"
-
-U_NAMESPACE_BEGIN
-
-/**\file
- *
- * This API implements RF 3454 StringPrep standard.
- *
- * The steps for preparing strings are:
- *
- * 1) Map -- For each character in the input, check if it has a mapping
- * and, if so, replace it with its mapping.
- *
- * - Delete certain codepoints from the input because their
- * presence or absence in the protocol identifies should not
- * make two strings different
- * - Case Mapings
- *
If Normalization is turned off
- *
Get mappings from case map tables
- *
else
- *
Get mappings from case map tables for normalization
- *
Use u_getFC_NFKC_Closure for obtaining extra mappings
- *
- *
- * 2) Normalize -- Possibly normalize the result of step 1 using Unicode
- * normalization NFKC.
- *
- * 3) Prohibit -- Check for any characters that are not allowed in the
- * output. If any are found, return an error.
- *
- * 4) Check bidi -- Possibly check for right-to-left characters, and if
- * any are found, make sure that the whole string satisfies the
- * requirements for bidirectional strings. If the string does not
- * satisfy the requirements for bidirectional strings, return an
- * error.
- *
- * Some StringPrep profiles:
- * IDN: "Nameprep" http://www.ietf.org/rfc/rfc3491.txt
- * XMPP Node Identifiers: "Nodeprep" http://www.ietf.org/internet-drafts/draft-ietf-xmpp-nodeprep-01.txt
- * XMPP Resource Identifiers: "Resourceprep" http://www.ietf.org/internet-drafts/draft-ietf-xmpp-resourceprep-01.txt
- * ANONYMOUS SASL tokens: "plain" http://www.ietf.org/internet-drafts/draft-ietf-sasl-anon-00.txt
- * iSCSI http://www.ietf.org/internet-drafts/draft-ietf-ips-iscsi-string-prep-03.txt
- */
-class StringPrep : public UObject{
-
-protected:
- UVersionInfo unicodeVersion; /** The Character repertoire version of this profile */
- UBool bidiCheck; /** Option to turn BiDi checking on */
- UBool doNFKC; /** Option to turn NFKC on */
-
- /**
- * Protected default constructor sub classes
- */
- StringPrep(){};
-
-public:
- /**
- * Destructor
- */
- virtual inline ~StringPrep(){};
-
- /**
- * Map every character in input stream with mapping character
- * in the mapping table and populate the output stream.
- * For any individual character the mapping table may specify
- * that that a character be mapped to nothing, mapped to one
- * other character or to a string of other characters.
- *
- * @param src Pointer to UChar buffer containing a single label
- * @param srcLength Number of characters in the source label
- * @param dest Pointer to the destination buffer to receive the output
- * @param destCapacity The capacity of destination array
- * @param allowUnassigned Unassigned values can be converted to ASCII for query operations
- * If TRUE unassigned values are treated as normal Unicode code point.
- * If FALSE the operation fails with U_UNASSIGNED_CODE_POINT_FOUND error code.
- * @param status ICU error code in/out parameter.
- * Must fulfill U_SUCCESS before the function call.
- * @return The number of UChars in the destination buffer
- *
- */
- virtual int32_t map(const UChar* src, int32_t srcLength,
- UChar* dest, int32_t destCapacity,
- UBool allowUnassigned,
- UParseError* parseError,
- UErrorCode& status );
-
- /**
- * Normalize the input stream using Normalization Form KC (NFKC)
- *
- * @param src Pointer to UChar buffer containing a single label
- * @param srcLength Number of characters in the source label
- * @param dest Pointer to the destination buffer to receive the output
- * @param destCapacity The capacity of destination array
- * @param status ICU error code in/out parameter.
- * Must fulfill U_SUCCESS before the function call.
- * @return The number of UChars in the destination buffer
- *
- *
- */
- virtual int32_t normalize( const UChar* src, int32_t srcLength,
- UChar* dest, int32_t destCapacity,
- UErrorCode& status );
-
-
- /**
- * Prepare the input stream with for use. This operation maps, normalizes(NFKC),
- * checks for prohited and BiDi characters in the order defined by RFC 3454
- *
- * @param src Pointer to UChar buffer containing a single label
- * @param srcLength Number of characters in the source label
- * @param dest Pointer to the destination buffer to receive the output
- * @param destCapacity The capacity of destination array
- * @param allowUnassigned Unassigned values can be converted to ASCII for query operations
- * If TRUE unassigned values are treated as normal Unicode code point.
- * If FALSE the operation fails with U_UNASSIGNED_CODE_POINT error code.
- * @param status ICU error code in/out parameter.
- * Must fulfill U_SUCCESS before the function call.
- * @return The number of UChars in the destination buffer
- *
- *
- */
- virtual int32_t process(const UChar* src, int32_t srcLength,
- UChar* dest, int32_t destCapacity,
- UBool allowUnassigned,
- UParseError* parseError,
- UErrorCode& status );
-
- /**
- * Create a profile from prebuilt default Nameprep profile conforming to
- * nameprep internet draft (http://www.ietf.org/html.charters/idn-charter.html).
- * This is a built-in/unmodifiable profile.
- *
- * @param status ICU error code in/out parameter.
- * Must fulfill U_SUCCESS before the function call.
- * @return Pointer to StringPrep object that is created. Should be deleted by
- * by caller
- *
- *
- */
- static StringPrep* createNameprepInstance(UErrorCode& status);
-
- /**
- * Create a profile from prebuilt default StringPrep profile conforming to
- * RFC 3454 (ftp://ftp.rfc-editor.org/in-notes/rfc3454.txt).
- * User defined profiles can be created by getting the default profile and
- * adding mappings, removing mappings, turning options ON/OFF and prohibiting
- * characters from the output.
- *
- * @param status ICU error code in/out parameter.
- * Must fulfill U_SUCCESS before the function call.
- * @return Pointer to StringPrep object that is created. Should be deleted by
- * the caller.
- *
- *
- */
- static StringPrep* createDefaultInstance(UErrorCode& status);
-
- /**
- * Ascertain if the given code point is a Letter/Digit/Hyphen in the ASCII range
- *
- * @return TRUE is the code point is a Letter/Digit/Hyphen
- *
- *
- */
- static inline UBool isLDHChar(UChar32 ch);
-
- /**
- * Ascertain if the given code point is a label separator as specified by IDNA
- *
- * @return TRUE is the code point is a label separator
- *
- *
- */
- virtual UBool isLabelSeparator(UChar32 ch, UErrorCode& status);
-
- /**
- * Get the BiDi option of this profile
- *
- *
- */
- inline UBool getCheckBiDi();
-
- /**
- * Get the normalization (NFKC) option of this profile
- *
- * @return The normalization option
- *
- *
- */
- inline UBool getNormalization();
-
- /**
- * Get the Unicode version which this profile
- * conforms to
- *
- *
- */
- inline void getUnicodeVersion(UVersionInfo& info);
-
-private:
- // Boiler plate
-
- /**
- * Copy constructor.
- *
- */
- StringPrep(const StringPrep&);
-
- /**
- * Assignment operator.
- *
- */
- StringPrep& operator=(const StringPrep&);
-
- /**
- * Return true if another object is semantically equal to this one.
- *
- * @param other the object to be compared with.
- * @return true if another object is semantically equal to this one.
- *
- */
- UBool operator==(const StringPrep& other) const {return FALSE;};
-
- /**
- * Return true if another object is semantically unequal to this one.
- *
- * @param other the object to be compared with.
- * @return true if another object is semantically unequal to this one.
- *
- */
- UBool operator!=(const StringPrep& other) const { return !operator==(other); }
-
-public:
-
- /**
- * ICU "poor man's RTTI", returns a UClassID for this class.
- *
- *
- */
- static inline UClassID getStaticClassID();
-
- /**
- * ICU "poor man's RTTI", returns a UClassID for the actual class.
- *
- *
- */
- virtual inline UClassID getDynamicClassID() const;
-
-protected:
-
- /**
- * Sub classes that slightly modify the default profile
- * implement this method to remove characters to
- * the prohibited list. The default implementation does not
- * check if the data is loaded or not. The caller is responsible
- * for checking for data.
- *
- */
- virtual UBool isNotProhibited(UChar32 ch);
-
- /**
- * Sub classes that slightly modify the default profile
- * implement this method to remove characters to
- * the unassigned list. The default implementation does not
- * check if the data is loaded or not. The caller is responsible
- * for checking for data.
- */
- virtual UBool isUnassigned(UChar32 ch);
-
- /**
- * Ascertains if uidna.icu data file is loaded.
- * If data is not loaded, loads the data file.
- *
- *
- */
- static UBool isDataLoaded(UErrorCode& status);
-
-private:
-
- /**
- * The address of this static class variable serves as this class's ID
- * for ICU "poor man's RTTI".
- */
- static const char fgClassID;
-
-};
-
-inline UBool StringPrep::getCheckBiDi(){
- return bidiCheck;
-}
-
-
-inline UBool StringPrep::getNormalization(){
- return doNFKC;
-}
-
-inline void StringPrep::getUnicodeVersion(UVersionInfo& info){
- for(int32_t i=0; i< (int32_t)(sizeof(info)/sizeof(info[0])); i++){
- info[i] = unicodeVersion[i];
- }
-}
-
-inline UClassID StringPrep::getStaticClassID() {
- return (UClassID)&fgClassID;
-}
-
-inline UClassID StringPrep::getDynamicClassID() const {
- return getStaticClassID();
-}
-
-inline UBool StringPrep::isLDHChar(UChar32 ch){
- // high runner case
- if(ch>0x007A){
- return FALSE;
- }
- //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
- if( (ch==0x002D) ||
- (0x0030 <= ch && ch <= 0x0039) ||
- (0x0041 <= ch && ch <= 0x005A) ||
- (0x0061 <= ch && ch <= 0x007A)
- ){
- return TRUE;
- }
- return FALSE;
-}
-
-U_NAMESPACE_END
-
-#endif /* #if !UCONFIG_NO_IDNA */
-
-#endif
-
-/*
- * Hey, Emacs, please set the following:
- *
- * Local Variables:
- * indent-tabs-mode: nil
- * End:
- *
- */
diff --git a/icuSources/common/symtable.h b/icuSources/common/symtable.h
deleted file mode 100644
index 0a11c118..00000000
--- a/icuSources/common/symtable.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
-**********************************************************************
-* Copyright (c) 2000, International Business Machines
-* Corporation and others. All Rights Reserved.
-**********************************************************************
-* Date Name Description
-* 02/04/00 aliu Creation.
-**********************************************************************
-*/
-#ifndef SYMTABLE_H
-#define SYMTABLE_H
-
-#include "unicode/utypes.h"
-#include "unicode/uobject.h"
-
-U_NAMESPACE_BEGIN
-
-class ParsePosition;
-class UnicodeFunctor;
-class UnicodeSet;
-class UnicodeString;
-
-/**
- * An interface that maps strings to objects. This interface defines
- * both lookup protocol and parsing. This allows different components
- * to share a symbol table and to handle name parsing uniformly. It
- * is expected that client parse code look for the SYMBOL_REF
- * character and, when seen, attempt to parse the characters after it
- * using parseReference().
- *
- * Currently, RuleBasedTransliterator and UnicodeSet use this
- * interface to share variable definitions.
- */
-class SymbolTable /* not : public UObject because this is an interface/mixin class */ {
-public:
-
- /**
- * The character preceding a symbol reference name.
- */
- enum { SYMBOL_REF = 0x0024 /*$*/ };
-
- /**
- * Destructor.
- */
- virtual inline ~SymbolTable() {};
-
- /**
- * Lookup the characters associated with this string and return it.
- * Return NULL if no such name exists. The resultant
- * string may have length zero.
- */
- virtual const UnicodeString* lookup(const UnicodeString& s) const = 0;
-
- /**
- * Lookup the UnicodeMatcher associated with the given character, and
- * return it. Return null if not found.
- */
- virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const = 0;
-
- /**
- * Parse a symbol reference name from the given string, starting
- * at the given position. If no valid symbol reference name is
- * found, return an empty string.
- * @param text the text to parse for the name
- * @param pos on entry, the index of the first character to parse.
- * This is the character following the SYMBOL_REF character. On
- * exit, the index after the last parsed character.
- * @param limit the index after the last character to be parsed.
- * @return the parsed name or an empty string.
- */
- virtual UnicodeString parseReference(const UnicodeString& text,
- ParsePosition& pos, int32_t limit) const = 0;
-};
-U_NAMESPACE_END
-
-
-#endif
diff --git a/icuSources/common/uarrsort.c b/icuSources/common/uarrsort.c
new file mode 100644
index 00000000..8bc967ce
--- /dev/null
+++ b/icuSources/common/uarrsort.c
@@ -0,0 +1,236 @@
+/*
+*******************************************************************************
+*
+* Copyright (C) 2003, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: uarrsort.c
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2003aug04
+* created by: Markus W. Scherer
+*
+* Internal function for sorting arrays.
+*/
+
+#include "unicode/utypes.h"
+#include "cmemory.h"
+#include "uarrsort.h"
+
+enum {
+ MIN_QSORT=9, /* from Knuth */
+ STACK_ITEM_SIZE=200
+};
+
+/* UComparator convenience implementations ---------------------------------- */
+
+/* Orders two uint16_t items by value; context is not used. */
+U_CAPI int32_t U_EXPORT2
+uprv_uint16Comparator(const void *context, const void *left, const void *right) {
+    const uint16_t *pLeft=(const uint16_t *)left;
+    const uint16_t *pRight=(const uint16_t *)right;
+
+    /* widen before subtracting: a uint16_t difference always fits in int32_t */
+    return (int32_t)*pLeft - (int32_t)*pRight;
+}
+
+/*
+ * Orders two int32_t items by value; context is not used.
+ * Returns -1, 0 or +1. The values are compared rather than subtracted
+ * because left-right can overflow int32_t (e.g. INT32_MAX - (-1)),
+ * which would yield the wrong sign.
+ */
+U_CAPI int32_t U_EXPORT2
+uprv_int32Comparator(const void *context, const void *left, const void *right) {
+    int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
+
+    if(l<r) {
+        return -1;
+    } else if(l==r) {
+        return 0;
+    } else /* l>r */ {
+        return 1;
+    }
+}
+
+/*
+ * Orders two uint32_t items by value; context is not used.
+ * Returns -1, 0 or +1.
+ */
+U_CAPI int32_t U_EXPORT2
+uprv_uint32Comparator(const void *context, const void *left, const void *right) {
+    uint32_t l=*(const uint32_t *)left, r=*(const uint32_t *)right;
+
+    /* compare directly because (l-r) would overflow the int32_t result */
+    if(l<r) {
+        return -1;
+    } else if(l==r) {
+        return 0;
+    } else /* l>r */ {
+        return 1;
+    }
+}
+
+/* Straight insertion sort from Knuth vol. III, pg. 81 ---------------------- */
+
+/*
+ * Sorts the items array[start..limit[ in place with straight insertion
+ * (start is inclusive, limit is exclusive).
+ * pv points to caller-provided scratch space for one item (itemSize bytes);
+ * cmp is called as cmp(context, item, otherItem).
+ */
+static void
+doInsertionSort(char *array, int32_t start, int32_t limit, int32_t itemSize,
+                UComparator *cmp, const void *context, void *pv) {
+    int32_t i, j;
+
+    for(j=start+1; j<limit; ++j) {
+        /* v=array[j] */
+        uprv_memcpy(pv, array+j*itemSize, itemSize);
+
+        /* shift larger items up by one until the slot for v is found */
+        for(i=j; i>start; --i) {
+            if(/* v>=array[i-1] */ cmp(context, pv, array+(i-1)*itemSize)>=0) {
+                break;
+            }
+
+            /* array[i]=array[i-1]; */
+            uprv_memcpy(array+i*itemSize, array+(i-1)*itemSize, itemSize);
+        }
+
+        if(i!=j) {
+            /* array[i]=v; */
+            uprv_memcpy(array+i*itemSize, pv, itemSize);
+        }
+    }
+}
+
+/*
+ * Insertion-sorts the whole array (items 0..length-1).
+ * Items of up to STACK_ITEM_SIZE bytes use a stack buffer as the
+ * intermediate item; larger items use a temporary heap buffer, and
+ * *pErrorCode is set to U_MEMORY_ALLOCATION_ERROR if that allocation fails.
+ */
+static void
+insertionSort(char *array, int32_t length, int32_t itemSize,
+              UComparator *cmp, const void *context, UErrorCode *pErrorCode) {
+    UAlignedMemory stackItem[STACK_ITEM_SIZE/sizeof(UAlignedMemory)+1];
+    void *pItem=stackItem;
+
+    /* fall back to the heap only when one item does not fit on the stack */
+    if(itemSize>STACK_ITEM_SIZE) {
+        pItem=uprv_malloc(itemSize);
+        if(pItem==NULL) {
+            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+    }
+
+    doInsertionSort(array, 0, length, itemSize, cmp, context, pItem);
+
+    /* release the intermediate item only if it came from the heap */
+    if(pItem!=stackItem) {
+        uprv_free(pItem);
+    }
+}
+
+/* QuickSort ---------------------------------------------------------------- */
+
+/*
+ * This implementation is semi-recursive:
+ * It recurses for the smaller sub-array to shorten the recursion depth,
+ * and loops for the larger sub-array.
+ *
+ * Loosely after QuickSort algorithms in
+ * Niklaus Wirth
+ * Algorithmen und Datenstrukturen mit Modula-2
+ * B.G. Teubner Stuttgart
+ * 4. Auflage 1986
+ * ISBN 3-519-02260-5
+ */
+static void
+subQuickSort(char *array, int32_t start, int32_t limit, int32_t itemSize,
+ UComparator *cmp, const void *context,
+ void *px, void *pw) {
+ int32_t left, right;
+
+ /* start and left are inclusive, limit and right are exclusive */
+ do {
+ if((start+MIN_QSORT)>=limit) {
+ doInsertionSort(array, start, limit, itemSize, cmp, context, px);
+ break;
+ }
+
+ left=start;
+ right=limit;
+
+ /* x=array[middle] */
+ uprv_memcpy(px, array+((start+limit)/2)*itemSize, itemSize);
+
+ do {
+ while(/* array[left]0 && array==NULL) || length<0 || itemSize<=0 || cmp==NULL) {
+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
+
+ if(length<=1) {
+ return;
+ } else if(length0 if left>right
+ *
+ * @internal
+ */
+typedef int32_t U_CALLCONV
+UComparator(const void *context, const void *left, const void *right);
+U_CDECL_END
+
+/**
+ * Array sorting function.
+ * Uses a UComparator for comparing array items to each other, and simple
+ * memory copying to move items.
+ *
+ * @param array The array to be sorted.
+ * @param length The number of items in the array.
+ * @param itemSize The size in bytes of each array item.
+ * @param cmp UComparator function used to compare two items each.
+ * @param context Application-specific pointer, passed through to the UComparator.
+ * @param sortStable If true, a stable sorting algorithm must be used.
+ * @param pErrorCode ICU in/out UErrorCode parameter.
+ *
+ * @internal
+ */
+U_CAPI void U_EXPORT2
+uprv_sortArray(void *array, int32_t length, int32_t itemSize,
+ UComparator *cmp, const void *context,
+ UBool sortStable, UErrorCode *pErrorCode);
+
+/**
+ * Convenience UComparator implementation for uint16_t arrays.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+uprv_uint16Comparator(const void *context, const void *left, const void *right);
+
+/**
+ * Convenience UComparator implementation for int32_t arrays.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+uprv_int32Comparator(const void *context, const void *left, const void *right);
+
+/**
+ * Convenience UComparator implementation for uint32_t arrays.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+uprv_uint32Comparator(const void *context, const void *left, const void *right);
+
+#endif
diff --git a/icuSources/common/uassert.h b/icuSources/common/uassert.h
index 0a6080db..9b1a1416 100644
--- a/icuSources/common/uassert.h
+++ b/icuSources/common/uassert.h
@@ -1,7 +1,7 @@
/*
******************************************************************************
*
-* Copyright (C) 2002, International Business Machines
+* Copyright (C) 2002-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@@ -21,8 +21,12 @@
#define U_ASSERT_H
/* utypes.h is included to get the proper define for uint8_t */
#include "unicode/utypes.h"
+/* U_RELEASE builds define U_ASSERT(exp) as empty; other builds map it to assert(exp). */
+#if U_RELEASE
+#define U_ASSERT(exp)
+#else
#include <assert.h>
#define U_ASSERT(exp) assert(exp)
#endif
+#endif
diff --git a/icuSources/common/ubidi.c b/icuSources/common/ubidi.c
index 63ee4fee..2e7713f0 100644
--- a/icuSources/common/ubidi.c
+++ b/icuSources/common/ubidi.c
@@ -1,7 +1,7 @@
/*
******************************************************************************
*
-* Copyright (C) 1999-2001, International Business Machines
+* Copyright (C) 1999-2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@@ -107,28 +107,6 @@
* (L1) is not necessary in adjustWSLevels().
*/
-/* prototypes --------------------------------------------------------------- */
-
-static void
-getDirProps(UBiDi *pBiDi, const UChar *text);
-
-static UBiDiDirection
-resolveExplicitLevels(UBiDi *pBiDi);
-
-static UBiDiDirection
-checkExplicitLevels(UBiDi *pBiDi, UErrorCode *pErrorCode);
-
-static UBiDiDirection
-directionFromFlags(Flags flags);
-
-static void
-resolveImplicitLevels(UBiDi *pBiDi,
- int32_t start, int32_t limit,
- DirProp sor, DirProp eor);
-
-static void
-adjustWSLevels(UBiDi *pBiDi);
-
/* to avoid some conditional statements, use tiny constant arrays */
static const Flags flagLR[2]={ DIRPROP_FLAG(L), DIRPROP_FLAG(R) };
static const Flags flagE[2]={ DIRPROP_FLAG(LRE), DIRPROP_FLAG(RLE) };
@@ -281,199 +259,6 @@ ubidi_isInverse(UBiDi *pBiDi) {
}
}
-/* ubidi_setPara ------------------------------------------------------------ */
-
-U_CAPI void U_EXPORT2
-ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length,
- UBiDiLevel paraLevel, UBiDiLevel *embeddingLevels,
- UErrorCode *pErrorCode) {
- UBiDiDirection direction;
-
- /* check the argument values */
- if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
- return;
- } else if(pBiDi==NULL || text==NULL ||
- ((UBIDI_MAX_EXPLICIT_LEVELtext=text;
- pBiDi->length=length;
- pBiDi->paraLevel=paraLevel;
- pBiDi->direction=UBIDI_LTR;
- pBiDi->trailingWSStart=length; /* the levels[] will reflect the WS run */
-
- pBiDi->dirProps=NULL;
- pBiDi->levels=NULL;
- pBiDi->runs=NULL;
-
- if(length==0) {
- /*
- * For an empty paragraph, create a UBiDi object with the paraLevel and
- * the flags and the direction set but without allocating zero-length arrays.
- * There is nothing more to do.
- */
- if(IS_DEFAULT_LEVEL(paraLevel)) {
- pBiDi->paraLevel&=1;
- }
- if(paraLevel&1) {
- pBiDi->flags=DIRPROP_FLAG(R);
- pBiDi->direction=UBIDI_RTL;
- } else {
- pBiDi->flags=DIRPROP_FLAG(L);
- pBiDi->direction=UBIDI_LTR;
- }
-
- pBiDi->runCount=0;
- return;
- }
-
- pBiDi->runCount=-1;
-
- /*
- * Get the directional properties,
- * the flags bit-set, and
- * determine the partagraph level if necessary.
- */
- if(getDirPropsMemory(pBiDi, length)) {
- pBiDi->dirProps=pBiDi->dirPropsMemory;
- getDirProps(pBiDi, text);
- } else {
- *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
- return;
- }
-
- /* are explicit levels specified? */
- if(embeddingLevels==NULL) {
- /* no: determine explicit levels according to the (Xn) rules */\
- if(getLevelsMemory(pBiDi, length)) {
- pBiDi->levels=pBiDi->levelsMemory;
- direction=resolveExplicitLevels(pBiDi);
- } else {
- *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- } else {
- /* set BN for all explicit codes, check that all levels are paraLevel..UBIDI_MAX_EXPLICIT_LEVEL */
- pBiDi->levels=embeddingLevels;
- direction=checkExplicitLevels(pBiDi, pErrorCode);
- if(U_FAILURE(*pErrorCode)) {
- return;
- }
- }
-
- /*
- * The steps after (X9) in the UBiDi algorithm are performed only if
- * the paragraph text has mixed directionality!
- */
- pBiDi->direction=direction;
- switch(direction) {
- case UBIDI_LTR:
- /* make sure paraLevel is even */
- pBiDi->paraLevel=(UBiDiLevel)((pBiDi->paraLevel+1)&~1);
-
- /* all levels are implicitly at paraLevel (important for ubidi_getLevels()) */
- pBiDi->trailingWSStart=0;
- break;
- case UBIDI_RTL:
- /* make sure paraLevel is odd */
- pBiDi->paraLevel|=1;
-
- /* all levels are implicitly at paraLevel (important for ubidi_getLevels()) */
- pBiDi->trailingWSStart=0;
- break;
- default:
- /*
- * If there are no external levels specified and there
- * are no significant explicit level codes in the text,
- * then we can treat the entire paragraph as one run.
- * Otherwise, we need to perform the following rules on runs of
- * the text with the same embedding levels. (X10)
- * "Significant" explicit level codes are ones that actually
- * affect non-BN characters.
- * Examples for "insignificant" ones are empty embeddings
- * LRE-PDF, LRE-RLE-PDF-PDF, etc.
- */
- if(embeddingLevels==NULL && !(pBiDi->flags&DIRPROP_FLAG_MULTI_RUNS)) {
- resolveImplicitLevels(pBiDi, 0, length,
- GET_LR_FROM_LEVEL(pBiDi->paraLevel),
- GET_LR_FROM_LEVEL(pBiDi->paraLevel));
- } else {
- /* sor, eor: start and end types of same-level-run */
- UBiDiLevel *levels=pBiDi->levels;
- int32_t start, limit=0;
- UBiDiLevel level, nextLevel;
- DirProp sor, eor;
-
- /* determine the first sor and set eor to it because of the loop body (sor=eor there) */
- level=pBiDi->paraLevel;
- nextLevel=levels[0];
- if(levelparaLevel;
- }
-
- /* determine eor from max(level, nextLevel); sor is last run's eor */
- if((level&~UBIDI_LEVEL_OVERRIDE)<(nextLevel&~UBIDI_LEVEL_OVERRIDE)) {
- eor=GET_LR_FROM_LEVEL(nextLevel);
- } else {
- eor=GET_LR_FROM_LEVEL(level);
- }
-
- /* if the run consists of overridden directional types, then there
- are no implicit types to be resolved */
- if(!(level&UBIDI_LEVEL_OVERRIDE)) {
- resolveImplicitLevels(pBiDi, start, limit, sor, eor);
- } else {
- /* remove the UBIDI_LEVEL_OVERRIDE flags */
- do {
- levels[start++]&=~UBIDI_LEVEL_OVERRIDE;
- } while(startisInverse) {
- if(!ubidi_getRuns(pBiDi)) {
- *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- }
- break;
- }
-}
-
/* perform (P2)..(P3) ------------------------------------------------------- */
/*
@@ -520,6 +305,8 @@ getDirProps(UBiDi *pBiDi, const UChar *text) {
break;
}
}
+ } else {
+ flags|=DIRPROP_FLAG_LR(pBiDi->paraLevel);
}
/* get the rest of the directional properties and the flags bits */
@@ -544,6 +331,19 @@ getDirProps(UBiDi *pBiDi, const UChar *text) {
/* perform (X1)..(X9) ------------------------------------------------------- */
+/*
+ * Classify the text as LTR, RTL or mixed-directional from the set of
+ * directional-property flags collected for it.
+ */
+static UBiDiDirection
+directionFromFlags(Flags flags) {
+    /*
+     * No RTL flag at all, and no AN together with possible neutrals
+     * (if the text contains AN and neutrals, then some neutrals may become RTL).
+     */
+    if(!(flags&MASK_RTL) && !((flags&DIRPROP_FLAG(AN)) && (flags&MASK_POSSIBLE_N))) {
+        return UBIDI_LTR;
+    }
+
+    /* something may be RTL: mixed if there is also LTR content, else purely RTL */
+    if(flags&MASK_LTR) {
+        return UBIDI_MIXED;
+    }
+    return UBIDI_RTL;
+}
+
/*
* Resolve the explicit levels as specified by explicit embedding codes.
* Recalculate the flags to have them reflect the real properties
@@ -596,7 +396,6 @@ getDirProps(UBiDi *pBiDi, const UChar *text) {
*
* This implementation assumes that UBIDI_MAX_EXPLICIT_LEVEL is odd.
*/
-
static UBiDiDirection
resolveExplicitLevels(UBiDi *pBiDi) {
const DirProp *dirProps=pBiDi->dirProps;
@@ -788,19 +587,6 @@ checkExplicitLevels(UBiDi *pBiDi, UErrorCode *pErrorCode) {
return directionFromFlags(flags);
}
-/* determine if the text is mixed-directional or single-directional */
-static UBiDiDirection
-directionFromFlags(Flags flags) {
- /* if the text contains AN and neutrals, then some neutrals may become RTL */
- if(!(flags&MASK_RTL || ((flags&DIRPROP_FLAG(AN)) && (flags&MASK_POSSIBLE_N)))) {
- return UBIDI_LTR;
- } else if(!(flags&MASK_LTR)) {
- return UBIDI_RTL;
- } else {
- return UBIDI_MIXED;
- }
-}
-
/* perform rules (Wn), (Nn), and (In) on a run of the text ------------------ */
/*
@@ -1211,7 +997,198 @@ adjustWSLevels(UBiDi *pBiDi) {
}
}
-/* -------------------------------------------------------------------------- */
+/* ubidi_setPara ------------------------------------------------------------ */
+
+U_CAPI void U_EXPORT2
+ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length,
+ UBiDiLevel paraLevel, UBiDiLevel *embeddingLevels,
+ UErrorCode *pErrorCode) {
+ UBiDiDirection direction;
+
+ /* check the argument values */
+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+ return;
+ } else if(pBiDi==NULL || text==NULL ||
+ ((UBIDI_MAX_EXPLICIT_LEVELtext=text;
+ pBiDi->length=length;
+ pBiDi->paraLevel=paraLevel;
+ pBiDi->direction=UBIDI_LTR;
+ pBiDi->trailingWSStart=length; /* the levels[] will reflect the WS run */
+
+ pBiDi->dirProps=NULL;
+ pBiDi->levels=NULL;
+ pBiDi->runs=NULL;
+
+ if(length==0) {
+ /*
+ * For an empty paragraph, create a UBiDi object with the paraLevel and
+ * the flags and the direction set but without allocating zero-length arrays.
+ * There is nothing more to do.
+ */
+ if(IS_DEFAULT_LEVEL(paraLevel)) {
+ pBiDi->paraLevel&=1;
+ }
+ if(paraLevel&1) {
+ pBiDi->flags=DIRPROP_FLAG(R);
+ pBiDi->direction=UBIDI_RTL;
+ } else {
+ pBiDi->flags=DIRPROP_FLAG(L);
+ pBiDi->direction=UBIDI_LTR;
+ }
+
+ pBiDi->runCount=0;
+ return;
+ }
+
+ pBiDi->runCount=-1;
+
+ /*
+ * Get the directional properties,
+ * the flags bit-set, and
+ * determine the paragraph level if necessary.
+ */
+ if(getDirPropsMemory(pBiDi, length)) {
+ pBiDi->dirProps=pBiDi->dirPropsMemory;
+ getDirProps(pBiDi, text);
+ } else {
+ *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+
+ /* are explicit levels specified? */
+ if(embeddingLevels==NULL) {
+ /* no: determine explicit levels according to the (Xn) rules */\
+ if(getLevelsMemory(pBiDi, length)) {
+ pBiDi->levels=pBiDi->levelsMemory;
+ direction=resolveExplicitLevels(pBiDi);
+ } else {
+ *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ } else {
+ /* set BN for all explicit codes, check that all levels are paraLevel..UBIDI_MAX_EXPLICIT_LEVEL */
+ pBiDi->levels=embeddingLevels;
+ direction=checkExplicitLevels(pBiDi, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ return;
+ }
+ }
+
+ /*
+ * The steps after (X9) in the UBiDi algorithm are performed only if
+ * the paragraph text has mixed directionality!
+ */
+ pBiDi->direction=direction;
+ switch(direction) {
+ case UBIDI_LTR:
+ /* make sure paraLevel is even */
+ pBiDi->paraLevel=(UBiDiLevel)((pBiDi->paraLevel+1)&~1);
+
+ /* all levels are implicitly at paraLevel (important for ubidi_getLevels()) */
+ pBiDi->trailingWSStart=0;
+ break;
+ case UBIDI_RTL:
+ /* make sure paraLevel is odd */
+ pBiDi->paraLevel|=1;
+
+ /* all levels are implicitly at paraLevel (important for ubidi_getLevels()) */
+ pBiDi->trailingWSStart=0;
+ break;
+ default:
+ /*
+ * If there are no external levels specified and there
+ * are no significant explicit level codes in the text,
+ * then we can treat the entire paragraph as one run.
+ * Otherwise, we need to perform the following rules on runs of
+ * the text with the same embedding levels. (X10)
+ * "Significant" explicit level codes are ones that actually
+ * affect non-BN characters.
+ * Examples for "insignificant" ones are empty embeddings
+ * LRE-PDF, LRE-RLE-PDF-PDF, etc.
+ */
+ if(embeddingLevels==NULL && !(pBiDi->flags&DIRPROP_FLAG_MULTI_RUNS)) {
+ resolveImplicitLevels(pBiDi, 0, length,
+ GET_LR_FROM_LEVEL(pBiDi->paraLevel),
+ GET_LR_FROM_LEVEL(pBiDi->paraLevel));
+ } else {
+ /* sor, eor: start and end types of same-level-run */
+ UBiDiLevel *levels=pBiDi->levels;
+ int32_t start, limit=0;
+ UBiDiLevel level, nextLevel;
+ DirProp sor, eor;
+
+ /* determine the first sor and set eor to it because of the loop body (sor=eor there) */
+ level=pBiDi->paraLevel;
+ nextLevel=levels[0];
+ if(levelparaLevel;
+ }
+
+ /* determine eor from max(level, nextLevel); sor is last run's eor */
+ if((level&~UBIDI_LEVEL_OVERRIDE)<(nextLevel&~UBIDI_LEVEL_OVERRIDE)) {
+ eor=GET_LR_FROM_LEVEL(nextLevel);
+ } else {
+ eor=GET_LR_FROM_LEVEL(level);
+ }
+
+ /* if the run consists of overridden directional types, then there
+ are no implicit types to be resolved */
+ if(!(level&UBIDI_LEVEL_OVERRIDE)) {
+ resolveImplicitLevels(pBiDi, start, limit, sor, eor);
+ } else {
+ /* remove the UBIDI_LEVEL_OVERRIDE flags */
+ do {
+ levels[start++]&=~UBIDI_LEVEL_OVERRIDE;
+ } while(startisInverse) {
+ if(!ubidi_getRuns(pBiDi)) {
+ *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ }
+ break;
+ }
+}
U_CAPI UBiDiDirection U_EXPORT2
ubidi_getDirection(const UBiDi *pBiDi) {
diff --git a/icuSources/common/ubidiln.c b/icuSources/common/ubidiln.c
index 6680cc60..6731cf2a 100644
--- a/icuSources/common/ubidiln.c
+++ b/icuSources/common/ubidiln.c
@@ -1,7 +1,7 @@
/*
******************************************************************************
*
-* Copyright (C) 1999-2001, International Business Machines
+* Copyright (C) 1999-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@@ -76,21 +76,38 @@
* change the now shared levels for (L1).
*/
-/* prototypes --------------------------------------------------------------- */
+/* handle trailing WS (L1) -------------------------------------------------- */
+/*
+ * setTrailingWSStart() sets the start index for a trailing
+ * run of WS in the line. This is necessary because we do not modify
+ * the paragraph's levels array that we just point into.
+ * Using trailingWSStart is another form of performing (L1).
+ *
+ * To make subsequent operations easier, we also include the run
+ * before the WS if it is at the paraLevel - we merge the two here.
+ */
static void
-setTrailingWSStart(UBiDi *pBiDi);
+setTrailingWSStart(UBiDi *pBiDi) {
+ /* pBiDi->direction!=UBIDI_MIXED */
-static void
-getSingleRun(UBiDi *pBiDi, UBiDiLevel level);
+ const DirProp *dirProps=pBiDi->dirProps;
+ UBiDiLevel *levels=pBiDi->levels;
+ int32_t start=pBiDi->length;
+ UBiDiLevel paraLevel=pBiDi->paraLevel;
-static void
-reorderLine(UBiDi *pBiDi, UBiDiLevel minLevel, UBiDiLevel maxLevel);
+ /* go backwards across all WS, BN, explicit codes */
+ while(start>0 && DIRPROP_FLAG(dirProps[start-1])&MASK_WS) {
+ --start;
+ }
-static UBool
-prepareReorder(const UBiDiLevel *levels, int32_t length,
- int32_t *indexMap,
- UBiDiLevel *pMinLevel, UBiDiLevel *pMaxLevel);
+ /* if the WS run can be merged with the previous run then do so here */
+ while(start>0 && levels[start-1]==paraLevel) {
+ --start;
+ }
+
+ pBiDi->trailingWSStart=start;
+}
/* ubidi_setLine ------------------------------------------------------------ */
@@ -295,39 +312,6 @@ ubidi_getLogicalRun(const UBiDi *pBiDi, int32_t logicalStart,
}
}
-/* handle trailing WS (L1) -------------------------------------------------- */
-
-/*
- * setTrailingWSStart() sets the start index for a trailing
- * run of WS in the line. This is necessary because we do not modify
- * the paragraph's levels array that we just point into.
- * Using trailingWSStart is another form of performing (L1).
- *
- * To make subsequent operations easier, we also include the run
- * before the WS if it is at the paraLevel - we merge the two here.
- */
-static void
-setTrailingWSStart(UBiDi *pBiDi) {
- /* pBiDi->direction!=UBIDI_MIXED */
-
- const DirProp *dirProps=pBiDi->dirProps;
- UBiDiLevel *levels=pBiDi->levels;
- int32_t start=pBiDi->length;
- UBiDiLevel paraLevel=pBiDi->paraLevel;
-
- /* go backwards across all WS, BN, explicit codes */
- while(start>0 && DIRPROP_FLAG(dirProps[start-1])&MASK_WS) {
- --start;
- }
-
- /* if the WS run can be merged with the previous run then do so here */
- while(start>0 && levels[start-1]==paraLevel) {
- --start;
- }
-
- pBiDi->trailingWSStart=start;
-}
-
/* runs API functions ------------------------------------------------------- */
U_CAPI int32_t U_EXPORT2
@@ -367,6 +351,144 @@ ubidi_getVisualRun(UBiDi *pBiDi, int32_t runIndex,
}
}
+/* in trivial cases there is only one trivial run; called by ubidi_getRuns() */
+static void
+getSingleRun(UBiDi *pBiDi, UBiDiLevel level) {
+ /* simple, single-run case */
+ pBiDi->runs=pBiDi->simpleRuns;
+ pBiDi->runCount=1;
+
+ /* fill and reorder the single run */
+ pBiDi->runs[0].logicalStart=MAKE_INDEX_ODD_PAIR(0, level);
+ pBiDi->runs[0].visualLimit=pBiDi->length;
+}
+
+/* reorder the runs array (L2) ---------------------------------------------- */
+
+/*
+ * Reorder the same-level runs in the runs array.
+ * Here, runCount>1 and maxLevel>=minLevel>=paraLevel.
+ * All the visualStart fields=logical start before reordering.
+ * The "odd" bits are not set yet.
+ *
+ * Reordering with this data structure lends itself to some handy shortcuts:
+ *
+ * Since each run is moved but not modified, and since at the initial maxLevel
+ * each sequence of same-level runs consists of only one run each, we
+ * don't need to do anything there and can predecrement maxLevel.
+ * In many simple cases, the reordering is thus done entirely in the
+ * index mapping.
+ * Also, reordering occurs only down to the lowest odd level that occurs,
+ * which is minLevel|1. However, if the lowest level itself is odd, then
+ * in the last reordering the sequence of the runs at this level or higher
+ * will be all runs, and we don't need the elaborate loop to search for them.
+ * This is covered by ++minLevel instead of minLevel|=1 followed
+ * by an extra reorder-all after the reorder-some loop.
+ * About a trailing WS run:
+ * Such a run would need special treatment because its level is not
+ * reflected in levels[] if this is not a paragraph object.
+ * Instead, all characters from trailingWSStart on are implicitly at
+ * paraLevel.
+ * However, for all maxLevel>paraLevel, this run will never be reordered
+ * and does not need to be taken into account. maxLevel==paraLevel is only reordered
+ * if minLevel==paraLevel is odd, which is done in the extra segment.
+ * This means that for the main reordering loop we don't need to consider
+ * this run and can --runCount. If it is later part of the all-runs
+ * reordering, then runCount is adjusted accordingly.
+ */
+static void
+reorderLine(UBiDi *pBiDi, UBiDiLevel minLevel, UBiDiLevel maxLevel) {
+ Run *runs;
+ UBiDiLevel *levels;
+ int32_t firstRun, endRun, limitRun, runCount,
+ temp;
+
+ /* nothing to do? */
+ if(maxLevel<=(minLevel|1)) {
+ return;
+ }
+
+ /*
+ * Reorder only down to the lowest odd level
+ * and reorder at an odd minLevel in a separate, simpler loop.
+ * See comments above for why minLevel is always incremented.
+ */
+ ++minLevel;
+
+ runs=pBiDi->runs;
+ levels=pBiDi->levels;
+ runCount=pBiDi->runCount;
+
+ /* do not include the WS run at paraLevel<=old minLevel except in the simple loop */
+ if(pBiDi->trailingWSStart<pBiDi->length) {
+ --runCount;
+ }
+
+ while(--maxLevel>=minLevel) {
+ firstRun=0;
+
+ /* loop for all sequences of runs */
+ for(;;) {
+ /* look for a sequence of runs that are all at >=maxLevel */
+ /* look for the first run of such a sequence */
+ while(firstRun<runCount && levels[runs[firstRun].logicalStart]<maxLevel) {
+ ++firstRun;
+ }
+ if(firstRun>=runCount) {
+ break; /* no more such runs */
+ }
+
+ /* look for the limit run of such a sequence (the run behind it) */
+ for(limitRun=firstRun; ++limitRun<runCount && levels[runs[limitRun].logicalStart]>=maxLevel;) {}
+
+ /* Swap the entire sequence of runs from firstRun to limitRun-1. */
+ endRun=limitRun-1;
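+ while(firstRun<endRun) {
+ temp=runs[firstRun].logicalStart;
+ runs[firstRun].logicalStart=runs[endRun].logicalStart;
+ runs[endRun].logicalStart=temp;
+
+ temp=runs[firstRun].visualLimit;
+ runs[firstRun].visualLimit=runs[endRun].visualLimit;
+ runs[endRun].visualLimit=temp;
+
+ ++firstRun;
+ --endRun;
+ }
+
+ if(limitRun==runCount) {
+ break; /* no more such runs */
+ } else {
+ firstRun=limitRun+1;
+ }
+ }
+ }
+
+ /* now do maxLevel==old minLevel (==odd!), see above */
+ if(!(minLevel&1)) {
+ firstRun=0;
+
+ /* include the trailing WS run in this complete reordering */
+ if(pBiDi->trailingWSStart==pBiDi->length) {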
+ --runCount;
+ }
+
+ /* Swap the entire sequence of all runs. (endRun==runCount) */
+ while(firstRunparaLevel);
- runs[runIndex].visualLimit+=limit;
+ int32_t trailingRun = ((pBiDi->paraLevel & 1) != 0)? 0 : runIndex;
+
+ ADD_ODD_BIT_FROM_LEVEL(runs[trailingRun].logicalStart, pBiDi->paraLevel);
}
}
}
@@ -501,142 +627,42 @@ ubidi_getRuns(UBiDi *pBiDi) {
return TRUE;
}
-/* in trivial cases there is only one trivial run; called by ubidi_getRuns() */
-static void
-getSingleRun(UBiDi *pBiDi, UBiDiLevel level) {
- /* simple, single-run case */
- pBiDi->runs=pBiDi->simpleRuns;
- pBiDi->runCount=1;
-
- /* fill and reorder the single run */
- pBiDi->runs[0].logicalStart=MAKE_INDEX_ODD_PAIR(0, level);
- pBiDi->runs[0].visualLimit=pBiDi->length;
-}
-
-/* reorder the runs array (L2) ---------------------------------------------- */
-
-/*
- * Reorder the same-level runs in the runs array.
- * Here, runCount>1 and maxLevel>=minLevel>=paraLevel.
- * All the visualStart fields=logical start before reordering.
- * The "odd" bits are not set yet.
- *
- * Reordering with this data structure lends itself to some handy shortcuts:
- *
- * Since each run is moved but not modified, and since at the initial maxLevel
- * each sequence of same-level runs consists of only one run each, we
- * don't need to do anything there and can predecrement maxLevel.
- * In many simple cases, the reordering is thus done entirely in the
- * index mapping.
- * Also, reordering occurs only down to the lowest odd level that occurs,
- * which is minLevel|1. However, if the lowest level itself is odd, then
- * in the last reordering the sequence of the runs at this level or higher
- * will be all runs, and we don't need the elaborate loop to search for them.
- * This is covered by ++minLevel instead of minLevel|=1 followed
- * by an extra reorder-all after the reorder-some loop.
- * About a trailing WS run:
- * Such a run would need special treatment because its level is not
- * reflected in levels[] if this is not a paragraph object.
- * Instead, all characters from trailingWSStart on are implicitly at
- * paraLevel.
- * However, for all maxLevel>paraLevel, this run will never be reordered
- * and does not need to be taken into account. maxLevel==paraLevel is only reordered
- * if minLevel==paraLevel is odd, which is done in the extra segment.
- * This means that for the main reordering loop we don't need to consider
- * this run and can --runCount. If it is later part of the all-runs
- * reordering, then runCount is adjusted accordingly.
- */
-static void
-reorderLine(UBiDi *pBiDi, UBiDiLevel minLevel, UBiDiLevel maxLevel) {
- Run *runs;
- UBiDiLevel *levels;
- int32_t firstRun, endRun, limitRun, runCount,
- temp;
-
- /* nothing to do? */
- if(maxLevel<=(minLevel|1)) {
- return;
- }
-
- /*
- * Reorder only down to the lowest odd level
- * and reorder at an odd minLevel in a separate, simpler loop.
- * See comments above for why minLevel is always incremented.
- */
- ++minLevel;
-
- runs=pBiDi->runs;
- levels=pBiDi->levels;
- runCount=pBiDi->runCount;
+static UBool
+prepareReorder(const UBiDiLevel *levels, int32_t length,
+ int32_t *indexMap,
+ UBiDiLevel *pMinLevel, UBiDiLevel *pMaxLevel) {
+ int32_t start;
+ UBiDiLevel level, minLevel, maxLevel;
- /* do not include the WS run at paraLevel<=old minLevel except in the simple loop */
- if(pBiDi->trailingWSStart<pBiDi->length) {
- --runCount;
+ if(levels==NULL || length<=0) {
+ return FALSE;
}
- while(--maxLevel>=minLevel) {
- firstRun=0;
-
- /* loop for all sequences of runs */
- for(;;) {
- /* look for a sequence of runs that are all at >=maxLevel */
- /* look for the first run of such a sequence */
- while(firstRun<runCount && levels[runs[firstRun].logicalStart]<maxLevel) {
- ++firstRun;
- }
- if(firstRun>=runCount) {
- break; /* no more such runs */
- }
-
- /* look for the limit run of such a sequence (the run behind it) */
- for(limitRun=firstRun; ++limitRun<runCount && levels[runs[limitRun].logicalStart]>=maxLevel;) {}
-
- /* Swap the entire sequence of runs from firstRun to limitRun-1. */
- endRun=limitRun-1;
- while(firstRun0;) {
+ level=levels[--start];
+ if(level>UBIDI_MAX_EXPLICIT_LEVEL+1) {
+ return FALSE;
}
- }
-
- /* now do maxLevel==old minLevel (==odd!), see above */
- if(!(minLevel&1)) {
- firstRun=0;
-
- /* include the trailing WS run in this complete reordering */
- if(pBiDi->trailingWSStart==pBiDi->length) {
- --runCount;
+ if(levelmaxLevel) {
+ maxLevel=level;
}
}
+ *pMinLevel=minLevel;
+ *pMaxLevel=maxLevel;
+
+ /* initialize the index map */
+ for(start=length; start>0;) {
+ --start;
+ indexMap[start]=start;
+ }
+
+ return TRUE;
}
/* reorder a line based on a levels array (L2) ------------------------------ */
@@ -764,44 +790,6 @@ ubidi_reorderVisual(const UBiDiLevel *levels, int32_t length, int32_t *indexMap)
} while(--maxLevel>=minLevel);
}
-static UBool
-prepareReorder(const UBiDiLevel *levels, int32_t length,
- int32_t *indexMap,
- UBiDiLevel *pMinLevel, UBiDiLevel *pMaxLevel) {
- int32_t start;
- UBiDiLevel level, minLevel, maxLevel;
-
- if(levels==NULL || length<=0) {
- return FALSE;
- }
-
- /* determine minLevel and maxLevel */
- minLevel=UBIDI_MAX_EXPLICIT_LEVEL+1;
- maxLevel=0;
- for(start=length; start>0;) {
- level=levels[--start];
- if(level>UBIDI_MAX_EXPLICIT_LEVEL+1) {
- return FALSE;
- }
- if(level<minLevel) {
- minLevel=level;
- }
- if(level>maxLevel) {
- maxLevel=level;
- }
- }
- *pMinLevel=minLevel;
- *pMaxLevel=maxLevel;
-
- /* initialize the index map */
- for(start=length; start>0;) {
- --start;
- indexMap[start]=start;
- }
-
- return TRUE;
-}
-
/* API functions for logical<->visual mapping ------------------------------- */
U_CAPI int32_t U_EXPORT2
diff --git a/icuSources/common/ubrk.cpp b/icuSources/common/ubrk.cpp
index 8cfb93c2..710fd980 100644
--- a/icuSources/common/ubrk.cpp
+++ b/icuSources/common/ubrk.cpp
@@ -1,6 +1,6 @@
/*
*****************************************************************************************
-* Copyright (C) 1996-2001, International Business Machines
+* Copyright (C) 1996-2004, International Business Machines
* Corporation and others. All Rights Reserved.
*****************************************************************************************
*/
@@ -265,4 +265,26 @@ ubrk_getRuleStatus(UBreakIterator *bi)
return ((RuleBasedBreakIterator *)bi)->getRuleStatus();
}
+U_CAPI int32_t U_EXPORT2
+ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status)
+{
+ return ((RuleBasedBreakIterator *)bi)->getRuleStatusVec(fillInVec, capacity, *status);
+}
+
+
+U_CAPI const char* U_EXPORT2
+ubrk_getLocaleByType(const UBreakIterator *bi,
+ ULocDataLocaleType type,
+ UErrorCode* status)
+{
+ if (bi == NULL) {
+ if (U_SUCCESS(*status)) {
+ *status = U_ILLEGAL_ARGUMENT_ERROR;
+ }
+ return NULL;
+ }
+ return ((BreakIterator*)bi)->getLocaleID(type, *status);
+}
+
+
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/icuSources/common/ucase.c b/icuSources/common/ucase.c
new file mode 100644
index 00000000..76b26bb9
--- /dev/null
+++ b/icuSources/common/ucase.c
@@ -0,0 +1,1370 @@
+/*
+*******************************************************************************
+*
+* Copyright (C) 2004, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: ucase.c
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2004aug30
+* created by: Markus W. Scherer
+*
+* Low-level Unicode character/string case mapping code.
+* Much code moved here (and modified) from uchar.c.
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/uset.h"
+#include "unicode/udata.h" /* UDataInfo */
+#include "ucmndata.h" /* DataHeader */
+#include "udatamem.h"
+#include "umutex.h"
+#include "uassert.h"
+#include "cmemory.h"
+#include "utrie.h"
+#include "ucase.h"
+#include "ucln_cmn.h"
+
+struct UCaseProps {
+ UDataMemory *mem;
+ const int32_t *indexes;
+ const uint16_t *exceptions;
+
+ UTrie trie;
+ uint8_t formatVersion[4];
+};
+
+/* data loading etc. -------------------------------------------------------- */
+
+static UBool U_CALLCONV
+isAcceptable(void *context,
+ const char *type, const char *name,
+ const UDataInfo *pInfo) {
+ if(
+ pInfo->size>=20 &&
+ pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
+ pInfo->charsetFamily==U_CHARSET_FAMILY &&
+ pInfo->dataFormat[0]==UCASE_FMT_0 && /* dataFormat="cAsE" */
+ pInfo->dataFormat[1]==UCASE_FMT_1 &&
+ pInfo->dataFormat[2]==UCASE_FMT_2 &&
+ pInfo->dataFormat[3]==UCASE_FMT_3 &&
+ pInfo->formatVersion[0]==1 &&
+ pInfo->formatVersion[2]==UTRIE_SHIFT &&
+ pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
+ ) {
+ UCaseProps *csp=(UCaseProps *)context;
+ uprv_memcpy(csp->formatVersion, pInfo->formatVersion, 4);
+ return TRUE;
+ } else {
+ return FALSE;
+ }
+}
+
+static UCaseProps *
+ucase_openData(UCaseProps *cspProto,
+ const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
+ UCaseProps *csp;
+ int32_t size, trieSize;
+
+ cspProto->indexes=(const int32_t *)bin;
+ if( cspProto->indexes[UCASE_IX_INDEX_TOP]<16 ||
+ (length>=0 && length<cspProto->indexes[UCASE_IX_LENGTH])
+ ) {
+ *pErrorCode=U_INVALID_FORMAT_ERROR;
+ return NULL;
+ }
+
+ /* get the trie address, after indexes[] */
+ size=cspProto->indexes[UCASE_IX_INDEX_TOP]*4;
+ bin+=size;
+ if(length>=0 && (length-=size)<16) {
+ *pErrorCode=U_INVALID_FORMAT_ERROR;
+ return NULL;
+ }
+
+ /* unserialize the trie */
+ trieSize=cspProto->indexes[UCASE_IX_TRIE_SIZE];
+ trieSize=utrie_unserialize(&cspProto->trie, bin, length>=0 ? length : trieSize, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ return NULL;
+ }
+
+ /* get exceptions[] */
+ bin+=trieSize;
+ if(length>=0 && (length-=trieSize)<2*cspProto->indexes[UCASE_IX_EXC_LENGTH]) {
+ *pErrorCode=U_INVALID_FORMAT_ERROR;
+ return NULL;
+ }
+ cspProto->exceptions=(const uint16_t *)bin;
+
+ /* allocate, copy, and return the new UCaseProps */
+ csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps));
+ if(csp==NULL) {
+ *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ } else {
+ uprv_memcpy(csp, cspProto, sizeof(UCaseProps));
+ return csp;
+ }
+}
+
+U_CAPI UCaseProps * U_EXPORT2
+ucase_open(UErrorCode *pErrorCode) {
+ UCaseProps cspProto={ NULL }, *csp;
+
+ cspProto.mem=udata_openChoice(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, isAcceptable, &cspProto, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ return NULL;
+ }
+
+ csp=ucase_openData(
+ &cspProto,
+ udata_getMemory(cspProto.mem),
+ udata_getLength(cspProto.mem),
+ pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ udata_close(cspProto.mem);
+ return NULL;
+ } else {
+ return csp;
+ }
+}
+
+U_CAPI UCaseProps * U_EXPORT2
+ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
+ UCaseProps cspProto={ NULL };
+ const DataHeader *hdr;
+
+ if(U_FAILURE(*pErrorCode)) {
+ return NULL;
+ }
+ if(bin==NULL) {
+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return NULL;
+ }
+
+ /* check the header */
+ if(length>=0 && length<20) {
+ *pErrorCode=U_INVALID_FORMAT_ERROR;
+ return NULL;
+ }
+ hdr=(const DataHeader *)bin;
+ if(
+ !(hdr->dataHeader.magic1==0xda && hdr->dataHeader.magic2==0x27 &&
+ hdr->info.isBigEndian==U_IS_BIG_ENDIAN &&
+ isAcceptable(&cspProto, UCASE_DATA_TYPE, UCASE_DATA_NAME, &hdr->info))
+ ) {
+ *pErrorCode=U_INVALID_FORMAT_ERROR;
+ return NULL;
+ }
+
+ bin+=hdr->dataHeader.headerSize;
+ if(length>=0) {
+ length-=hdr->dataHeader.headerSize;
+ }
+ return ucase_openData(&cspProto, bin, length, pErrorCode);
+}
+
+U_CAPI void U_EXPORT2
+ucase_close(UCaseProps *csp) {
+ if(csp!=NULL) {
+ udata_close(csp->mem);
+ uprv_free(csp);
+ }
+}
+
+/* UCaseProps singleton ----------------------------------------------------- */
+
+static UCaseProps *gCsp=NULL;
+static UErrorCode gErrorCode=U_ZERO_ERROR;
+static int8_t gHaveData=0;
+
+static UBool U_CALLCONV ucase_cleanup(void) {
+ ucase_close(gCsp);
+ gCsp=NULL;
+ gErrorCode=U_ZERO_ERROR;
+ gHaveData=0;
+ return TRUE;
+}
+
+U_CAPI UCaseProps * U_EXPORT2
+ucase_getSingleton(UErrorCode *pErrorCode) {
+ int8_t haveData;
+
+ if(U_FAILURE(*pErrorCode)) {
+ return NULL;
+ }
+
+ UMTX_CHECK(NULL, gHaveData, haveData);
+
+ if(haveData>0) {
+ /* data was loaded */
+ return gCsp;
+ } else if(haveData<0) {
+ /* data loading failed */
+ *pErrorCode=gErrorCode;
+ return NULL;
+ } else /* haveData==0 */ {
+ /* load the data */
+ UCaseProps *csp=ucase_open(pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ gHaveData=-1;
+ gErrorCode=*pErrorCode;
+ return NULL;
+ }
+
+ /* set the static variables */
+ umtx_lock(NULL);
+ if(gCsp==NULL) {
+ gCsp=csp;
+ csp=NULL;
+ gHaveData=1;
+ ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup);
+ }
+ umtx_unlock(NULL);
+
+ ucase_close(csp);
+ return gCsp;
+ }
+}
+
+/* Unicode case mapping data swapping --------------------------------------- */
+
+U_CAPI int32_t U_EXPORT2
+ucase_swap(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode) {
+ const UDataInfo *pInfo;
+ int32_t headerSize;
+
+ const uint8_t *inBytes;
+ uint8_t *outBytes;
+
+ const int32_t *inIndexes;
+ int32_t indexes[16];
+
+ int32_t i, offset, count, size;
+
+ /* udata_swapDataHeader checks the arguments */
+ headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+
+ /* check data format and format version */
+ pInfo=(const UDataInfo *)((const char *)inData+4);
+ if(!(
+ pInfo->dataFormat[0]==UCASE_FMT_0 && /* dataFormat="cAsE" */
+ pInfo->dataFormat[1]==UCASE_FMT_1 &&
+ pInfo->dataFormat[2]==UCASE_FMT_2 &&
+ pInfo->dataFormat[3]==UCASE_FMT_3 &&
+ pInfo->formatVersion[0]==1 &&
+ pInfo->formatVersion[2]==UTRIE_SHIFT &&
+ pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
+ )) {
+ udata_printError(ds, "ucase_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as case mapping data\n",
+ pInfo->dataFormat[0], pInfo->dataFormat[1],
+ pInfo->dataFormat[2], pInfo->dataFormat[3],
+ pInfo->formatVersion[0]);
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+
+ inBytes=(const uint8_t *)inData+headerSize;
+ outBytes=(uint8_t *)outData+headerSize;
+
+ inIndexes=(const int32_t *)inBytes;
+
+ if(length>=0) {
+ length-=headerSize;
+ if(length<16*4) {
+ udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for case mapping data\n",
+ length);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+ }
+
+ /* read the first 16 indexes (ICU 3.2/format version 1: UCASE_IX_TOP==16, might grow) */
+ for(i=0; i<16; ++i) {
+ indexes[i]=udata_readInt32(ds, inIndexes[i]);
+ }
+
+ /* get the total length of the data */
+ size=indexes[UCASE_IX_LENGTH];
+
+ if(length>=0) {
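+ if(length<size) {
+ udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for all of case mapping data\n",
+ length);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ /* copy the data for inaccessible bytes */
+ if(inBytes!=outBytes) {
+ uprv_memcpy(outBytes, inBytes, size);
+ }
+
+ offset=0;
+
+ /* swap the int32_t indexes[] */
+ count=indexes[UCASE_IX_INDEX_TOP]*4;
+ ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);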
+ offset+=count;
+
+ /* swap the UTrie */
+ count=indexes[UCASE_IX_TRIE_SIZE];
+ utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
+ offset+=count;
+
+ /* swap the uint16_t exceptions[] */
+ count=indexes[UCASE_IX_EXC_LENGTH]*2;
+ ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
+ offset+=count;
+
+ U_ASSERT(offset==size);
+ }
+
+ return headerSize+size;
+}
+
+/* set of property starts for UnicodeSet ------------------------------------ */
+
+static UBool U_CALLCONV
+_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
+ /* add the start code point to the USet */
+ USetAdder *sa=(USetAdder *)context;
+ sa->add(sa->set, start);
+ return TRUE;
+}
+
+U_CAPI void U_EXPORT2
+ucase_addPropertyStarts(const UCaseProps *csp, USetAdder *sa, UErrorCode *pErrorCode) {
+ if(U_FAILURE(*pErrorCode)) {
+ return;
+ }
+
+ /* add the start code point of each same-value range of the trie */
+ utrie_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
+
+ /* add code points with hardcoded properties, plus the ones following them */
+
+ /* (none right now, see comment below) */
+
+ /*
+ * Omit code points with hardcoded specialcasing properties
+ * because we do not build property UnicodeSets for them right now.
+ */
+}
+
+/* data access primitives --------------------------------------------------- */
+
+/* UTRIE_GET16() itself validates c */
+#define GET_PROPS(csp, c, result) \
+ UTRIE_GET16(&(csp)->trie, c, result);
+
+#define GET_CASE_TYPE(props) ((props)&UCASE_TYPE_MASK)
+#define GET_SIGNED_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)
+#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
+
+#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
+
+/* number of bits in an 8-bit integer value */
+static const uint8_t flagsOffset[256]={
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+};
+
+#define HAS_SLOT(flags, index) ((flags)&(1<<(index)))
+#define SLOT_OFFSET(flags, index) flagsOffset[(flags)&((1<<(index))-1)]
+
+/*
+ * Get the value of an optional-value slot where HAS_SLOT(excWord, index).
+ *
+ * @param excWord (in) initial exceptions word
+ * @param index (in) desired slot index
+ * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
+ * moved to the last uint16_t of the value, use +1 for beginning of next slot
+ * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
+ */
+#define GET_SLOT_VALUE(excWord, index, pExc16, value) \
+ if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
+ (pExc16)+=SLOT_OFFSET(excWord, index); \
+ (value)=*pExc16; \
+ } else { \
+ (pExc16)+=2*SLOT_OFFSET(excWord, index); \
+ (value)=*pExc16++; \
+ (value)=((value)<<16)|*pExc16; \
+ }
+
+/* simple case mappings ----------------------------------------------------- */
+
+U_CAPI UChar32 U_EXPORT2
+ucase_tolower(const UCaseProps *csp, UChar32 c) {
+ uint16_t props;
+ GET_PROPS(csp, c, props);
+ if(!PROPS_HAS_EXCEPTION(props)) {
+ if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
+ c+=GET_SIGNED_DELTA(props);
+ }
+ } else {
+ const uint16_t *pe=GET_EXCEPTIONS(csp, props);
+ uint16_t excWord=*pe++;
+ if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
+ GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
+ }
+ }
+ return c;
+}
+
+U_CAPI UChar32 U_EXPORT2
+ucase_toupper(const UCaseProps *csp, UChar32 c) {
+ uint16_t props;
+ GET_PROPS(csp, c, props);
+ if(!PROPS_HAS_EXCEPTION(props)) {
+ if(GET_CASE_TYPE(props)==UCASE_LOWER) {
+ c+=GET_SIGNED_DELTA(props);
+ }
+ } else {
+ const uint16_t *pe=GET_EXCEPTIONS(csp, props);
+ uint16_t excWord=*pe++;
+ if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
+ GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
+ }
+ }
+ return c;
+}
+
+U_CAPI UChar32 U_EXPORT2
+ucase_totitle(const UCaseProps *csp, UChar32 c) {
+ uint16_t props;
+ GET_PROPS(csp, c, props);
+ if(!PROPS_HAS_EXCEPTION(props)) {
+ if(GET_CASE_TYPE(props)==UCASE_LOWER) {
+ c+=GET_SIGNED_DELTA(props);
+ }
+ } else {
+ const uint16_t *pe=GET_EXCEPTIONS(csp, props);
+ uint16_t excWord=*pe++;
+ int32_t index;
+ if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
+ index=UCASE_EXC_TITLE;
+ } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
+ index=UCASE_EXC_UPPER;
+ } else {
+ return c;
+ }
+ GET_SLOT_VALUE(excWord, index, pe, c);
+ }
+ return c;
+}
+
+/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
+U_CAPI int32_t U_EXPORT2
+ucase_getType(const UCaseProps *csp, UChar32 c) {
+ uint16_t props;
+ GET_PROPS(csp, c, props);
+ return GET_CASE_TYPE(props);
+}
+
+/** @return same as ucase_getType(), or <0 if c is case-ignorable */
+U_CAPI int32_t U_EXPORT2
+ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
+ int32_t type;
+ uint16_t props;
+ GET_PROPS(csp, c, props);
+ type=GET_CASE_TYPE(props);
+ if(type!=UCASE_NONE) {
+ return type;
+ } else if(
+ c==0x307 ||
+ (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE
+ ) {
+ return -1; /* case-ignorable */
+ } else {
+ return 0; /* c is neither cased nor case-ignorable */
+ }
+}
+
+/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
+static U_INLINE int32_t
+getDotType(const UCaseProps *csp, UChar32 c) {
+ uint16_t props;
+ GET_PROPS(csp, c, props);
+ if(!PROPS_HAS_EXCEPTION(props)) {
+ return props&UCASE_DOT_MASK;
+ } else {
+ const uint16_t *pe=GET_EXCEPTIONS(csp, props);
+ return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
+ }
+}
+
+U_CAPI UBool U_EXPORT2
+ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
+ return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
+}
+
+U_CAPI UBool U_EXPORT2
+ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
+ uint16_t props;
+ GET_PROPS(csp, c, props);
+ return (UBool)((props&UCASE_SENSITIVE)!=0);
+}
+
+/* public API (see uchar.h) ------------------------------------------------- */
+
+U_CAPI UBool U_EXPORT2
+u_isULowercase(UChar32 c) {
+ UErrorCode errorCode=U_ZERO_ERROR;
+ UCaseProps *csp=ucase_getSingleton(&errorCode);
+ return (UBool)(csp!=NULL && UCASE_LOWER==ucase_getType(csp, c));
+}
+
+U_CAPI UBool U_EXPORT2
+u_isUUppercase(UChar32 c) {
+ UErrorCode errorCode=U_ZERO_ERROR;
+ UCaseProps *csp=ucase_getSingleton(&errorCode);
+ return (UBool)(csp!=NULL && UCASE_UPPER==ucase_getType(csp, c));
+}
+
+/* Transforms the Unicode character to its lower case equivalent.*/
+U_CAPI UChar32 U_EXPORT2
+u_tolower(UChar32 c) {
+ UErrorCode errorCode=U_ZERO_ERROR;
+ UCaseProps *csp=ucase_getSingleton(&errorCode);
+ if(csp!=NULL) {
+ return ucase_tolower(csp, c);
+ } else {
+ return c;
+ }
+}
+
+/* Transforms the Unicode character to its upper case equivalent.*/
+U_CAPI UChar32 U_EXPORT2
+u_toupper(UChar32 c) {
+ UErrorCode errorCode=U_ZERO_ERROR;
+ UCaseProps *csp=ucase_getSingleton(&errorCode);
+ if(csp!=NULL) {
+ return ucase_toupper(csp, c);
+ } else {
+ return c;
+ }
+}
+
+/* Transforms the Unicode character to its title case equivalent.*/
+U_CAPI UChar32 U_EXPORT2
+u_totitle(UChar32 c) {
+ UErrorCode errorCode=U_ZERO_ERROR;
+ UCaseProps *csp=ucase_getSingleton(&errorCode);
+ if(csp!=NULL) {
+ return ucase_totitle(csp, c);
+ } else {
+ return c;
+ }
+}
+
+/* return the simple case folding mapping for c */
+U_CAPI UChar32 U_EXPORT2
+u_foldCase(UChar32 c, uint32_t options) {
+ UErrorCode errorCode=U_ZERO_ERROR;
+ UCaseProps *csp=ucase_getSingleton(&errorCode);
+ if(csp!=NULL) {
+ return ucase_fold(csp, c, options);
+ } else {
+ return c;
+ }
+}
+
+/* string casing ------------------------------------------------------------ */
+
+/*
+ * These internal functions form the core of string case mappings.
+ * They map single code points to result code points or strings and take
+ * all necessary conditions (context, locale ID, options) into account.
+ *
+ * They do not iterate over the source or write to the destination
+ * so that the same functions are useful for non-standard string storage,
+ * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
+ * For the same reason, the "surrounding text" context is passed in as a
+ * UCaseContextIterator which does not make any assumptions about
+ * the underlying storage.
+ *
+ * This section contains helper functions that check for conditions
+ * in the input text surrounding the current code point
+ * according to SpecialCasing.txt.
+ *
+ * Each helper function gets the index
+ * - after the current code point if it looks at following text
+ * - before the current code point if it looks at preceding text
+ *
+ * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
+ *
+ * Final_Sigma
+ * C is preceded by a sequence consisting of
+ * a cased letter and a case-ignorable sequence,
+ * and C is not followed by a sequence consisting of
+ * an ignorable sequence and then a cased letter.
+ *
+ * More_Above
+ * C is followed by one or more characters of combining class 230 (ABOVE)
+ * in the combining character sequence.
+ *
+ * After_Soft_Dotted
+ * The last preceding character with combining class of zero before C
+ * was Soft_Dotted,
+ * and there is no intervening combining character class 230 (ABOVE).
+ *
+ * Before_Dot
+ * C is followed by combining dot above (U+0307).
+ * Any sequence of characters with a combining class that is neither 0 nor 230
+ * may intervene between the current character and the combining dot above.
+ *
+ * The erratum from 2002-10-31 adds the condition
+ *
+ * After_I
+ * The last preceding base character was an uppercase I, and there is no
+ * intervening combining character class 230 (ABOVE).
+ *
+ * (See Jitterbug 2344 and the comments on After_I below.)
+ *
+ * Helper definitions in Unicode 3.2 UAX 21:
+ *
+ * D1. A character C is defined to be cased
+ * if it meets any of the following criteria:
+ *
+ * - The general category of C is Titlecase Letter (Lt)
+ * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
+ * - Given D = NFD(C), then it is not the case that:
+ * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
+ * (This third criterium does not add any characters to the list
+ * for Unicode 3.2. Ignored.)
+ *
+ * D2. A character C is defined to be case-ignorable
+ * if it meets either of the following criteria:
+ *
+ * - The general category of C is
+ * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
+ * Letter Modifier (Lm), or Symbol Modifier (Sk)
+ * - C is one of the following characters
+ * U+0027 APOSTROPHE
+ * U+00AD SOFT HYPHEN (SHY)
+ * U+2019 RIGHT SINGLE QUOTATION MARK
+ * (the preferred character for apostrophe)
+ *
+ * D3. A case-ignorable sequence is a sequence of
+ * zero or more case-ignorable characters.
+ */
+
+enum {
+ LOC_UNKNOWN, /* first enumerator (0): getCaseLocale() treats this cache value as "not determined yet" */
+ LOC_ROOT, /* any locale without special handling in this file */
+ LOC_TURKISH, /* tr/tur, and also az/aze (see getCaseLocale()) */
+ LOC_LITHUANIAN /* lt/lit */
+};
+
+#define is_a(c) ((c)=='a' || (c)=='A') /* is_X(c): c is the ASCII letter X in either case */
+#define is_e(c) ((c)=='e' || (c)=='E')
+#define is_i(c) ((c)=='i' || (c)=='I')
+#define is_l(c) ((c)=='l' || (c)=='L')
+#define is_r(c) ((c)=='r' || (c)=='R')
+#define is_t(c) ((c)=='t' || (c)=='T')
+#define is_u(c) ((c)=='u' || (c)=='U')
+#define is_z(c) ((c)=='z' || (c)=='Z')
+
+/* separator? ('_', '-', or the terminating NUL of the locale ID) */
+#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
+
+/*
+ * Returns LOC_TURKISH for tr/tur/az/aze, LOC_LITHUANIAN for lt/lit, otherwise LOC_ROOT.
+ * Requires a non-NULL locale ID but otherwise does the equivalent of checking uloc_getLanguage():
+ * accepts 2- and 3-letter codes in either case. Caches the result in *locCache if locCache!=NULL.
+ */
+static int32_t
+getCaseLocale(const char *locale, int32_t *locCache) {
+ int32_t result;
+ char c;
+
+ if(locCache!=NULL && (result=*locCache)!=LOC_UNKNOWN) { /* an earlier call already stored a result */
+ return result;
+ }
+
+ result=LOC_ROOT; /* default unless one of the language codes below matches */
+
+ /*
+ * This function used to use uloc_getLanguage(), but the current code
+ * removes the dependency of this low-level code on uloc implementation code
+ * and is faster because not the whole locale ID has to be
+ * examined and copied/transformed.
+ *
+ * Because this code does not want to depend on uloc, the caller must
+ * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
+ */
+ c=*locale++;
+ if(is_t(c)) {
+ /* tr or tur? */
+ c=*locale++;
+ if(is_u(c)) { /* optional middle letter of the 3-letter code */
+ c=*locale++;
+ }
+ if(is_r(c)) {
+ c=*locale;
+ if(is_sep(c)) { /* the language code must end here */
+ result=LOC_TURKISH;
+ }
+ }
+ } else if(is_a(c)) {
+ /* az or aze? */
+ c=*locale++;
+ if(is_z(c)) {
+ c=*locale++;
+ if(is_e(c)) { /* optional third letter; look at what follows it */
+ c=*locale;
+ }
+ if(is_sep(c)) {
+ result=LOC_TURKISH; /* Azeri shares the Turkish mappings */
+ }
+ }
+ } else if(is_l(c)) {
+ /* lt or lit? */
+ c=*locale++;
+ if(is_i(c)) { /* optional middle letter of the 3-letter code */
+ c=*locale++;
+ }
+ if(is_t(c)) {
+ c=*locale;
+ if(is_sep(c)) {
+ result=LOC_LITHUANIAN;
+ }
+ }
+ }
+
+ if(locCache!=NULL) { /* remember the result for subsequent calls */
+ *locCache=result;
+ }
+ return result;
+}
+
+/* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward; returns FALSE if iter==NULL) */
+static UBool
+isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
+ UChar32 c;
+ uint16_t props;
+
+ if(iter==NULL) { /* no context available */
+ return FALSE;
+ }
+
+ for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) { /* dir=0 continues in the same direction; c<0 ends the loop */
+ GET_PROPS(csp, c, props);
+ if(GET_CASE_TYPE(props)!=UCASE_NONE) {
+ return TRUE; /* followed by cased letter */
+ } else if(c==0x307 || (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE) { /* U+0307 itself, or ignorable bit set without the exception bit */
+ /* case-ignorable, continue with the loop */
+ } else {
+ return FALSE; /* not ignorable */
+ }
+ }
+
+ return FALSE; /* not followed by cased letter */
+}
+
+/* Is preceded by Soft_Dotted character with no intervening cc=230 ? (returns FALSE if iter==NULL) */
+static UBool
+isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
+ UChar32 c;
+ int32_t dotType;
+ int8_t dir;
+
+ if(iter==NULL) { /* no context available */
+ return FALSE;
+ }
+
+ for(dir=-1; (c=iter(context, dir))>=0; dir=0) { /* dir=-1 starts backward iteration, dir=0 continues it */
+ dotType=getDotType(csp, c);
+ if(dotType==UCASE_SOFT_DOTTED) {
+ return TRUE; /* preceded by a Soft_Dotted character */
+ } else if(dotType!=UCASE_OTHER_ACCENT) { /* only "other accents" may be skipped */
+ return FALSE; /* preceded by different base character (not Soft_Dotted), or intervening cc==230 */
+ }
+ }
+
+ return FALSE; /* not preceded by a Soft_Dotted character */
+}
+
+/*
+ * See Jitterbug 2344:
+ * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
+ * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
+ * we made those releases compatible with Unicode 3.2 which had not fixed
+ * a related bug in SpecialCasing.txt.
+ *
+ * From the Jitterbug 2344 text:
+ * ... this bug is listed as a Unicode erratum
+ * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
+ *
+ * There are two errors in SpecialCasing.txt.
+ * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
+ * 2. An incorrect context definition. Correct as follows:
+ * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
+ * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
+ * ---
+ * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
+ * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
+ * where the context After_I is defined as:
+ * The last preceding base character was an uppercase I, and there is no
+ * intervening combining character class 230 (ABOVE).
+ *
+ *
+ * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
+ *
+ * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
+ * # This matches the behavior of the canonically equivalent I-dot_above
+ *
+ * See also the description in this place in older versions of uchar.c (revision 1.100).
+ *
+ * Markus W. Scherer 2003-feb-15
+ */
+
+/* Is preceded by base character 'I' (U+0049) with no intervening cc=230 ? (returns FALSE if iter==NULL) */
+static UBool
+isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
+ UChar32 c;
+ int32_t dotType;
+ int8_t dir;
+
+ if(iter==NULL) { /* no context available */
+ return FALSE;
+ }
+
+ for(dir=-1; (c=iter(context, dir))>=0; dir=0) { /* dir=-1 starts backward iteration, dir=0 continues it */
+ if(c==0x49) {
+ return TRUE; /* preceded by I */
+ }
+ dotType=getDotType(csp, c);
+ if(dotType!=UCASE_OTHER_ACCENT) { /* only "other accents" may be skipped */
+ return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
+ }
+ }
+
+ return FALSE; /* not preceded by I */
+}
+
+/* Is followed by one or more cc==230 ? (returns FALSE if iter==NULL) */
+static UBool
+isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
+ UChar32 c;
+ int32_t dotType;
+ int8_t dir;
+
+ if(iter==NULL) { /* no context available */
+ return FALSE;
+ }
+
+ for(dir=1; (c=iter(context, dir))>=0; dir=0) { /* dir=1 starts forward iteration, dir=0 continues it */
+ dotType=getDotType(csp, c);
+ if(dotType==UCASE_ABOVE) {
+ return TRUE; /* at least one cc==230 following */
+ } else if(dotType!=UCASE_OTHER_ACCENT) { /* only "other accents" may be skipped */
+ return FALSE; /* next base character, no more cc==230 following */
+ }
+ }
+
+ return FALSE; /* no more cc==230 following */
+}
+
+/* Is followed by a dot above U+0307 (without cc==230 in between) ? (returns FALSE if iter==NULL) */
+static UBool
+isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
+ UChar32 c;
+ int32_t dotType;
+ int8_t dir;
+
+ if(iter==NULL) { /* no context available */
+ return FALSE;
+ }
+
+ for(dir=1; (c=iter(context, dir))>=0; dir=0) { /* dir=1 starts forward iteration, dir=0 continues it */
+ if(c==0x307) {
+ return TRUE; /* found the combining dot above */
+ }
+ dotType=getDotType(csp, c);
+ if(dotType!=UCASE_OTHER_ACCENT) { /* only "other accents" may be skipped */
+ return FALSE; /* next base character or cc==230 in between */
+ }
+ }
+
+ return FALSE; /* no dot above following */
+}
+
+U_CAPI int32_t U_EXPORT2 /* full lowercase mapping of c, including hardcoded locale- and context-sensitive cases */
+ucase_toFullLower(const UCaseProps *csp, UChar32 c,
+ UCaseContextIterator *iter, void *context,
+ const UChar **pString,
+ const char *locale, int32_t *locCache) { /* returns ~c if unchanged, a string length (with *pString set when >0), or the mapped code point */
+ static const UChar
+ iDot[2]= { 0x69, 0x307 },
+ jDot[2]= { 0x6a, 0x307 },
+ iOgonekDot[3]= { 0x12f, 0x307 }, /* declared with 3 elements but only 2 are initialized and returned (length 2 below) */
+ iDotGrave[3]= { 0x69, 0x307, 0x300 },
+ iDotAcute[3]= { 0x69, 0x307, 0x301 },
+ iDotTilde[3]= { 0x69, 0x307, 0x303 };
+
+ UChar32 result;
+ uint16_t props;
+
+ result=c; /* default: c maps to itself */
+ GET_PROPS(csp, c, props);
+ if(!PROPS_HAS_EXCEPTION(props)) { /* simple case: the mapping is a signed delta in the props word */
+ if(GET_CASE_TYPE(props)>=UCASE_UPPER) { /* UCASE_UPPER or UCASE_TITLE */
+ result=c+GET_SIGNED_DELTA(props);
+ }
+ } else {
+ const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
+ uint16_t excWord=*pe++;
+ int32_t full;
+
+ pe2=pe; /* separate pointer for the simple-mapping slot lookup at the end */
+
+ if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
+ /* use hardcoded conditions and mappings */
+ int32_t loc=getCaseLocale(locale, locCache);
+
+ /*
+ * Test for conditional mappings first
+ * (otherwise the unconditional default mappings are always taken),
+ * then test for characters that have unconditional mappings in SpecialCasing.txt,
+ * then get the UnicodeData.txt mappings.
+ */
+ if( loc==LOC_LITHUANIAN &&
+ /* base characters, find accents above */
+ (((c==0x49 || c==0x4a || c==0x12e) &&
+ isFollowedByMoreAbove(csp, iter, context)) ||
+ /* precomposed with accent above, no need to find one */
+ (c==0xcc || c==0xcd || c==0x128))
+ ) {
+ /*
+ # Lithuanian
+
+ # Lithuanian retains the dot in a lowercase i when followed by accents.
+
+ # Introduce an explicit dot above when lowercasing capital I's and J's
+ # whenever there are more accents above.
+ # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
+
+ 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
+ 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
+ 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
+ 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
+ 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
+ 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
+ */
+ switch(c) { /* each case returns the length of the string stored in *pString */
+ case 0x49: /* LATIN CAPITAL LETTER I */
+ *pString=iDot;
+ return 2;
+ case 0x4a: /* LATIN CAPITAL LETTER J */
+ *pString=jDot;
+ return 2;
+ case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
+ *pString=iOgonekDot;
+ return 2;
+ case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
+ *pString=iDotGrave;
+ return 3;
+ case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
+ *pString=iDotAcute;
+ return 3;
+ case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
+ *pString=iDotTilde;
+ return 3;
+ default:
+ return 0; /* will not occur */
+ }
+ /* # Turkish and Azeri */
+ } else if(loc==LOC_TURKISH && c==0x130) {
+ /*
+ # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
+ # The following rules handle those cases.
+
+ 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
+ 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
+ */
+ return 0x69;
+ } else if(loc==LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
+ /*
+ # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
+ # This matches the behavior of the canonically equivalent I-dot_above
+
+ 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
+ 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
+ */
+ return 0; /* remove the dot (continue without output) */
+ } else if(loc==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
+ /*
+ # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
+
+ 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
+ 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
+ */
+ return 0x131;
+ } else if(c==0x130) { /* reached only when the Turkic U+0130 branch above did not apply */
+ /*
+ # Preserve canonical equivalence for I with dot. Turkic is handled below.
+
+ 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+ */
+ *pString=iDot;
+ return 2;
+ } else if( c==0x3a3 &&
+ !isFollowedByCasedLetter(csp, iter, context, 1) &&
+ isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
+ ) {
+ /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
+ /*
+ # Special case for final form of sigma
+
+ 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
+ */
+ return 0x3c2; /* greek small final sigma */
+ } else {
+ /* no known conditional special case mapping, use a normal mapping */
+ }
+ } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { /* data-driven full (string) mappings */
+ GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
+ full&=UCASE_FULL_LOWER; /* keep only the lowercase string length; 0 means no full lowercase mapping */
+ if(full!=0) {
+ /* set the output pointer to the lowercase mapping */
+ *pString=pe+1;
+
+ /* return the string length */
+ return full;
+ }
+ }
+
+ if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { /* fall back to the simple lowercase mapping, if there is one */
+ GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
+ }
+ }
+
+ return (result==c) ? ~result : result; /* ~c signals "maps to itself" */
+}
+
+/* internal: shared implementation of ucase_toFullUpper() (upperNotTitle=TRUE) and ucase_toFullTitle() (upperNotTitle=FALSE) */
+static int32_t
+toUpperOrTitle(const UCaseProps *csp, UChar32 c,
+ UCaseContextIterator *iter, void *context,
+ const UChar **pString,
+ const char *locale, int32_t *locCache,
+ UBool upperNotTitle) { /* returns ~c if unchanged, a string length (with *pString set when >0), or the mapped code point */
+ UChar32 result;
+ uint16_t props;
+
+ result=c; /* default: c maps to itself */
+ GET_PROPS(csp, c, props);
+ if(!PROPS_HAS_EXCEPTION(props)) { /* simple case: the mapping is a signed delta in the props word */
+ if(GET_CASE_TYPE(props)==UCASE_LOWER) {
+ result=c+GET_SIGNED_DELTA(props);
+ }
+ } else {
+ const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
+ uint16_t excWord=*pe++;
+ int32_t full, index;
+
+ pe2=pe; /* separate pointer for the simple-mapping slot lookup at the end */
+
+ if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
+ /* use hardcoded conditions and mappings */
+ int32_t loc=getCaseLocale(locale, locCache);
+
+ if(loc==LOC_TURKISH && c==0x69) {
+ /*
+ # Turkish and Azeri
+
+ # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
+ # The following rules handle those cases.
+
+ # When uppercasing, i turns into a dotted capital I
+
+ 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
+ 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
+ */
+ return 0x130;
+ } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
+ /*
+ # Lithuanian
+
+ # Lithuanian retains the dot in a lowercase i when followed by accents.
+
+ # Remove DOT ABOVE after "i" with upper or titlecase
+
+ 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
+ */
+ return 0; /* remove the dot (continue without output) */
+ } else {
+ /* no known conditional special case mapping, use a normal mapping */
+ }
+ } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { /* data-driven full (string) mappings */
+ GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
+
+ /* start of full case mapping strings */
+ ++pe;
+
+ /* skip the lowercase and case-folding result strings */
+ pe+=full&UCASE_FULL_LOWER;
+ full>>=4;
+ pe+=full&0xf;
+ full>>=4;
+
+ if(upperNotTitle) {
+ full&=0xf; /* length of the uppercase result string */
+ } else {
+ /* skip the uppercase result string */
+ pe+=full&0xf;
+ full=(full>>4)&0xf; /* length of the titlecase result string */
+ }
+
+ if(full!=0) {
+ /* set the output pointer to the result string */
+ *pString=pe;
+
+ /* return the string length */
+ return full;
+ }
+ }
+
+ if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) { /* a dedicated simple titlecase mapping exists */
+ index=UCASE_EXC_TITLE;
+ } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
+ /* here, titlecase is same as uppercase */
+ index=UCASE_EXC_UPPER;
+ } else {
+ return ~c; /* no simple mapping stored: c maps to itself */
+ }
+ GET_SLOT_VALUE(excWord, index, pe2, result);
+ }
+
+ return (result==c) ? ~result : result; /* ~c signals "maps to itself" */
+}
+
+U_CAPI int32_t U_EXPORT2 /* full uppercase mapping of c; thin wrapper around toUpperOrTitle() */
+ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
+ UCaseContextIterator *iter, void *context,
+ const UChar **pString,
+ const char *locale, int32_t *locCache) {
+ return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE); /* TRUE selects uppercase */
+}
+
+U_CAPI int32_t U_EXPORT2 /* full titlecase mapping of c; thin wrapper around toUpperOrTitle() */
+ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
+ UCaseContextIterator *iter, void *context,
+ const UChar **pString,
+ const char *locale, int32_t *locCache) {
+ return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE); /* FALSE selects titlecase */
+}
+
+/* case folding ------------------------------------------------------------- */
+
+/*
+ * Case folding is similar to lowercasing.
+ * The result may be a simple mapping, i.e., a single code point, or
+ * a full mapping, i.e., a string.
+ * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
+ * then only the lowercase mapping is stored.
+ *
+ * Some special cases are hardcoded because their conditions cannot be
+ * parsed and processed from CaseFolding.txt.
+ *
+ * Unicode 3.2 CaseFolding.txt specifies for its status field:
+
+# C: common case folding, common mappings shared by both simple and full mappings.
+# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
+# S: simple case folding, mappings to single characters where different from F.
+# T: special case for uppercase I and dotted uppercase I
+# - For non-Turkic languages, this mapping is normally not used.
+# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
+#
+# Usage:
+# A. To do a simple case folding, use the mappings with status C + S.
+# B. To do a full case folding, use the mappings with status C + F.
+#
+# The mappings with status T can be used or omitted depending on the desired case-folding
+# behavior. (The default option is to exclude them.)
+
+ * Unicode 3.2 has 'T' mappings as follows:
+
+0049; T; 0131; # LATIN CAPITAL LETTER I
+0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+
+ * while the default mappings for these code points are:
+
+0049; C; 0069; # LATIN CAPITAL LETTER I
+0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+
+ * U+0130 is otherwise lowercased to U+0069 (UnicodeData.txt).
+ *
+ * In case this code is used with CaseFolding.txt from an older version of Unicode
+ * where CaseFolding.txt contains mappings with a status of 'I' that
+ * have the opposite polarity ('I' mappings are included by default but excluded for Turkic),
+ * we must also hardcode the Unicode 3.2 mappings for the code points
+ * with 'I' mappings.
+ * Unicode 3.1.1 has 'I' mappings for U+0130 and U+0131.
+ * Unicode 3.2 has a 'T' mapping for U+0130, and lowercases U+0131 to itself (see UnicodeData.txt).
+ */
+
+/* return the simple (single code point) case folding mapping for c; options selects default vs. Turkic handling of U+0049/U+0130 */
+U_CAPI UChar32 U_EXPORT2
+ucase_fold(UCaseProps *csp, UChar32 c, uint32_t options) { /* NOTE(review): csp is non-const here, unlike the other ucase_* functions in this file */
+ uint16_t props;
+ GET_PROPS(csp, c, props);
+ if(!PROPS_HAS_EXCEPTION(props)) { /* simple case: the folding is the signed lowercase delta */
+ if(GET_CASE_TYPE(props)>=UCASE_UPPER) { /* UCASE_UPPER or UCASE_TITLE */
+ c+=GET_SIGNED_DELTA(props);
+ }
+ } else {
+ const uint16_t *pe=GET_EXCEPTIONS(csp, props);
+ uint16_t excWord=*pe++;
+ int32_t index;
+ if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
+ /* special case folding mappings, hardcoded */
+ if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
+ /* default mappings */
+ if(c==0x49) {
+ /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
+ return 0x69;
+ } else if(c==0x130) {
+ /* no simple default mapping for U+0130, use UnicodeData.txt */
+ return 0x69;
+ }
+ } else {
+ /* Turkic mappings */
+ if(c==0x49) {
+ /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
+ return 0x131;
+ } else if(c==0x130) {
+ /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
+ return 0x69;
+ }
+ }
+ }
+ if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { /* prefer a dedicated folding slot */
+ index=UCASE_EXC_FOLD;
+ } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { /* otherwise the folding is the simple lowercase mapping */
+ index=UCASE_EXC_LOWER;
+ } else {
+ return c; /* neither slot present: c folds to itself */
+ }
+ GET_SLOT_VALUE(excWord, index, pe, c);
+ }
+ return c;
+}
+
+/*
+ * Issue for canonical caseless match (UAX #21):
+ * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
+ * canonical equivalence, unlike default-option casefolding.
+ * For example, I-grave and I + grave fold to strings that are not canonically
+ * equivalent.
+ * For more details, see the comment in unorm_compare() in unorm.cpp
+ * and the intermediate prototype changes for Jitterbug 2021.
+ * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
+ *
+ * This did not get fixed because it appears that it is not possible to fix
+ * it for uppercase and lowercase characters (I-grave vs. i-grave)
+ * together in a way that they still fold to common result strings.
+ */
+
+U_CAPI int32_t U_EXPORT2 /* full case folding of c; context-free, options selects default vs. Turkic handling of U+0049/U+0130 */
+ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
+ const UChar **pString,
+ uint32_t options) { /* returns ~c if unchanged, a string length with *pString set, or the folded code point */
+ static const UChar
+ iDot[2]= { 0x69, 0x307 };
+
+ UChar32 result;
+ uint16_t props;
+
+ result=c; /* default: c folds to itself */
+ GET_PROPS(csp, c, props);
+ if(!PROPS_HAS_EXCEPTION(props)) { /* simple case: the folding is the signed lowercase delta */
+ if(GET_CASE_TYPE(props)>=UCASE_UPPER) { /* UCASE_UPPER or UCASE_TITLE */
+ result=c+GET_SIGNED_DELTA(props);
+ }
+ } else {
+ const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
+ uint16_t excWord=*pe++;
+ int32_t full, index;
+
+ pe2=pe; /* separate pointer for the simple-mapping slot lookup at the end */
+
+ if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
+ /* use hardcoded conditions and mappings */
+ if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
+ /* default mappings */
+ if(c==0x49) {
+ /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
+ return 0x69;
+ } else if(c==0x130) {
+ /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
+ *pString=iDot;
+ return 2;
+ }
+ } else {
+ /* Turkic mappings */
+ if(c==0x49) {
+ /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
+ return 0x131;
+ } else if(c==0x130) {
+ /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
+ return 0x69;
+ }
+ }
+ } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { /* data-driven full (string) mappings */
+ GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
+
+ /* start of full case mapping strings */
+ ++pe;
+
+ /* skip the lowercase result string */
+ pe+=full&UCASE_FULL_LOWER;
+ full=(full>>4)&0xf; /* length of the case-folding result string */
+
+ if(full!=0) {
+ /* set the output pointer to the result string */
+ *pString=pe;
+
+ /* return the string length */
+ return full;
+ }
+ }
+
+ if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { /* prefer a dedicated folding slot */
+ index=UCASE_EXC_FOLD;
+ } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { /* otherwise the folding is the simple lowercase mapping */
+ index=UCASE_EXC_LOWER;
+ } else {
+ return ~c; /* neither slot present: c folds to itself */
+ }
+ GET_SLOT_VALUE(excWord, index, pe2, result);
+ }
+
+ return (result==c) ? ~result : result; /* ~c signals "maps to itself" */
+}
diff --git a/icuSources/common/ucase.h b/icuSources/common/ucase.h
new file mode 100644
index 00000000..f137d9bf
--- /dev/null
+++ b/icuSources/common/ucase.h
@@ -0,0 +1,301 @@
+/*
+*******************************************************************************
+*
+* Copyright (C) 2004, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: ucase.h
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2004aug30
+* created by: Markus W. Scherer
+*
+* Low-level Unicode character/string case mapping code.
+*/
+
+#ifndef __UCASE_H__
+#define __UCASE_H__
+
+#include "unicode/utypes.h"
+#include "unicode/uset.h"
+#include "uset_imp.h"
+#include "udataswp.h"
+
+U_CDECL_BEGIN
+
+/* library API -------------------------------------------------------------- */
+
+struct UCaseProps;
+typedef struct UCaseProps UCaseProps;
+
+U_CAPI UCaseProps * U_EXPORT2
+ucase_open(UErrorCode *pErrorCode);
+
+U_CAPI UCaseProps * U_EXPORT2
+ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode);
+
+U_CAPI void U_EXPORT2
+ucase_close(UCaseProps *csp);
+
+
+U_CAPI UCaseProps * U_EXPORT2
+ucase_getSingleton(UErrorCode *pErrorCode);
+
+
+U_CAPI int32_t U_EXPORT2
+ucase_swap(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode);
+
+U_CAPI void U_EXPORT2
+ucase_addPropertyStarts(const UCaseProps *csp, USetAdder *sa, UErrorCode *pErrorCode);
+
+/**
+ * Bit mask for getting just the options from a string compare options word
+ * that are relevant for case-insensitive string comparison.
+ * See uchar.h. Also include _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER.
+ * @internal
+ */
+#define _STRCASECMP_OPTIONS_MASK 0xffff
+
+/**
+ * Bit mask for getting just the options from a string compare options word
+ * that are relevant for case folding (of a single string or code point).
+ * See uchar.h.
+ * @internal
+ */
+#define _FOLD_CASE_OPTIONS_MASK 0xff
+
+/* single-code point functions */
+
+U_CAPI UChar32 U_EXPORT2
+ucase_tolower(const UCaseProps *csp, UChar32 c);
+
+U_CAPI UChar32 U_EXPORT2
+ucase_toupper(const UCaseProps *csp, UChar32 c);
+
+U_CAPI UChar32 U_EXPORT2
+ucase_totitle(const UCaseProps *csp, UChar32 c);
+
+U_CAPI UChar32 U_EXPORT2
+ucase_fold(UCaseProps *csp, UChar32 c, uint32_t options);
+
+/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
+U_CAPI int32_t U_EXPORT2
+ucase_getType(const UCaseProps *csp, UChar32 c);
+
+/** @return same as ucase_getType(), or <0 if c is case-ignorable */
+U_CAPI int32_t U_EXPORT2
+ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c);
+
+U_CAPI UBool U_EXPORT2
+ucase_isSoftDotted(const UCaseProps *csp, UChar32 c);
+
+U_CAPI UBool U_EXPORT2
+ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c);
+
+/* string case mapping functions */
+
+/**
+ * Iterator function for string case mappings, which need to look at the
+ * context (surrounding text) of a given character for conditional mappings.
+ *
+ * The iterator only needs to go backward or forward away from the
+ * character in question. It does not use any indexes on this interface.
+ * It does not support random access or an arbitrary change of
+ * iteration direction.
+ *
+ * The direction parameter either starts a new iteration (dir!=0) or continues the current one (dir==0).
+ *
+ * @param context A pointer to the iterator's working data.
+ * @param dir If <0 then start iterating backward from the character;
+ * if >0 then start iterating forward from the character;
+ * if 0 then continue iterating in the current direction.
+ * @return Next code point, or <0 when the iteration is done.
+ */
+typedef UChar32 U_CALLCONV
+UCaseContextIterator(void *context, int8_t dir);
+
+/**
+ * Sample struct which may be used by some implementations of
+ * UCaseContextIterator. NOTE(review): the field meanings are not documented here; presumably implementation-defined - confirm.
+ */
+struct UCaseContext {
+ void *p;
+ int32_t start, index, limit;
+ int32_t cpStart, cpLimit;
+ int8_t dir; /* presumably the current iteration direction, as in UCaseContextIterator's dir - TODO confirm */
+ int8_t b1, b2, b3;
+};
+typedef struct UCaseContext UCaseContext;
+
+enum {
+ /**
+ * For string case mappings, a single character (a code point) is mapped
+ * either to itself (in which case in-place mapping functions do nothing),
+ * or to another single code point, or to a string.
+ * Aside from the string contents, these are indicated with a single int32_t
+ * value as follows:
+ *
+ * Mapping to self: Negative values (~self instead of -self to support U+0000)
+ *
+ * Mapping to another code point: Positive values >UCASE_MAX_STRING_LENGTH
+ *
+ * Mapping to a string: The string length (0..UCASE_MAX_STRING_LENGTH) is
+ * returned. Note that the string result may indeed have zero length.
+ */
+ UCASE_MAX_STRING_LENGTH=0x1f
+};
+
+/**
+ * Get the full lowercase mapping for c.
+ *
+ * @param csp Case mapping properties.
+ * @param c Character to be mapped.
+ * @param iter Character iterator, used for context-sensitive mappings.
+ * See UCaseContextIterator for details.
+ * If iter==NULL then a context-independent result is returned.
+ * @param context Pointer to be passed into iter.
+ * @param pString If the mapping result is a string, then the pointer is
+ * written to *pString.
+ * @param locale Locale ID for locale-dependent mappings.
+ * @param locCache Initialize to 0; may be used to cache the result of parsing
+ * the locale ID for subsequent calls.
+ * Can be NULL.
+ * @return Output code point or string length, see UCASE_MAX_STRING_LENGTH.
+ *
+ * @see UCaseContextIterator
+ * @see UCASE_MAX_STRING_LENGTH
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+ucase_toFullLower(const UCaseProps *csp, UChar32 c,
+ UCaseContextIterator *iter, void *context,
+ const UChar **pString,
+ const char *locale, int32_t *locCache);
+
+U_CAPI int32_t U_EXPORT2
+ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
+ UCaseContextIterator *iter, void *context,
+ const UChar **pString,
+ const char *locale, int32_t *locCache);
+
+U_CAPI int32_t U_EXPORT2
+ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
+ UCaseContextIterator *iter, void *context,
+ const UChar **pString,
+ const char *locale, int32_t *locCache);
+
+U_CAPI int32_t U_EXPORT2
+ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
+ const UChar **pString,
+ uint32_t options);
+
+/* file definitions --------------------------------------------------------- */
+
+#define UCASE_DATA_NAME "ucase"
+#define UCASE_DATA_TYPE "icu"
+
+/* format "cAsE" */
+#define UCASE_FMT_0 0x63
+#define UCASE_FMT_1 0x41
+#define UCASE_FMT_2 0x53
+#define UCASE_FMT_3 0x45
+
+/* indexes into indexes[] */
+enum {
+ UCASE_IX_INDEX_TOP,
+ UCASE_IX_LENGTH,
+ UCASE_IX_TRIE_SIZE,
+ UCASE_IX_EXC_LENGTH,
+
+ UCASE_IX_MAX_FULL_LENGTH=15,
+ UCASE_IX_TOP=16
+};
+
+/* definitions for 16-bit case properties word ------------------------------ */
+
+/* 2-bit constants for types of cased characters */
+#define UCASE_TYPE_MASK 3
+enum {
+ UCASE_NONE,
+ UCASE_LOWER,
+ UCASE_UPPER,
+ UCASE_TITLE
+};
+
+#define UCASE_SENSITIVE 4
+#define UCASE_EXCEPTION 8
+
+#define UCASE_DOT_MASK 0x30
+enum {
+ UCASE_NO_DOT=0, /* normal characters with cc=0 */
+ UCASE_SOFT_DOTTED=0x10, /* soft-dotted characters with cc=0 */
+ UCASE_ABOVE=0x20, /* "above" accents with cc=230 */
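+ UCASE_OTHER_ACCENT=0x30 /* other accent character (0<cc!=230) */
+};
+
+/* no exception: bits 15..6 are a 10-bit signed case mapping delta */
+#define UCASE_DELTA_SHIFT 6
+#define UCASE_DELTA_MASK 0xffc0
+#define UCASE_MAX_DELTA 0x1ff
+#define UCASE_MIN_DELTA (-UCASE_MAX_DELTA-1)
+
+#define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)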
+
+/* case-ignorable uses one of the delta bits, see gencase/store.c */
+#define UCASE_CASE_IGNORABLE 0x40
+
+/* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */
+#define UCASE_EXC_SHIFT 4
+#define UCASE_EXC_MASK 0xfff0
+#define UCASE_MAX_EXCEPTIONS 0x1000
+
+/* definitions for 16-bit main exceptions word ------------------------------ */
+
+/* first 8 bits indicate values in optional slots */
+enum {
+ UCASE_EXC_LOWER,
+ UCASE_EXC_FOLD,
+ UCASE_EXC_UPPER,
+ UCASE_EXC_TITLE,
+ UCASE_EXC_4, /* reserved */
+ UCASE_EXC_5, /* reserved */
+ UCASE_EXC_6, /* reserved */
+ UCASE_EXC_FULL_MAPPINGS,
+ UCASE_EXC_ALL_SLOTS /* one past the last slot */
+};
+
+/* each slot is 2 uint16_t instead of 1 */
+#define UCASE_EXC_DOUBLE_SLOTS 0x100
+
+/* reserved: exception bits 11..9 */
+
+/* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK< Data has not been loaded.
+ * < 0 -> Error occured attempting to load data.
+ * > 0 -> Data has been successfully loaded.
+ */
/* index values loaded from uprops.dat */
static int32_t indexes[UPROPS_INDEX_COUNT];
@@ -93,8 +95,7 @@ isAcceptable(void *context,
}
}
-UBool
-uchar_cleanup()
+static UBool U_CALLCONV uchar_cleanup(void)
{
if (propsData) {
udata_close(propsData);
@@ -107,13 +108,55 @@ uchar_cleanup()
propsVectors=NULL;
countPropsVectors=0;
dataErrorCode=U_ZERO_ERROR;
- havePropsData=FALSE;
+ havePropsData=0;
return TRUE;
}
-static int8_t
-loadPropsData(void) {
+struct UCharProps { /* temporary holder for the data opened by _openProps() */
+ UDataMemory *propsData; /* handle returned by udata_openChoice() */
+ UTrie propsTrie, propsVectorsTrie; /* unserialized by _openProps(); the vectors trie is optional */
+ const uint32_t *pData32; /* udata_getMemory(propsData), viewed as 32-bit words */
+};
+typedef struct UCharProps UCharProps;
+
+/* open uprops.icu and unserialize its tries into *ucp; failures are reported through *pErrorCode */
+static void
+_openProps(UCharProps *ucp, UErrorCode *pErrorCode) {
+ const uint32_t *p;
+ int32_t length;
+
+ ucp->propsData=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ return;
+ }
+
+ ucp->pData32=p=(const uint32_t *)udata_getMemory(ucp->propsData);
+
+ /* unserialize the trie; it is directly after the int32_t indexes[UPROPS_INDEX_COUNT] */
+ length=(int32_t)p[UPROPS_PROPS32_INDEX]*4;
+ length=utrie_unserialize(&ucp->propsTrie, (const uint8_t *)(p+UPROPS_INDEX_COUNT), length-64, pErrorCode); /* NOTE(review): 64 is presumably the byte size of indexes[] - confirm */
+ if(U_FAILURE(*pErrorCode)) {
+ return; /* NOTE(review): ucp->propsData is not closed on this path; presumably the caller closes it - confirm */
+ }
+ ucp->propsTrie.getFoldingOffset=getFoldingPropsOffset;
+
+ /* unserialize the properties vectors trie, if any */
+ if( p[UPROPS_ADDITIONAL_TRIE_INDEX]!=0 &&
+ p[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0
+ ) {
+ length=(int32_t)(p[UPROPS_ADDITIONAL_VECTORS_INDEX]-p[UPROPS_ADDITIONAL_TRIE_INDEX])*4; /* the two indexes are 32-bit-word offsets, hence *4 for bytes */
+ length=utrie_unserialize(&ucp->propsVectorsTrie, (const uint8_t *)(p+p[UPROPS_ADDITIONAL_TRIE_INDEX]), length, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ uprv_memset(&ucp->propsVectorsTrie, 0, sizeof(ucp->propsVectorsTrie)); /* zero the vectors trie; NOTE(review): *pErrorCode is not reset here - confirm callers expect the failure code */
+ } else {
+ ucp->propsVectorsTrie.getFoldingOffset=getFoldingPropsOffset;
+ }
+ }
+}
+
+U_CFUNC int8_t
+uprv_loadPropsData(UErrorCode *pErrorCode) {
/* load Unicode character properties data from file if necessary */
/*
@@ -122,84 +165,201 @@ loadPropsData(void) {
* Check the readme and use u_init() if necessary.
*/
if(havePropsData==0) {
- UTrie trie={ 0 }, trie2={ 0 };
- UErrorCode errorCode=U_ZERO_ERROR;
- UDataMemory *data;
- const uint32_t *p=NULL;
- int32_t length;
+ UCharProps ucp={ NULL };
+ UCaseProps *csp;
- /* open the data outside the mutex block */
- data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
- dataErrorCode=errorCode;
- if(U_FAILURE(errorCode)) {
- return havePropsData=-1;
+ if(U_FAILURE(*pErrorCode)) {
+ return havePropsData;
}
- p=(const uint32_t *)udata_getMemory(data);
+ /* open the data outside the mutex block */
+ _openProps(&ucp, pErrorCode);
+
+ if(U_SUCCESS(*pErrorCode)) {
+ /* in the mutex block, set the data for this process */
+ umtx_lock(NULL);
+ if(propsData==NULL) {
+ propsData=ucp.propsData;
+ ucp.propsData=NULL;
+ pData32=ucp.pData32;
+ ucp.pData32=NULL;
+ uprv_memcpy(&propsTrie, &ucp.propsTrie, sizeof(propsTrie));
+ uprv_memcpy(&propsVectorsTrie, &ucp.propsVectorsTrie, sizeof(propsVectorsTrie));
+ csp=NULL;
+ }
- /* unserialize the trie; it is directly after the int32_t indexes[UPROPS_INDEX_COUNT] */
- length=(int32_t)p[UPROPS_PROPS32_INDEX]*4;
- length=utrie_unserialize(&trie, (const uint8_t *)(p+UPROPS_INDEX_COUNT), length-64, &errorCode);
- if(U_FAILURE(errorCode)) {
- dataErrorCode=errorCode;
- udata_close(data);
- return havePropsData=-1;
- }
- trie.getFoldingOffset=getFoldingPropsOffset;
-
- /* unserialize the properties vectors trie, if any */
- if( p[UPROPS_ADDITIONAL_TRIE_INDEX]!=0 &&
- p[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0
- ) {
- length=(int32_t)(p[UPROPS_ADDITIONAL_VECTORS_INDEX]-p[UPROPS_ADDITIONAL_TRIE_INDEX])*4;
- length=utrie_unserialize(&trie2, (const uint8_t *)(p+p[UPROPS_ADDITIONAL_TRIE_INDEX]), length, &errorCode);
- if(U_FAILURE(errorCode)) {
- uprv_memset(&trie2, 0, sizeof(trie2));
- } else {
- trie2.getFoldingOffset=getFoldingPropsOffset;
+ /* initialize some variables */
+ uprv_memcpy(indexes, pData32, sizeof(indexes));
+ props32Table=pData32+indexes[UPROPS_PROPS32_INDEX];
+ exceptionsTable=pData32+indexes[UPROPS_EXCEPTIONS_INDEX];
+ ucharsTable=(const UChar *)(pData32+indexes[UPROPS_EXCEPTIONS_TOP_INDEX]);
+
+ /* additional properties */
+ if(indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0) {
+ propsVectors=pData32+indexes[UPROPS_ADDITIONAL_VECTORS_INDEX];
+ countPropsVectors=indexes[UPROPS_RESERVED_INDEX]-indexes[UPROPS_ADDITIONAL_VECTORS_INDEX];
+ propsVectorsColumns=indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX];
}
- }
- /* in the mutex block, set the data for this process */
- umtx_lock(NULL);
- if(propsData==NULL) {
- propsData=data;
- data=NULL;
- pData32=p;
- p=NULL;
- uprv_memcpy(&propsTrie, &trie, sizeof(trie));
- uprv_memcpy(&propsVectorsTrie, &trie2, sizeof(trie2));
+ havePropsData=1;
+ umtx_unlock(NULL);
+ } else {
+ dataErrorCode=*pErrorCode;
+ havePropsData=-1;
}
- umtx_unlock(NULL);
-
- /* initialize some variables */
- uprv_memcpy(indexes, pData32, sizeof(indexes));
- props32Table=pData32+indexes[UPROPS_PROPS32_INDEX];
- exceptionsTable=pData32+indexes[UPROPS_EXCEPTIONS_INDEX];
- ucharsTable=(const UChar *)(pData32+indexes[UPROPS_EXCEPTIONS_TOP_INDEX]);
-
- /* additional properties */
- if(indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0) {
- propsVectors=pData32+indexes[UPROPS_ADDITIONAL_VECTORS_INDEX];
- countPropsVectors=indexes[UPROPS_RESERVED_INDEX]-indexes[UPROPS_ADDITIONAL_VECTORS_INDEX];
- propsVectorsColumns=indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX];
+ ucln_common_registerCleanup(UCLN_COMMON_UCHAR, uchar_cleanup);
+
+ /* if a different thread set it first, then close the extra data */
+ udata_close(ucp.propsData); /* NULL if it was set correctly */
+ }
+
+ return havePropsData;
+}
+
+
+static int8_t
+loadPropsData(void) {
+ UErrorCode errorCode = U_ZERO_ERROR;
+ int8_t retVal = uprv_loadPropsData(&errorCode);
+ return retVal;
+}
+
+
+/* Unicode properties data swapping ----------------------------------------- */
+
+U_CAPI int32_t U_EXPORT2
+uprops_swap(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode) {
+ const UDataInfo *pInfo;
+ int32_t headerSize, i;
+
+ int32_t dataIndexes[UPROPS_INDEX_COUNT];
+ const int32_t *inData32;
+
+ /* udata_swapDataHeader checks the arguments */
+ headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+
+ /* check data format and format version */
+ pInfo=(const UDataInfo *)((const char *)inData+4);
+ if(!(
+ pInfo->dataFormat[0]==0x55 && /* dataFormat="UPro" */
+ pInfo->dataFormat[1]==0x50 &&
+ pInfo->dataFormat[2]==0x72 &&
+ pInfo->dataFormat[3]==0x6f &&
+ pInfo->formatVersion[0]==3 &&
+ pInfo->formatVersion[2]==UTRIE_SHIFT &&
+ pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
+ )) {
+ udata_printError(ds, "uprops_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not a Unicode properties file\n",
+ pInfo->dataFormat[0], pInfo->dataFormat[1],
+ pInfo->dataFormat[2], pInfo->dataFormat[3],
+ pInfo->formatVersion[0]);
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+
+ /* the properties file must contain at least the indexes array */
+ if(length>=0 && (length-headerSize)=0) {
+ int32_t *outData32;
+
+ if((length-headerSize)<(4*dataIndexes[UPROPS_RESERVED_INDEX])) {
+ udata_printError(ds, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n",
+ length-headerSize);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
}
- havePropsData=1;
+ outData32=(int32_t *)((char *)outData+headerSize);
- /* if a different thread set it first, then close the extra data */
- if(data!=NULL) {
- udata_close(data); /* NULL if it was set correctly */
+ /* copy everything for inaccessible data (padding) */
+ if(inData32!=outData32) {
+ uprv_memcpy(outData32, inData32, 4*dataIndexes[UPROPS_RESERVED_INDEX]);
}
+
+ /* swap the indexes[16] */
+ ds->swapArray32(ds, inData32, 4*UPROPS_INDEX_COUNT, outData32, pErrorCode);
+
+ /*
+ * swap the main properties UTrie
+ * PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
+ */
+ utrie_swap(ds,
+ inData32+UPROPS_INDEX_COUNT,
+ 4*(dataIndexes[UPROPS_PROPS32_INDEX]-UPROPS_INDEX_COUNT),
+ outData32+UPROPS_INDEX_COUNT,
+ pErrorCode);
+
+ /*
+ * swap the properties and exceptions words
+ * P const uint32_t props32[i1-i0];
+ * E const uint32_t exceptions[i2-i1];
+ */
+ ds->swapArray32(ds,
+ inData32+dataIndexes[UPROPS_PROPS32_INDEX],
+ 4*(dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX]-dataIndexes[UPROPS_PROPS32_INDEX]),
+ outData32+dataIndexes[UPROPS_PROPS32_INDEX],
+ pErrorCode);
+
+ /*
+ * swap the UChars
+ * U const UChar uchars[2*(i3-i2)];
+ */
+ ds->swapArray16(ds,
+ inData32+dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX],
+ 4*(dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX]-dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX]),
+ outData32+dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX],
+ pErrorCode);
+
+ /*
+ * swap the additional UTrie
+ * i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
+ */
+ utrie_swap(ds,
+ inData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX],
+ 4*(dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]-dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX]),
+ outData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX],
+ pErrorCode);
+
+ /*
+ * swap the properties vectors
+ * PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
+ */
+ ds->swapArray32(ds,
+ inData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX],
+ 4*(dataIndexes[UPROPS_RESERVED_INDEX]-dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]),
+ outData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX],
+ pErrorCode);
}
- return havePropsData;
+ /* i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table */
+ return headerSize+4*dataIndexes[UPROPS_RESERVED_INDEX];
}
-/* constants and macros for access to the data */
+/* constants and macros for access to the data ------------------------------ */
/* getting a uint32_t properties word from the data */
-#define HAVE_DATA (havePropsData>0 || (havePropsData==0 && loadPropsData()>0))
+#define HAVE_DATA (havePropsData>0 || loadPropsData()>0)
#define VALIDATE(c) (((uint32_t)(c))<=0x10ffff && HAVE_DATA)
#define GET_PROPS_UNSAFE(c, result) \
UTRIE_GET16(&propsTrie, c, result); \
@@ -246,12 +406,17 @@ static const uint8_t flagsOffset[256]={
U_CFUNC UBool
uprv_haveProperties(UErrorCode *pErrorCode) {
- if(HAVE_DATA) {
- return TRUE;
- } else {
+ if(U_FAILURE(*pErrorCode)) {
+ return FALSE;
+ }
+ if(havePropsData==0) {
+ uprv_loadPropsData(pErrorCode);
+ }
+ if(havePropsData<0) {
*pErrorCode=dataErrorCode;
return FALSE;
}
+ return TRUE;
}
/* API functions ------------------------------------------------------------ */
@@ -353,6 +518,11 @@ u_isalpha(UChar32 c) {
return (UBool)((CAT_MASK(props)&U_GC_L_MASK)!=0);
}
+U_CAPI UBool U_EXPORT2
+u_isUAlphabetic(UChar32 c) {
+ return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC))!=0;
+}
+
/* Checks if ch is a letter or a decimal digit */
U_CAPI UBool U_EXPORT2
u_isalnum(UChar32 c) {
@@ -431,6 +601,11 @@ u_isblank(UChar32 c) {
}
}
+U_CAPI UBool U_EXPORT2
+u_isUWhiteSpace(UChar32 c) {
+ return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_WHITE_SPACE))!=0;
+}
+
/* Checks if the Unicode character is printable.*/
U_CAPI UBool U_EXPORT2
u_isprint(UChar32 c) {
@@ -518,79 +693,6 @@ u_isJavaIDPart(UChar32 c) {
u_isIDIgnorable(c));
}
-/* Transforms the Unicode character to its lower case equivalent.*/
-U_CAPI UChar32 U_EXPORT2
-u_tolower(UChar32 c) {
- uint32_t props;
- GET_PROPS(c, props);
- if(!PROPS_VALUE_IS_EXCEPTION(props)) {
- if(CAT_MASK(props)&(U_GC_LU_MASK|U_GC_LT_MASK)) {
- return c+GET_SIGNED_VALUE(props);
- }
- } else {
- const uint32_t *pe=GET_EXCEPTIONS(props);
- uint32_t firstExceptionValue=*pe;
- if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_LOWERCASE)) {
- int i=EXC_LOWERCASE;
- ++pe;
- ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
- return (UChar32)*pe;
- }
- }
- return c; /* no mapping - return c itself */
-}
-
-/* Transforms the Unicode character to its upper case equivalent.*/
-U_CAPI UChar32 U_EXPORT2
-u_toupper(UChar32 c) {
- uint32_t props;
- GET_PROPS(c, props);
- if(!PROPS_VALUE_IS_EXCEPTION(props)) {
- if(GET_CATEGORY(props)==U_LOWERCASE_LETTER) {
- return c-GET_SIGNED_VALUE(props);
- }
- } else {
- const uint32_t *pe=GET_EXCEPTIONS(props);
- uint32_t firstExceptionValue=*pe;
- if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_UPPERCASE)) {
- int i=EXC_UPPERCASE;
- ++pe;
- ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
- return (UChar32)*pe;
- }
- }
- return c; /* no mapping - return c itself */
-}
-
-/* Transforms the Unicode character to its title case equivalent.*/
-U_CAPI UChar32 U_EXPORT2
-u_totitle(UChar32 c) {
- uint32_t props;
- GET_PROPS(c, props);
- if(!PROPS_VALUE_IS_EXCEPTION(props)) {
- if(GET_CATEGORY(props)==U_LOWERCASE_LETTER) {
- /* here, titlecase is same as uppercase */
- return c-GET_SIGNED_VALUE(props);
- }
- } else {
- const uint32_t *pe=GET_EXCEPTIONS(props);
- uint32_t firstExceptionValue=*pe;
- if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_TITLECASE)) {
- int i=EXC_TITLECASE;
- ++pe;
- ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
- return (UChar32)*pe;
- } else if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_UPPERCASE)) {
- /* here, titlecase is same as uppercase */
- int i=EXC_UPPERCASE;
- ++pe;
- ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
- return (UChar32)*pe;
- }
- }
- return c; /* no mapping - return c itself */
-}
-
U_CAPI int32_t U_EXPORT2
u_charDigitValue(UChar32 c) {
uint32_t props, numericType;
@@ -826,83 +928,90 @@ uprv_getMaxValues(int32_t column) {
}
}
-static UBool U_CALLCONV
-_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
- /* add the start code point to the USet */
- uset_add((USet *)context, start);
- return TRUE;
+/*
+ * get Hangul Syllable Type
+ * implemented here so that uchar.c (uhst_addPropertyStarts())
+ * does not depend on uprops.c (u_getIntPropertyValue(c, UCHAR_HANGUL_SYLLABLE_TYPE))
+ */
+U_CFUNC UHangulSyllableType
+uchar_getHST(UChar32 c) {
+ /* purely algorithmic; hardcode known characters, check for assigned new ones */
+ if(c>UPROPS_AGE_SHIFT;
+ versionArray[0]=(uint8_t)(version>>4);
+ versionArray[1]=(uint8_t)(version&0xf);
+ versionArray[2]=versionArray[3]=0;
+ }
+}
+U_CAPI UScriptCode U_EXPORT2
+uscript_getScript(UChar32 c, UErrorCode *pErrorCode) {
+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+ if((uint32_t)c>0x10ffff) {
+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
+ }
+
+ return (UScriptCode)(u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_MASK);
+}
+
+U_CAPI UBlockCode U_EXPORT2
+ublock_getCode(UChar32 c) {
+ return (UBlockCode)((u_getUnicodeProperties(c, 0)&UPROPS_BLOCK_MASK)>>UPROPS_BLOCK_SHIFT);
+}
+
+/* property starts for UnicodeSet ------------------------------------------- */
+
+/* for Hangul_Syllable_Type */
U_CAPI void U_EXPORT2
-uchar_addPropertyStarts(USet *set, UErrorCode *pErrorCode) {
+uhst_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) {
UChar32 c;
int32_t value, value2;
+ if(U_FAILURE(*pErrorCode)) {
+ return;
+ }
+
if(!HAVE_DATA) {
*pErrorCode=dataErrorCode;
return;
}
- /* add the start code point of each same-value range of each trie */
- utrie_enum(&propsTrie, NULL, _enumPropertyStartsRange, set);
- utrie_enum(&propsVectorsTrie, NULL, _enumPropertyStartsRange, set);
-
/* add code points with hardcoded properties, plus the ones following them */
- /* add for IS_THAT_CONTROL_SPACE() */
- uset_add(set, TAB); /* range TAB..CR */
- uset_add(set, CR+1);
- uset_add(set, 0x1c);
- uset_add(set, 0x1f+1);
- USET_ADD_CP_AND_NEXT(set, NL);
-
- /* add for u_isIDIgnorable() what was not added above */
- uset_add(set, DEL); /* range DEL..NBSP-1, NBSP added below */
- uset_add(set, HAIRSP);
- uset_add(set, RLM+1);
- uset_add(set, INHSWAP);
- uset_add(set, NOMDIG+1);
- USET_ADD_CP_AND_NEXT(set, ZWNBSP);
-
- /* add no-break spaces for u_isWhitespace() what was not added above */
- USET_ADD_CP_AND_NEXT(set, NBSP);
- USET_ADD_CP_AND_NEXT(set, FIGURESP);
- USET_ADD_CP_AND_NEXT(set, NNBSP);
-
- /* add for u_charDigitValue() */
- USET_ADD_CP_AND_NEXT(set, 0x3007);
- USET_ADD_CP_AND_NEXT(set, 0x4e00);
- USET_ADD_CP_AND_NEXT(set, 0x4e8c);
- USET_ADD_CP_AND_NEXT(set, 0x4e09);
- USET_ADD_CP_AND_NEXT(set, 0x56db);
- USET_ADD_CP_AND_NEXT(set, 0x4e94);
- USET_ADD_CP_AND_NEXT(set, 0x516d);
- USET_ADD_CP_AND_NEXT(set, 0x4e03);
- USET_ADD_CP_AND_NEXT(set, 0x516b);
- USET_ADD_CP_AND_NEXT(set, 0x4e5d);
-
- /* add for u_digit() */
- uset_add(set, U_a);
- uset_add(set, U_z+1);
- uset_add(set, U_A);
- uset_add(set, U_Z+1);
-
- /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
- uset_add(set, WJ); /* range WJ..NOMDIG */
- uset_add(set, 0xfff0);
- uset_add(set, 0xfffb+1);
- uset_add(set, 0xe0000);
- uset_add(set, 0xe0fff+1);
-
- /* add for UCHAR_GRAPHEME_BASE and others */
- USET_ADD_CP_AND_NEXT(set, CGJ);
-
- /* add for UCHAR_JOINING_TYPE */
- uset_add(set, ZWNJ); /* range ZWNJ..ZWJ */
- uset_add(set, ZWJ+1);
-
/*
* Add Jamo type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE.
* First, we add fixed boundaries for the blocks of Jamos.
@@ -911,1148 +1020,120 @@ uchar_addPropertyStarts(USet *set, UErrorCode *pErrorCode) {
* at the end of the per-Jamo-block assignments in Unicode 4 or earlier.
* (These have not changed since Unicode 2.)
*/
- uset_add(set, 0x1100);
+ sa->add(sa->set, 0x1100);
value=U_HST_LEADING_JAMO;
for(c=0x115a; c<=0x115f; ++c) {
- value2=u_getIntPropertyValue(c, UCHAR_HANGUL_SYLLABLE_TYPE);
+ value2=uchar_getHST(c);
if(value!=value2) {
value=value2;
- uset_add(set, c);
+ sa->add(sa->set, c);
}
}
- uset_add(set, 0x1160);
+ sa->add(sa->set, 0x1160);
value=U_HST_VOWEL_JAMO;
for(c=0x11a3; c<=0x11a7; ++c) {
- value2=u_getIntPropertyValue(c, UCHAR_HANGUL_SYLLABLE_TYPE);
+ value2=uchar_getHST(c);
if(value!=value2) {
value=value2;
- uset_add(set, c);
+ sa->add(sa->set, c);
}
}
- uset_add(set, 0x11a8);
+ sa->add(sa->set, 0x11a8);
value=U_HST_TRAILING_JAMO;
for(c=0x11fa; c<=0x11ff; ++c) {
- value2=u_getIntPropertyValue(c, UCHAR_HANGUL_SYLLABLE_TYPE);
+ value2=uchar_getHST(c);
if(value!=value2) {
value=value2;
- uset_add(set, c);
+ sa->add(sa->set, c);
}
}
-
- /*
- * Omit code points with hardcoded specialcasing properties
- * because we do not build property UnicodeSets for them right now.
- */
-}
-
-/* string casing ------------------------------------------------------------ */
-
-/*
- * These internal string case mapping functions are here instead of ustring.c
- * because we need efficient access to the character properties.
- *
- * This section contains helper functions that check for conditions
- * in the input text surrounding the current code point
- * according to SpecialCasing.txt.
- *
- * Starting with ICU 2.1, the "surrounding text" is passed in as an instance of
- * UCharIterator to allow the core case mapping functions to be used
- * inside transliterators (using Replaceable instead of UnicodeString/UChar *)
- * etc.
- *
- * Each helper function gets the index
- * - after the current code point if it looks at following text
- * - before the current code point if it looks at preceding text
- *
- * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
- *
- * Final_Sigma
- * C is preceded by a sequence consisting of
- * a cased letter and a case-ignorable sequence,
- * and C is not followed by a sequence consisting of
- * an ignorable sequence and then a cased letter.
- *
- * More_Above
- * C is followed by one or more characters of combining class 230 (ABOVE)
- * in the combining character sequence.
- *
- * After_Soft_Dotted
- * The last preceding character with combining class of zero before C
- * was Soft_Dotted,
- * and there is no intervening combining character class 230 (ABOVE).
- *
- * Before_Dot
- * C is followed by combining dot above (U+0307).
- * Any sequence of characters with a combining class that is neither 0 nor 230
- * may intervene between the current character and the combining dot above.
- *
- * The erratum from 2002-10-31 adds the condition
- *
- * After_I
- * The last preceding base character was an uppercase I, and there is no
- * intervening combining character class 230 (ABOVE).
- *
- * (See Jitterbug 2344 and the comments on After_I below.)
- *
- * Helper definitions in Unicode 3.2 UAX 21:
- *
- * D1. A character C is defined to be cased
- * if it meets any of the following criteria:
- *
- * - The general category of C is Titlecase Letter (Lt)
- * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
- * - Given D = NFD(C), then it is not the case that:
- * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
- * (This third criterium does not add any characters to the list
- * for Unicode 3.2. Ignored.)
- *
- * D2. A character C is defined to be case-ignorable
- * if it meets either of the following criteria:
- *
- * - The general category of C is
- * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
- * Letter Modifier (Lm), or Symbol Modifier (Sk)
- * - C is one of the following characters
- * U+0027 APOSTROPHE
- * U+00AD SOFT HYPHEN (SHY)
- * U+2019 RIGHT SINGLE QUOTATION MARK
- * (the preferred character for apostrophe)
- *
- * D3. A case-ignorable sequence is a sequence of
- * zero or more case-ignorable characters.
- */
-
-#if UCONFIG_NO_NORMALIZATION
-/* no normalization - no combining classes */
-static U_INLINE uint8_t
-u_getCombiningClass(UChar32 c) {
- return 0;
-}
-#endif
-
-enum {
- LOC_ROOT,
- LOC_TURKISH,
- LOC_LITHUANIAN
-};
-
-static int32_t
-getCaseLocale(const char *locale) {
- char lang[32];
- UErrorCode errorCode;
- int32_t length;
-
- errorCode=U_ZERO_ERROR;
- length=uloc_getLanguage(locale, lang, sizeof(lang), &errorCode);
- if(U_FAILURE(errorCode) || length!=2) {
- return LOC_ROOT;
- }
-
- if( (lang[0]=='t' && lang[1]=='r') ||
- (lang[0]=='a' && lang[1]=='z')
- ) {
- return LOC_TURKISH;
- } else if(lang[0]=='l' && lang[1]=='t') {
- return LOC_LITHUANIAN;
- } else {
- return LOC_ROOT;
- }
-}
-
-/* Is case-ignorable? */
-static U_INLINE UBool
-isCaseIgnorable(UChar32 c, uint32_t category) {
- return (FLAG(category)&(_Mn|_Me|_Cf|_Lm|_Sk))!=0 ||
- c==0x27 || c==0xad || c==0x2019;
-}
-
-/* Is this a "cased" character? */
-static U_INLINE UBool
-isCased(UChar32 c, uint32_t category) {
- /* Lt+Uppercase+Lowercase = Lt+Lu+Ll+Other_Uppercase+Other_Lowercase */
- return (FLAG(category)&(_Lt|_Lu|_Ll))!=0 ||
- (u_getUnicodeProperties(c, 1)&(FLAG(UPROPS_UPPERCASE)|FLAG(UPROPS_LOWERCASE)))!=0;
-}
-
-/* Is Soft_Dotted? */
-static U_INLINE UBool
-isSoftDotted(UChar32 c) {
- return (u_getUnicodeProperties(c, 1)&FLAG(UPROPS_SOFT_DOTTED))!=0;
-}
-
-/* Is followed by {case-ignorable}* cased ? */
-static UBool
-isFollowedByCasedLetter(UCharIterator *iter, int32_t index) {
- /* This is volatile because AIX 5.1 Visual Age 5.0 in 32-bit mode can't
- optimize this correctly. It couldn't optimize (1UL<move(iter, index, UITER_ZERO);
- for(;;) {
- c=uiter_next32(iter);
- if(c<0) {
- break;
- }
- GET_PROPS_UNSAFE(c, props);
- category=GET_CATEGORY(props);
- if(isCased(c, category)) {
- return TRUE; /* followed by cased letter */
- }
- if(!isCaseIgnorable(c, category)) {
- return FALSE; /* not ignorable */
- }
- }
-
- return FALSE; /* not followed by cased letter */
-}
-
-/* Is preceded by cased {case-ignorable}* ? */
-static UBool
-isPrecededByCasedLetter(UCharIterator *iter, int32_t index) {
- /* This is volatile because AIX 5.1 Visual Age 5.0 in 32-bit mode can't
- optimize this correctly. It couldn't optimize (1UL<move(iter, index, UITER_ZERO);
- for(;;) {
- c=uiter_previous32(iter);
- if(c<0) {
- break;
- }
- GET_PROPS_UNSAFE(c, props);
- category=GET_CATEGORY(props);
- if(isCased(c, category)) {
- return TRUE; /* preceded by cased letter */
- }
- if(!isCaseIgnorable(c, category)) {
- return FALSE; /* not ignorable */
- }
- }
-
- return FALSE; /* not followed by cased letter */
-}
-
-/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
-static UBool
-isPrecededBySoftDotted(UCharIterator *iter, int32_t index) {
- int32_t c;
- uint8_t cc;
-
- if(iter==NULL) {
- return FALSE;
- }
-
- iter->move(iter, index, UITER_ZERO);
- for(;;) {
- c=uiter_previous32(iter);
- if(c<0) {
- break;
- }
- if(isSoftDotted(c)) {
- return TRUE; /* preceded by TYPE_i */
- }
-
- cc=u_getCombiningClass(c);
- if(cc==0 || cc==230) {
- return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
- }
+ /* Add Hangul type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE. */
+ for(c=HANGUL_BASE; c<(HANGUL_BASE+HANGUL_COUNT); c+=JAMO_T_COUNT) {
+ sa->add(sa->set, c);
+ sa->add(sa->set, c+1);
}
-
- return FALSE; /* not preceded by TYPE_i */
+ sa->add(sa->set, c);
}
-/*
- * See Jitterbug 2344:
- * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
- * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
- * we made those releases compatible with Unicode 3.2 which had not fixed
- * a related but in SpecialCasing.txt.
- *
- * From the Jitterbug 2344 text:
- * ... this bug is listed as a Unicode erratum
- * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
- *
- * There are two errors in SpecialCasing.txt.
- * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
- * 2. An incorrect context definition. Correct as follows:
- * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
- * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
- * ---
- * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
- * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
- * where the context After_I is defined as:
- * The last preceding base character was an uppercase I, and there is no
- * intervening combining character class 230 (ABOVE).
- *
- *
- * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
- *
- * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
- * # This matches the behavior of the canonically equivalent I-dot_above
- *
- * See also the description in this place in older versions of uchar.c (revision 1.100).
- *
- * Markus W. Scherer 2003-feb-15
- */
-
-/* Is preceded by base character 'I' with no intervening cc=230 ? */
-static UBool
-isPrecededBy_I(UCharIterator *iter, int32_t index) {
- int32_t c;
- uint8_t cc;
-
- if(iter==NULL) {
- return FALSE;
- }
-
- iter->move(iter, index, UITER_ZERO);
- for(;;) {
- c=uiter_previous32(iter);
- if(c<0) {
- break;
- }
- if(c==0x49) {
- return TRUE; /* preceded by I */
- }
-
- cc=u_getCombiningClass(c);
- if(cc==0 || cc==230) {
- return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
- }
- }
-
- return FALSE; /* not preceded by I */
-}
-
-/* Is followed by one or more cc==230 ? */
-static UBool
-isFollowedByMoreAbove(UCharIterator *iter, int32_t index) {
- int32_t c;
- uint8_t cc;
-
- if(iter==NULL) {
- return FALSE;
- }
-
- iter->move(iter, index, UITER_ZERO);
- for(;;) {
- c=uiter_next32(iter);
- if(c<0) {
- break;
- }
- cc=u_getCombiningClass(c);
- if(cc==230) {
- return TRUE; /* at least one cc==230 following */
- }
- if(cc==0) {
- return FALSE; /* next base character, no more cc==230 following */
- }
- }
-
- return FALSE; /* no more cc==230 following */
-}
-
-/* Is followed by a dot above (without cc==230 in between) ? */
-static UBool
-isFollowedByDotAbove(UCharIterator *iter, int32_t index) {
- int32_t c;
- uint8_t cc;
-
- if(iter==NULL) {
- return FALSE;
- }
-
- iter->move(iter, index, UITER_ZERO);
- for(;;) {
- c=uiter_next32(iter);
- if(c<0) {
- break;
- }
- if(c==0x307) {
- return TRUE;
- }
- cc=u_getCombiningClass(c);
- if(cc==0 || cc==230) {
- return FALSE; /* next base character or cc==230 in between */
- }
- }
-
- return FALSE; /* no dot above following */
-}
-
-/* lowercasing -------------------------------------------------------------- */
-
-/* internal, see ustr_imp.h */
-U_CAPI int32_t U_EXPORT2
-u_internalToLower(UChar32 c, UCharIterator *iter,
- UChar *dest, int32_t destCapacity,
- const char *locale) {
- UChar buffer[8];
- uint32_t props;
- UChar32 result;
- int32_t i, length;
-
- result=c;
- GET_PROPS(c, props);
- if(!PROPS_VALUE_IS_EXCEPTION(props)) {
- if(CAT_MASK(props)&(U_GC_LU_MASK|U_GC_LT_MASK)) {
- result=c+GET_SIGNED_VALUE(props);
- }
- } else {
- const UChar *u;
- const uint32_t *pe=GET_EXCEPTIONS(props);
- uint32_t firstExceptionValue=*pe, specialCasing;
- int32_t minLength;
-
- if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_SPECIAL_CASING)) {
- i=EXC_SPECIAL_CASING;
- ++pe;
- ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
- specialCasing=*pe;
- /* fill u and length with the case mapping result string */
- if(specialCasing&0x80000000) {
- /* use hardcoded conditions and mappings */
- int32_t loc=getCaseLocale(locale),
- srcIndex= iter!=NULL ? iter->getIndex(iter, UITER_CURRENT) : 0;
-
- /*
- * Test for conditional mappings first
- * (otherwise the unconditional default mappings are always taken),
- * then test for characters that have unconditional mappings in SpecialCasing.txt,
- * then get the UnicodeData.txt mappings.
- */
- if( loc==LOC_LITHUANIAN &&
- /* base characters, find accents above */
- (((c==0x49 || c==0x4a || c==0x12e) &&
- isFollowedByMoreAbove(iter, srcIndex)) ||
- /* precomposed with accent above, no need to find one */
- (c==0xcc || c==0xcd || c==0x128))
- ) {
- /*
- # Lithuanian
-
- # Lithuanian retains the dot in a lowercase i when followed by accents.
-
- # Introduce an explicit dot above when lowercasing capital I's and J's
- # whenever there are more accents above.
- # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
-
- 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
- 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
- 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
- 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
- 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
- 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
- */
- u=buffer;
- buffer[1]=0x307;
- switch(c) {
- case 0x49: /* LATIN CAPITAL LETTER I */
- buffer[0]=0x69;
- length=2;
- break;
- case 0x4a: /* LATIN CAPITAL LETTER J */
- buffer[0]=0x6a;
- length=2;
- break;
- case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
- buffer[0]=0x12f;
- length=2;
- break;
- case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
- buffer[0]=0x69;
- buffer[2]=0x300;
- length=3;
- break;
- case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
- buffer[0]=0x69;
- buffer[2]=0x301;
- length=3;
- break;
- case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
- buffer[0]=0x69;
- buffer[2]=0x303;
- length=3;
- break;
- default:
- return 0; /* will not occur */
- }
- /* # Turkish and Azeri */
- } else if(loc==LOC_TURKISH && c==0x130) {
- /*
- # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
- # The following rules handle those cases.
-
- 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
- 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
- */
- result=0x69;
- goto single;
- } else if(loc==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, srcIndex-1)) {
- /*
- # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
- # This matches the behavior of the canonically equivalent I-dot_above
-
- 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
- 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
- */
- return 0; /* remove the dot (continue without output) */
- } else if(loc==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, srcIndex)) {
- /*
- # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
-
- 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
- 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
- */
- result=0x131;
- goto single;
- } else if(c==0x130) {
- /*
- # Preserve canonical equivalence for I with dot. Turkic is handled below.
-
- 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
- */
- static const UChar iWithDot[2]={ 0x69, 0x307 };
- u=iWithDot;
- length=2;
- } else if( c==0x3a3 &&
- !isFollowedByCasedLetter(iter, srcIndex) &&
- isPrecededByCasedLetter(iter, srcIndex-1)
- ) {
- /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
- /*
- # Special case for final form of sigma
-
- 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
- */
- result=0x3c2; /* greek small final sigma */
- goto single;
- } else {
- /* no known conditional special case mapping, use a normal mapping */
- pe=GET_EXCEPTIONS(props); /* restore the initial exception pointer */
- firstExceptionValue=*pe;
- goto notSpecial;
- }
- } else {
- /* get the special case mapping string from the data file */
- u=ucharsTable+(specialCasing&0xffff);
- length=(int32_t)((*u++)&0x1f);
- }
-
- /* copy the result string */
- minLength = (length < destCapacity) ? length : destCapacity;
- i=0;
- while(idestCapacity) {
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
- }
- return destIndex;
+static UBool U_CALLCONV
+_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
+ /* add the start code point to the USet */
+ USetAdder *sa=(USetAdder *)context;
+ sa->add(sa->set, start);
+ return TRUE;
}
-/* uppercasing -------------------------------------------------------------- */
-
-/* internal */
-static int32_t
-u_internalToUpperOrTitle(UChar32 c, UCharIterator *iter,
- UChar *dest, int32_t destCapacity,
- const char *locale,
- UBool upperNotTitle) {
- uint32_t props;
- UChar32 result;
- int32_t i, length;
-
- result=c;
- GET_PROPS(c, props);
- if(!PROPS_VALUE_IS_EXCEPTION(props)) {
- if(GET_CATEGORY(props)==U_LOWERCASE_LETTER) {
- result=c-GET_SIGNED_VALUE(props);
- }
- } else {
- const UChar *u;
- const uint32_t *pe=GET_EXCEPTIONS(props);
- uint32_t firstExceptionValue=*pe, specialCasing;
- if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_SPECIAL_CASING)) {
- i=EXC_SPECIAL_CASING;
- ++pe;
- ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
- specialCasing=*pe;
- /* fill u and length with the case mapping result string */
- if(specialCasing&0x80000000) {
- /* use hardcoded conditions and mappings */
- int32_t loc=getCaseLocale(locale),
- srcIndex= iter!=NULL ? iter->getIndex(iter, UITER_CURRENT) : 0;
-
- if(loc==LOC_TURKISH && c==0x69) {
- /*
- # Turkish and Azeri
-
- # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
- # The following rules handle those cases.
-
- # When uppercasing, i turns into a dotted capital I
-
- 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
- 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
- */
- result=0x130;
- goto single;
- } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, srcIndex-1)) {
- /*
- # Lithuanian
-
- # Lithuanian retains the dot in a lowercase i when followed by accents.
-
- # Remove DOT ABOVE after "i" with upper or titlecase
-
- 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
- */
- return 0; /* remove the dot (continue without output) */
- } else {
- /* no known conditional special case mapping, use a normal mapping */
- pe=GET_EXCEPTIONS(props); /* restore the initial exception pointer */
- firstExceptionValue=*pe;
- goto notSpecial;
- }
- } else {
- /* get the special case mapping string from the data file */
- u=ucharsTable+(specialCasing&0xffff);
- length=(int32_t)*u++;
-
- /* skip the lowercase result string */
- u+=length&0x1f;
- if(upperNotTitle) {
- length=(length>>5)&0x1f;
- } else {
- /* skip the uppercase result strings too */
- u+=(length>>5)&0x1f;
- length=(length>>10)&0x1f;
- }
- }
-
- /* copy the result string */
- i=0;
- while(iadd(sa->set, cp); sa->add(sa->set, cp+1)
-single:
- length=UTF_CHAR_LENGTH(result);
- if(length<=destCapacity) {
- /* write result to dest */
- i=0;
- UTF_APPEND_CHAR_UNSAFE(dest, i, result);
+U_CAPI void U_EXPORT2
+uchar_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) {
+ if(U_FAILURE(*pErrorCode)) {
+ return;
}
- return (result==c) ? -length : length;
-}
-
-/* internal, see ustr_imp.h */
-U_CAPI int32_t U_EXPORT2
-u_internalToUpper(UChar32 c, UCharIterator *iter,
- UChar *dest, int32_t destCapacity,
- const char *locale) {
- return u_internalToUpperOrTitle(c, iter, dest, destCapacity, locale, TRUE);
-}
-
-U_CFUNC int32_t
-u_internalStrToUpper(UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- const char *locale,
- UErrorCode *pErrorCode) {
- UCharIterator iter;
- uint32_t props;
- int32_t srcIndex, destIndex;
- UChar32 c;
- /* test early, once, if there is a data file */
if(!HAVE_DATA) {
- *pErrorCode=U_FILE_ACCESS_ERROR;
- return 0;
- }
-
- /* set up local variables */
- uiter_setString(&iter, src, srcLength);
-
- /* case mapping loop */
- srcIndex=destIndex=0;
- while(srcIndexdestCapacity) {
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ *pErrorCode=dataErrorCode;
+ return;
}
- return destIndex;
-}
-
-/* titlecasing -------------------------------------------------------------- */
-
-/* internal, see ustr_imp.h */
-U_CAPI int32_t U_EXPORT2
-u_internalToTitle(UChar32 c, UCharIterator *iter,
- UChar *dest, int32_t destCapacity,
- const char *locale) {
- return u_internalToUpperOrTitle(c, iter, dest, destCapacity, locale, FALSE);
-}
-/* case folding ------------------------------------------------------------- */
-
-/*
- * Case folding is similar to lowercasing.
- * The result may be a simple mapping, i.e., a single code point, or
- * a full mapping, i.e., a string.
- * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
- * then only the lowercase mapping is stored.
- *
- * Some special cases are hardcoded because their conditions cannot be
- * parsed and processed from CaseFolding.txt.
- *
- * Unicode 3.2 CaseFolding.txt specifies for its status field:
-
-# C: common case folding, common mappings shared by both simple and full mappings.
-# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
-# S: simple case folding, mappings to single characters where different from F.
-# T: special case for uppercase I and dotted uppercase I
-# - For non-Turkic languages, this mapping is normally not used.
-# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
-#
-# Usage:
-# A. To do a simple case folding, use the mappings with status C + S.
-# B. To do a full case folding, use the mappings with status C + F.
-#
-# The mappings with status T can be used or omitted depending on the desired case-folding
-# behavior. (The default option is to exclude them.)
-
- * Unicode 3.2 has 'T' mappings as follows:
-
-0049; T; 0131; # LATIN CAPITAL LETTER I
-0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
-
- * while the default mappings for these code points are:
-
-0049; C; 0069; # LATIN CAPITAL LETTER I
-0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
-
- * U+0130 is otherwise lowercased to U+0069 (UnicodeData.txt).
- *
- * In case this code is used with CaseFolding.txt from an older version of Unicode
- * where CaseFolding.txt contains mappings with a status of 'I' that
- * have the opposite polarity ('I' mappings are included by default but excluded for Turkic),
- * we must also hardcode the Unicode 3.2 mappings for the code points
- * with 'I' mappings.
- * Unicode 3.1.1 has 'I' mappings for U+0130 and U+0131.
- * Unicode 3.2 has a 'T' mapping for U+0130, and lowercases U+0131 to itself (see UnicodeData.txt).
- */
-
-/* return the simple case folding mapping for c */
-U_CAPI UChar32 U_EXPORT2
-u_foldCase(UChar32 c, uint32_t options) {
- uint32_t props;
- GET_PROPS(c, props);
- if(!PROPS_VALUE_IS_EXCEPTION(props)) {
- if(CAT_MASK(props)&(U_GC_LU_MASK|U_GC_LT_MASK)) {
- return c+GET_SIGNED_VALUE(props);
- }
- } else {
- const uint32_t *pe=GET_EXCEPTIONS(props);
- uint32_t firstExceptionValue=*pe;
- if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_CASE_FOLDING)) {
- const uint32_t *oldPE=pe;
- int i=EXC_CASE_FOLDING;
- ++pe;
- ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
- props=*pe;
- if(props!=0) {
- /* return the simple mapping, if there is one */
- const UChar *uchars=ucharsTable+(props&0xffff);
- UChar32 simple;
- i=0;
- UTF_NEXT_CHAR_UNSAFE(uchars, i, simple);
- if(simple!=0) {
- return simple;
- }
- /* fall through to use the lowercase exception value if there is no simple mapping */
- pe=oldPE;
- } else {
- /* special case folding mappings, hardcoded */
- if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
- /* default mappings */
- if(c==0x49) {
- /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
- return 0x69;
- } else if(c==0x130) {
- /* no simple default mapping for U+0130, use UnicodeData.txt */
- return 0x69;
- }
- } else {
- /* Turkic mappings */
- if(c==0x49) {
- /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
- return 0x131;
- } else if(c==0x130) {
- /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
- return 0x69;
- }
- }
- /* return c itself because there is no special mapping for it */
- return c;
- }
- }
- /* not else! - allow to fall through from above */
- if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_LOWERCASE)) {
- int i=EXC_LOWERCASE;
- ++pe;
- ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
- return (UChar32)*pe;
- }
- }
- return c; /* no mapping - return c itself */
-}
+ /* add the start code point of each same-value range of each trie */
+ utrie_enum(&propsTrie, NULL, _enumPropertyStartsRange, sa);
+ utrie_enum(&propsVectorsTrie, NULL, _enumPropertyStartsRange, sa);
-/*
- * Issue for canonical caseless match (UAX #21):
- * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
- * canonical equivalence, unlike default-option casefolding.
- * For example, I-grave and I + grave fold to strings that are not canonically
- * equivalent.
- * For more details, see the comment in unorm_compare() in unorm.cpp
- * and the intermediate prototype changes for Jitterbug 2021.
- * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
- *
- * This did not get fixed because it appears that it is not possible to fix
- * it for uppercase and lowercase characters (I-grave vs. i-grave)
- * together in a way that they still fold to common result strings.
- */
+ /* add code points with hardcoded properties, plus the ones following them */
-/* internal, see ustr_imp.h */
-U_CAPI int32_t U_EXPORT2
-u_internalFoldCase(UChar32 c,
- UChar *dest, int32_t destCapacity,
- uint32_t options) {
- uint32_t props;
- UChar32 result;
- int32_t i, length;
-
- result=c;
- GET_PROPS_UNSAFE(c, props);
- if(!PROPS_VALUE_IS_EXCEPTION(props)) {
- if(CAT_MASK(props)&(U_GC_LU_MASK|U_GC_LT_MASK)) {
- /* same as lowercase */
- result=c+GET_SIGNED_VALUE(props);
- }
- } else {
- const uint32_t *pe=GET_EXCEPTIONS(props);
- uint32_t firstExceptionValue=*pe;
- if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_CASE_FOLDING)) {
- i=EXC_CASE_FOLDING;
- ++pe;
- ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
- props=*pe;
- if(props!=0) {
- /* return the full mapping */
- const UChar *uchars=ucharsTable+(props&0xffff)+2;
- int32_t minLength;
-
- length=props>>24;
- minLength = (length < destCapacity) ? length : destCapacity;
-
- /* copy the result string */
- i=0;
- while(iadd(sa->set, TAB); /* range TAB..CR */
+ sa->add(sa->set, CR+1);
+ sa->add(sa->set, 0x1c);
+ sa->add(sa->set, 0x1f+1);
+ USET_ADD_CP_AND_NEXT(sa, NL);
-/* single: */
- length=UTF_CHAR_LENGTH(result);
- if(length<=destCapacity) {
- /* write result to dest */
- i=0;
- UTF_APPEND_CHAR_UNSAFE(dest, i, result);
- }
- return (result==c) ? -length : length;
-}
+ /* add for u_isIDIgnorable() what was not added above */
+ sa->add(sa->set, DEL); /* range DEL..NBSP-1, NBSP added below */
+ sa->add(sa->set, HAIRSP);
+ sa->add(sa->set, RLM+1);
+ sa->add(sa->set, INHSWAP);
+ sa->add(sa->set, NOMDIG+1);
+ USET_ADD_CP_AND_NEXT(sa, ZWNBSP);
-/* case-fold the source string using the full mappings */
-U_CFUNC int32_t
-u_internalStrFoldCase(UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- uint32_t options,
- UErrorCode *pErrorCode) {
- uint32_t props;
- int32_t srcIndex, destIndex;
- UChar32 c;
+ /* add no-break spaces for u_isWhitespace() what was not added above */
+ USET_ADD_CP_AND_NEXT(sa, NBSP);
+ USET_ADD_CP_AND_NEXT(sa, FIGURESP);
+ USET_ADD_CP_AND_NEXT(sa, NNBSP);
- /* test early, once, if there is a data file */
- if(!HAVE_DATA) {
- *pErrorCode=U_FILE_ACCESS_ERROR;
- return 0;
- }
+ /* add for u_charDigitValue() */
+ USET_ADD_CP_AND_NEXT(sa, 0x3007);
+ USET_ADD_CP_AND_NEXT(sa, 0x4e00);
+ USET_ADD_CP_AND_NEXT(sa, 0x4e8c);
+ USET_ADD_CP_AND_NEXT(sa, 0x4e09);
+ USET_ADD_CP_AND_NEXT(sa, 0x56db);
+ USET_ADD_CP_AND_NEXT(sa, 0x4e94);
+ USET_ADD_CP_AND_NEXT(sa, 0x516d);
+ USET_ADD_CP_AND_NEXT(sa, 0x4e03);
+ USET_ADD_CP_AND_NEXT(sa, 0x516b);
+ USET_ADD_CP_AND_NEXT(sa, 0x4e5d);
- /* case mapping loop */
- srcIndex=destIndex=0;
- while(srcIndexadd(sa->set, U_a);
+ sa->add(sa->set, U_z+1);
+ sa->add(sa->set, U_A);
+ sa->add(sa->set, U_Z+1);
- /* handle 1:1 code point mappings from UnicodeData.txt */
- if(c<=0xffff) {
- if(destIndexadd(sa->set, WJ); /* range WJ..NOMDIG */
+ sa->add(sa->set, 0xfff0);
+ sa->add(sa->set, 0xfffb+1);
+ sa->add(sa->set, 0xe0000);
+ sa->add(sa->set, 0xe0fff+1);
- if(destIndexdestCapacity) {
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
- }
- return destIndex;
+ /* add for UCHAR_JOINING_TYPE */
+ sa->add(sa->set, ZWNJ); /* range ZWNJ..ZWJ */
+ sa->add(sa->set, ZWJ+1);
}
diff --git a/icuSources/common/uchriter.cpp b/icuSources/common/uchriter.cpp
index 2573baa9..ae73695a 100644
--- a/icuSources/common/uchriter.cpp
+++ b/icuSources/common/uchriter.cpp
@@ -1,6 +1,6 @@
/*
******************************************************************************
-* Copyright (C) 1998-2001, International Business Machines Corporation and *
+* Copyright (C) 1998-2004, International Business Machines Corporation and *
* others. All Rights Reserved. *
******************************************************************************
*/
@@ -11,7 +11,7 @@
U_NAMESPACE_BEGIN
-const char UCharCharacterIterator::fgClassID = 0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCharCharacterIterator)
UCharCharacterIterator::UCharCharacterIterator()
: CharacterIterator(),
diff --git a/icuSources/common/ucln.h b/icuSources/common/ucln.h
index cb3150dc..2f3f98c8 100644
--- a/icuSources/common/ucln.h
+++ b/icuSources/common/ucln.h
@@ -1,7 +1,7 @@
/*
******************************************************************************
* *
-* Copyright (C) 2001-2003, International Business Machines *
+* Copyright (C) 2001-2004, International Business Machines *
* Corporation and others. All Rights Reserved. *
* *
******************************************************************************
@@ -50,14 +50,16 @@
typedef enum ECleanupLibraryType {
UCLN_START = -1,
UCLN_CUSTOM, /* Custom is for anyone else. */
- UCLN_LAYOUT,
UCLN_LAYOUTEX,
- UCLN_USTDIO,
+ UCLN_LAYOUT,
+ UCLN_IO,
UCLN_I18N,
UCLN_COMMON /* This must be the last one to cleanup. */
} ECleanupLibraryType;
-typedef UBool cleanupFunc(void);
+U_CDECL_BEGIN
+typedef UBool U_CALLCONV cleanupFunc(void);
+U_CDECL_END
U_CAPI void U_EXPORT2 ucln_registerCleanup(ECleanupLibraryType type,
cleanupFunc *func);
diff --git a/icuSources/common/ucln_cmn.c b/icuSources/common/ucln_cmn.c
index e74f3015..c14b3dbc 100644
--- a/icuSources/common/ucln_cmn.c
+++ b/icuSources/common/ucln_cmn.c
@@ -1,7 +1,7 @@
/*
******************************************************************************
* *
-* Copyright (C) 2001-2003, International Business Machines *
+* Copyright (C) 2001-2004, International Business Machines *
* Corporation and others. All Rights Reserved. *
* *
******************************************************************************
@@ -16,113 +16,37 @@
#include "unicode/utypes.h"
#include "unicode/uclean.h"
+#include "utracimp.h"
#include "ustr_imp.h"
#include "unormimp.h"
#include "ucln_cmn.h"
#include "umutex.h"
#include "ucln.h"
+#include "cmemory.h"
+#include "uassert.h"
-static cleanupFunc *gCleanupFunctions[UCLN_COMMON] = {
- NULL,
- NULL,
- NULL,
- NULL,
- NULL
-};
+static cleanupFunc *gCommonCleanupFunctions[UCLN_COMMON_COUNT];
-U_CAPI void U_EXPORT2
-ucln_registerCleanup(ECleanupLibraryType type,
- cleanupFunc *func)
+void ucln_common_registerCleanup(ECleanupCommonType type,
+ cleanupFunc *func)
{
- if (UCLN_START < type && type < UCLN_COMMON)
+ U_ASSERT(UCLN_COMMON_START < type && type < UCLN_COMMON_COUNT);
+ if (UCLN_COMMON_START < type && type < UCLN_COMMON_COUNT)
{
- gCleanupFunctions[type] = func;
+ gCommonCleanupFunctions[type] = func;
}
}
-/************************************************
- The cleanup order is important in this function.
- Please be sure that you have read ucln.h
- ************************************************/
-U_CAPI void U_EXPORT2
-u_cleanup(void)
-{
+U_CFUNC UBool ucln_common_lib_cleanup(void) {
+ ECleanupCommonType commonFunc;
- ECleanupLibraryType libType = UCLN_START;
- while (++libType < UCLN_COMMON)
- {
- if (gCleanupFunctions[libType])
+ for (commonFunc = UCLN_COMMON_START+1; commonFunc
+#endif
+
+U_CFUNC uint16_t
+udata_getHeaderSize(const DataHeader *udh) {
+ if(udh==NULL) {
+ return 0;
+ } else if(udh->info.isBigEndian==U_IS_BIG_ENDIAN) {
+ /* same endianness */
+ return udh->dataHeader.headerSize;
+ } else {
+ /* opposite endianness */
+ uint16_t x=udh->dataHeader.headerSize;
+ return (uint16_t)((x<<8)|(x>>8));
+ }
+}
+
+U_CFUNC uint16_t
+udata_getInfoSize(const UDataInfo *info) {
+ if(info==NULL) {
+ return 0;
+ } else if(info->isBigEndian==U_IS_BIG_ENDIAN) {
+ /* same endianness */
+ return info->size;
+ } else {
+ /* opposite endianness */
+ uint16_t x=info->size;
+ return (uint16_t)((x<<8)|(x>>8));
+ }
+}
-/*----------------------------------------------------------------------------------*
- * *
- * Pointer TOCs. TODO: This form of table-of-contents should be removed because *
- * DLLs must be relocated on loading to correct the pointer values *
- * and this operation makes shared memory mapping of the data *
- * much less likely to work. *
- * *
- *----------------------------------------------------------------------------------*/
+/*-----------------------------------------------------------------------------*
+ * *
+ * Pointer TOCs. TODO: This form of table-of-contents should be removed *
+ * because DLLs must be relocated on loading to correct the *
+ * pointer values and this operation makes shared memory *
+ * mapping of the data much less likely to work. *
+ * *
+ *-----------------------------------------------------------------------------*/
typedef struct {
const char *entryName;
const DataHeader *pHeader;
@@ -48,30 +79,19 @@ typedef struct {
} PointerTOC;
+/* definition of OffsetTOC struct types moved to ucmndata.h */
-typedef struct {
- int32_t nameOffset;
- int32_t dataOffset;
-} OffsetTOCEntry;
-
-
-typedef struct {
- uint32_t count;
- OffsetTOCEntry entry[2]; /* Acutal size of array is from count. */
-} OffsetTOC;
-
-
-/*----------------------------------------------------------------------------------*
- * *
- * entry point lookup implementations *
- * *
- *----------------------------------------------------------------------------------*/
+/*-----------------------------------------------------------------------------*
+ * *
+ * entry point lookup implementations *
+ * *
+ *-----------------------------------------------------------------------------*/
static uint32_t offsetTOCEntryCount(const UDataMemory *pData) {
int32_t retVal=0;
- const OffsetTOC *toc = (OffsetTOC *)pData->toc;
+ const UDataOffsetTOC *toc = (UDataOffsetTOC *)pData->toc;
if (toc != NULL) {
retVal = toc->count;
- }
+ }
return retVal;
}
@@ -79,40 +99,60 @@ static uint32_t offsetTOCEntryCount(const UDataMemory *pData) {
static const DataHeader *
offsetTOCLookupFn(const UDataMemory *pData,
const char *tocEntryName,
+ int32_t *pLength,
UErrorCode *pErrorCode) {
- const OffsetTOC *toc = (OffsetTOC *)pData->toc;
+ const UDataOffsetTOC *toc = (UDataOffsetTOC *)pData->toc;
if(toc!=NULL) {
const char *base=(const char *)pData->toc;
- uint32_t start, limit, number;
+ uint32_t start, limit, number, lastNumber;
+ int32_t strResult;
+ const UDataOffsetTOCEntry *entry;
/* perform a binary search for the data in the common data's table of contents */
+#if defined (UDATA_DEBUG_DUMP)
+ /* list the contents of the TOC each time .. not recommended */
+ for(start=0;start<toc->count;start++) {
+ fprintf(stderr, "\tx%d: %s\n", start, &base[toc->entry[start].nameOffset]);
+ }
+#endif
+
start=0;
limit=toc->count; /* number of names in this table of contents */
+ lastNumber=limit;
+ entry=toc->entry;
if (limit == 0) { /* Stub common data library used during build is empty. */
return NULL;
}
- while(start<limit-1) {
- number=(start+limit)/2;
- if(uprv_strcmp(tocEntryName, &base[toc->entry[number].nameOffset])<0) {
+ for (;;) {
+ number = (start+limit)/2;
+ if (lastNumber == number) { /* Have we moved? */
+ break; /* We haven't moved, and it wasn't found. */
+ }
+ lastNumber = number;
+ strResult = uprv_strcmp(tocEntryName, base+entry[number].nameOffset);
+ if(strResult<0) {
limit=number;
- } else {
+ } else if (strResult>0) {
start=number;
}
- }
-
- if(uprv_strcmp(tocEntryName, &base[toc->entry[start].nameOffset])==0) {
- /* found it */
+ else {
+ /* found it */
#ifdef UDATA_DEBUG
-/* fprintf(stderr, "Found: %p\n",(base+toc[2*start+1])) */
- fprintf(stderr, "Found it\n");
+ fprintf(stderr, "%s: Found.\n", tocEntryName);
#endif
- return (const DataHeader *)&base[toc->entry[start].dataOffset];
- } else {
+ entry += number; /* Alias the entry to the current entry. */
+ if((number+1) < toc->count) {
+ *pLength = (int32_t)(entry[1].dataOffset - entry->dataOffset);
+ } else {
+ *pLength = -1;
+ }
+ return (const DataHeader *)(base+entry->dataOffset);
+ }
+ }
#ifdef UDATA_DEBUG
- fprintf(stderr, "Not found.\n");
+ fprintf(stderr, "%s: Not found.\n", tocEntryName);
#endif
- return NULL;
- }
+ return NULL;
} else {
#ifdef UDATA_DEBUG
fprintf(stderr, "returning header\n");
@@ -135,34 +175,54 @@ static uint32_t pointerTOCEntryCount(const UDataMemory *pData) {
static const DataHeader *pointerTOCLookupFn(const UDataMemory *pData,
const char *name,
+ int32_t *pLength,
UErrorCode *pErrorCode) {
if(pData->toc!=NULL) {
const PointerTOC *toc = (PointerTOC *)pData->toc;
- uint32_t start, limit, number;
+ uint32_t start, limit, number, lastNumber;
+ int32_t strResult;
+
+#if defined (UDATA_DEBUG_DUMP)
+ /* list the contents of the TOC each time .. not recommended */
+ for(start=0;start<toc->count;start++) {
+ fprintf(stderr, "\tx%d: %s\n", start, toc->entry[start].entryName);
+ }
+#endif
/* perform a binary search for the data in the common data's table of contents */
start=0;
- limit=toc->count;
+ limit=toc->count;
+ lastNumber=limit;
if (limit == 0) { /* Stub common data library used during build is empty. */
return NULL;
}
- while(start<limit-1) {
- number=(start+limit)/2;
- if(uprv_strcmp(name, toc->entry[number].entryName)<0) {
+ for (;;) {
+ number = (start+limit)/2;
+ if (lastNumber == number) { /* Have we moved? */
+ break; /* We haven't moved, and it wasn't found. */
+ }
+ lastNumber = number;
+ strResult = uprv_strcmp(name, toc->entry[number].entryName);
+ if(strResult<0) {
limit=number;
- } else {
+ } else if (strResult>0) {
start=number;
}
+ else {
+ /* found it */
+#ifdef UDATA_DEBUG
+ fprintf(stderr, "%s: Found.\n", toc->entry[number].entryName);
+#endif
+ *pLength=-1;
+ return UDataMemory_normalizeDataPointer(toc->entry[number].pHeader);
+ }
}
-
- if(uprv_strcmp(name, toc->entry[start].entryName)==0) {
- /* found it */
- return UDataMemory_normalizeDataPointer(toc->entry[start].pHeader);
- } else {
- return NULL;
- }
+#ifdef UDATA_DEBUG
+ fprintf(stderr, "%s: Not found.\n", name);
+#endif
+ return NULL;
} else {
return pData->pHeader;
}
@@ -176,7 +236,7 @@ static const commonDataFuncs ToCPFuncs = {pointerTOCLookupFn, pointerTOCEntryCou
/*----------------------------------------------------------------------*
* *
* checkCommonData Validate the format of a common data file. *
- * Fill in the virtual function ptr based on TOC type *
+ * Fill in the virtual function ptr based on TOC type *
* If the data is invalid, close the UDataMemory *
* and set the appropriate error code. *
* *
@@ -202,7 +262,7 @@ void udata_checkCommonData(UDataMemory *udm, UErrorCode *err) {
) {
/* dataFormat="CmnD" */
udm->vFuncs = &CmnDFuncs;
- udm->toc=(const char *)udm->pHeader+udm->pHeader->dataHeader.headerSize;
+ udm->toc=(const char *)udm->pHeader+udata_getHeaderSize(udm->pHeader);
}
else if(udm->pHeader->info.dataFormat[0]==0x54 &&
udm->pHeader->info.dataFormat[1]==0x6f &&
@@ -212,7 +272,7 @@ void udata_checkCommonData(UDataMemory *udm, UErrorCode *err) {
) {
/* dataFormat="ToCP" */
udm->vFuncs = &ToCPFuncs;
- udm->toc=(const char *)udm->pHeader+udm->pHeader->dataHeader.headerSize;
+ udm->toc=(const char *)udm->pHeader+udata_getHeaderSize(udm->pHeader);
}
else {
/* dataFormat not recognized */
@@ -228,3 +288,22 @@ void udata_checkCommonData(UDataMemory *udm, UErrorCode *err) {
}
}
+/*
+ * TODO: Add a udata_swapPackageHeader() function that swaps an ICU .dat package
+ * header but not its sub-items.
+ * This function will be needed for automatic runtime swapping.
+ * Sub-items should not be swapped to limit the swapping to the parts of the
+ * package that are actually used.
+ *
+ * Since lengths of items are implicit in the order and offsets of their
+ * ToC entries, and since offsets are relative to the start of the ToC,
+ * a swapped version may need to generate a different data structure
+ * with pointers to the original data items and with their lengths
+ * (-1 for the last one if it is not known), and maybe even pointers to the
+ * swapped versions of the items.
+ * These pointers to swapped versions would establish a cache;
+ * instead, each open data item could simply own the storage for its swapped
+ * data. This fits better with the current design.
+ *
+ * markus 2003sep18 Jitterbug 2235
+ */
diff --git a/icuSources/common/ucmndata.h b/icuSources/common/ucmndata.h
index 3ee38997..9634ed84 100644
--- a/icuSources/common/ucmndata.h
+++ b/icuSources/common/ucmndata.h
@@ -1,7 +1,7 @@
/*
******************************************************************************
*
-* Copyright (C) 1999-2001, International Business Machines
+* Copyright (C) 1999-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************/
@@ -43,6 +43,33 @@ typedef struct {
UDataInfo info;
} DataHeader;
+typedef struct {
+ uint32_t nameOffset;
+ uint32_t dataOffset;
+} UDataOffsetTOCEntry;
+
+typedef struct {
+ uint32_t count;
+ UDataOffsetTOCEntry entry[2]; /* Actual size of array is from count. */
+} UDataOffsetTOC;
+
+/**
+ * Get the header size from a const DataHeader *udh.
+ * Handles opposite-endian data.
+ *
+ * @internal
+ */
+U_CFUNC uint16_t
+udata_getHeaderSize(const DataHeader *udh);
+
+/**
+ * Get the UDataInfo.size from a const UDataInfo *info.
+ * Handles opposite-endian data.
+ *
+ * @internal
+ */
+U_CFUNC uint16_t
+udata_getInfoSize(const UDataInfo *info);
/*
* "Virtual" functions for data lookup.
@@ -54,6 +81,7 @@ typedef struct {
typedef const DataHeader *
(* LookupFn)(const UDataMemory *pData,
const char *tocEntryName,
+ int32_t *pLength,
UErrorCode *pErrorCode);
typedef uint32_t
diff --git a/icuSources/common/ucmp8.c b/icuSources/common/ucmp8.c
index a26f4dc1..f22c2e13 100644
--- a/icuSources/common/ucmp8.c
+++ b/icuSources/common/ucmp8.c
@@ -1,7 +1,7 @@
/*
********************************************************************
-* COPYRIGHT:
-* Copyright (c) 1997-2001, International Business Machines Corporation and
+* COPYRIGHT:
+* Copyright (c) 1997-2004, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************
*/
@@ -9,12 +9,6 @@
#include "ucmp8.h"
#include "cmemory.h"
-static int32_t findOverlappingPosition(CompactByteArray* this_obj,
- uint32_t start,
- const UChar *tempIndex,
- int32_t tempIndexCount,
- uint32_t cycle);
-
/* internal constants*/
@@ -27,18 +21,18 @@ ucmp8_getkBlockCount() { return UCMP8_kBlockCount;}
U_CAPI void U_EXPORT2
ucmp8_initBogus(CompactByteArray* array)
{
- CompactByteArray* this_obj = array;
-
- if (this_obj == NULL) return;
-
- this_obj->fStructSize = sizeof(CompactByteArray);
- this_obj->fArray = NULL;
- this_obj->fIndex = NULL;
- this_obj->fCount = UCMP8_kUnicodeCount;
- this_obj->fCompact = FALSE;
- this_obj->fBogus = TRUE;
- this_obj->fAlias = FALSE;
- this_obj->fIAmOwned = TRUE;
+ CompactByteArray* this_obj = array;
+
+ if (this_obj == NULL) return;
+
+ this_obj->fStructSize = sizeof(CompactByteArray);
+ this_obj->fArray = NULL;
+ this_obj->fIndex = NULL;
+ this_obj->fCount = UCMP8_kUnicodeCount;
+ this_obj->fCompact = FALSE;
+ this_obj->fBogus = TRUE;
+ this_obj->fAlias = FALSE;
+ this_obj->fIAmOwned = TRUE;
}
/* debug flags*/
@@ -66,42 +60,42 @@ ucmp8_init(CompactByteArray* array, int8_t defaultValue)
* to data position number 8, which has elements "bced". In the compressed
* version, index# 2 points to data position 1, which also has "bced"
*/
- CompactByteArray* this_obj = array;
- int32_t i;
-
- if (this_obj == NULL) return;
-
- this_obj->fStructSize = sizeof(CompactByteArray);
- this_obj->fArray = NULL;
- this_obj->fIndex = NULL;
- this_obj->fCount = UCMP8_kUnicodeCount;
- this_obj->fCompact = FALSE;
- this_obj->fBogus = FALSE;
- this_obj->fAlias = FALSE;
- this_obj->fIAmOwned = TRUE;
-
-
- this_obj->fArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
- if (!this_obj->fArray)
+ CompactByteArray* this_obj = array;
+ int32_t i;
+
+ if (this_obj == NULL) return;
+
+ this_obj->fStructSize = sizeof(CompactByteArray);
+ this_obj->fArray = NULL;
+ this_obj->fIndex = NULL;
+ this_obj->fCount = UCMP8_kUnicodeCount;
+ this_obj->fCompact = FALSE;
+ this_obj->fBogus = FALSE;
+ this_obj->fAlias = FALSE;
+ this_obj->fIAmOwned = TRUE;
+
+
+ this_obj->fArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
+ if (!this_obj->fArray)
{
- this_obj->fBogus = TRUE;
- return;
+ this_obj->fBogus = TRUE;
+ return;
}
- this_obj->fIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * UCMP8_kIndexCount);
- if (!this_obj->fIndex)
+ this_obj->fIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * UCMP8_kIndexCount);
+ if (!this_obj->fIndex)
{
- uprv_free(this_obj->fArray);
- this_obj->fArray = NULL;
- this_obj->fBogus = TRUE;
- return;
+ uprv_free(this_obj->fArray);
+ this_obj->fArray = NULL;
+ this_obj->fBogus = TRUE;
+ return;
}
- for (i = 0; i < UCMP8_kUnicodeCount; ++i)
+ for (i = 0; i < UCMP8_kUnicodeCount; ++i)
{
- this_obj->fArray[i] = defaultValue;
+ this_obj->fArray[i] = defaultValue;
}
- for (i = 0; i < UCMP8_kIndexCount; ++i)
+ for (i = 0; i < UCMP8_kIndexCount; ++i)
{
- this_obj->fIndex[i] = (uint16_t)(i << UCMP8_kBlockShift);
+ this_obj->fIndex[i] = (uint16_t)(i << UCMP8_kBlockShift);
}
}
@@ -128,46 +122,45 @@ ucmp8_open(int8_t defaultValue)
* to data position number 8, which has elements "bced". In the compressed
* version, index# 2 points to data position 1, which also has "bced"
*/
- CompactByteArray* this_obj = (CompactByteArray*) uprv_malloc(sizeof(CompactByteArray));
- int32_t i;
-
- if (this_obj == NULL) return NULL;
+ CompactByteArray* this_obj = (CompactByteArray*) uprv_malloc(sizeof(CompactByteArray));
+ int32_t i;
- this_obj->fStructSize = sizeof(CompactByteArray);
- this_obj->fArray = NULL;
- this_obj->fIndex = NULL;
- this_obj->fCount = UCMP8_kUnicodeCount;
- this_obj->fCompact = FALSE;
- this_obj->fBogus = FALSE;
- this_obj->fAlias = FALSE;
- this_obj->fIAmOwned = FALSE;
+ if (this_obj == NULL) return NULL;
+ this_obj->fStructSize = sizeof(CompactByteArray);
+ this_obj->fArray = NULL;
+ this_obj->fIndex = NULL;
+ this_obj->fCount = UCMP8_kUnicodeCount;
+ this_obj->fCompact = FALSE;
+ this_obj->fBogus = FALSE;
+ this_obj->fAlias = FALSE;
+ this_obj->fIAmOwned = FALSE;
- this_obj->fArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
- if (!this_obj->fArray)
+ this_obj->fArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
+ if (!this_obj->fArray)
{
- this_obj->fBogus = TRUE;
- return NULL;
+ this_obj->fBogus = TRUE;
+ return NULL;
}
- this_obj->fIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * UCMP8_kIndexCount);
- if (!this_obj->fIndex)
+ this_obj->fIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * UCMP8_kIndexCount);
+ if (!this_obj->fIndex)
{
- uprv_free(this_obj->fArray);
- this_obj->fArray = NULL;
- this_obj->fBogus = TRUE;
- return NULL;
+ uprv_free(this_obj->fArray);
+ this_obj->fArray = NULL;
+ this_obj->fBogus = TRUE;
+ return NULL;
}
- for (i = 0; i < UCMP8_kUnicodeCount; ++i)
+ for (i = 0; i < UCMP8_kUnicodeCount; ++i)
{
- this_obj->fArray[i] = defaultValue;
+ this_obj->fArray[i] = defaultValue;
}
- for (i = 0; i < UCMP8_kIndexCount; ++i)
+ for (i = 0; i < UCMP8_kIndexCount; ++i)
{
- this_obj->fIndex[i] = (uint16_t)(i << UCMP8_kBlockShift);
+ this_obj->fIndex[i] = (uint16_t)(i << UCMP8_kBlockShift);
}
- return this_obj;
+ return this_obj;
}
U_CAPI CompactByteArray* U_EXPORT2
@@ -206,19 +199,19 @@ ucmp8_initAdopt(CompactByteArray *this_obj,
int8_t *newValues,
int32_t count)
{
- if (this_obj) {
- this_obj->fCount = count;
- this_obj->fBogus = FALSE;
- this_obj->fStructSize = sizeof(CompactByteArray);
-
- this_obj->fArray = newValues;
- this_obj->fIndex = indexArray;
- this_obj->fCompact = (UBool)((count < UCMP8_kUnicodeCount) ? TRUE : FALSE);
- this_obj->fAlias = FALSE;
- this_obj->fIAmOwned = TRUE;
- }
+ if (this_obj) {
+ this_obj->fCount = count;
+ this_obj->fBogus = FALSE;
+ this_obj->fStructSize = sizeof(CompactByteArray);
+
+ this_obj->fArray = newValues;
+ this_obj->fIndex = indexArray;
+ this_obj->fCompact = (UBool)((count < UCMP8_kUnicodeCount) ? TRUE : FALSE);
+ this_obj->fAlias = FALSE;
+ this_obj->fIAmOwned = TRUE;
+ }
- return this_obj;
+ return this_obj;
}
U_CAPI CompactByteArray* U_EXPORT2
@@ -227,78 +220,78 @@ ucmp8_initAlias(CompactByteArray *this_obj,
int8_t *newValues,
int32_t count)
{
- if (this_obj) {
- this_obj->fArray = NULL;
- this_obj->fIndex = NULL;
- this_obj->fCount = count;
- this_obj->fBogus = FALSE;
- this_obj->fStructSize = sizeof(CompactByteArray);
-
- this_obj->fArray = newValues;
- this_obj->fIndex = indexArray;
- this_obj->fCompact = (UBool)((count < UCMP8_kUnicodeCount) ? TRUE : FALSE);
- this_obj->fAlias = TRUE;
- this_obj->fIAmOwned = TRUE;
- }
+ if (this_obj) {
+ this_obj->fArray = NULL;
+ this_obj->fIndex = NULL;
+ this_obj->fCount = count;
+ this_obj->fBogus = FALSE;
+ this_obj->fStructSize = sizeof(CompactByteArray);
+
+ this_obj->fArray = newValues;
+ this_obj->fIndex = indexArray;
+ this_obj->fCompact = (UBool)((count < UCMP8_kUnicodeCount) ? TRUE : FALSE);
+ this_obj->fAlias = TRUE;
+ this_obj->fIAmOwned = TRUE;
+ }
- return this_obj;
+ return this_obj;
}
/*=======================================================*/
U_CAPI void U_EXPORT2
-ucmp8_close(CompactByteArray* this_obj)
+ucmp8_close(CompactByteArray* this_obj)
{
- if(this_obj != NULL) {
- if(!this_obj->fAlias) {
- if(this_obj->fArray != NULL) {
- uprv_free(this_obj->fArray);
- }
- if(this_obj->fIndex != NULL) {
- uprv_free(this_obj->fIndex);
- }
+ if(this_obj != NULL) {
+ if(!this_obj->fAlias) {
+ if(this_obj->fArray != NULL) {
+ uprv_free(this_obj->fArray);
+ }
+ if(this_obj->fIndex != NULL) {
+ uprv_free(this_obj->fIndex);
+ }
+ }
+ if(!this_obj->fIAmOwned) /* Called if 'init' was called instead of 'open'. */
+ {
+ uprv_free(this_obj);
+ }
}
- if(!this_obj->fIAmOwned) /* Called if 'init' was called instead of 'open'. */
- {
- uprv_free(this_obj);
- }
- }
}
/*=======================================================*/
-
+
U_CAPI void U_EXPORT2
-ucmp8_expand(CompactByteArray* this_obj)
+ucmp8_expand(CompactByteArray* this_obj)
{
- /* can optimize later.
- * if we have to expand, then walk through the blocks instead of using Get
- * this code unpacks the array by copying the blocks to the normalized position.
- * Example: Compressed
- * INDEX# 0 1 2 3 4
- * INDEX 0 4 1 8 2 ...
- * ARRAY abcdeabazyabc...
- * turns into
- * Example: Expanded
- * INDEX# 0 1 2 3 4
- * INDEX 0 4 8 12 16 ...
- * ARRAY abcdeababcedzyabcdea...
- */
+ /* can optimize later.
+ * if we have to expand, then walk through the blocks instead of using Get
+ * this code unpacks the array by copying the blocks to the normalized position.
+ * Example: Compressed
+ * INDEX# 0 1 2 3 4
+ * INDEX 0 4 1 8 2 ...
+ * ARRAY abcdeabazyabc...
+ * turns into
+ * Example: Expanded
+ * INDEX# 0 1 2 3 4
+ * INDEX 0 4 8 12 16 ...
+ * ARRAY abcdeababcedzyabcdea...
+ */
int32_t i;
- if (this_obj->fCompact)
+ if (this_obj->fCompact)
{
int8_t* tempArray;
tempArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
- if (!tempArray)
+ if (!tempArray)
{
this_obj->fBogus = TRUE;
return;
}
- for (i = 0; i < UCMP8_kUnicodeCount; ++i)
+ for (i = 0; i < UCMP8_kUnicodeCount; ++i)
{
tempArray[i] = ucmp8_get(this_obj,(UChar)i); /* HSYS : How expand?*/
}
- for (i = 0; i < UCMP8_kIndexCount; ++i)
+ for (i = 0; i < UCMP8_kIndexCount; ++i)
{
this_obj->fIndex[i] = (uint16_t)(i<< UCMP8_kBlockShift);
}
@@ -309,7 +302,7 @@ ucmp8_expand(CompactByteArray* this_obj)
}
}
-
+
/*=======================================================*/
/* this_obj->fArray: an array to be overlapped
@@ -319,60 +312,62 @@ ucmp8_expand(CompactByteArray* this_obj)
* inputHash[i] = XOR of values from i-count+1 to i
*/
static int32_t
-findOverlappingPosition(CompactByteArray* this_obj,
+findOverlappingPosition(CompactByteArray* this_obj,
uint32_t start,
const UChar* tempIndex,
int32_t tempIndexCount,
- uint32_t cycle)
+ uint32_t cycle)
{
- /* this_obj is a utility routine for finding blocks that overlap.
- * IMPORTANT: the cycle number is very important. Small cycles take a lot
- * longer to work. In some cases, they may be able to get better compaction.
- */
-
- int32_t i;
- int32_t j;
- int32_t currentCount;
-
- for (i = 0; i < tempIndexCount; i += cycle)
- {
- currentCount = UCMP8_kBlockCount;
- if (i + UCMP8_kBlockCount > tempIndexCount)
- {
- currentCount = tempIndexCount - i;
- }
- for (j = 0; j < currentCount; ++j)
+ /* this is a utility routine for finding blocks that overlap.
+ * IMPORTANT: the cycle number is very important. Small cycles take a lot
+ * longer to work. In some cases, they may be able to get better compaction.
+ */
+
+ int32_t i;
+ int32_t j;
+ int32_t currentCount;
+
+ for (i = 0; i < tempIndexCount; i += cycle)
{
- if (this_obj->fArray[start + j] != this_obj->fArray[tempIndex[i + j]]) break;
+ currentCount = UCMP8_kBlockCount;
+ if (i + UCMP8_kBlockCount > tempIndexCount)
+ {
+ currentCount = tempIndexCount - i;
+ }
+ for (j = 0; j < currentCount; ++j)
+ {
+ if (this_obj->fArray[start + j] != this_obj->fArray[tempIndex[i + j]])
+ break;
}
- if (j == currentCount) break;
+ if (j == currentCount)
+ break;
}
-
- return i;
+
+ return i;
}
U_CAPI UBool U_EXPORT2
ucmp8_isBogus(const CompactByteArray* this_obj)
{
- return (UBool)(this_obj == NULL || this_obj->fBogus);
+ return (UBool)(this_obj == NULL || this_obj->fBogus);
}
U_CAPI const int8_t* U_EXPORT2
ucmp8_getArray(const CompactByteArray* this_obj)
{
- return this_obj->fArray;
+ return this_obj->fArray;
}
U_CAPI const uint16_t* U_EXPORT2
ucmp8_getIndex(const CompactByteArray* this_obj)
{
- return this_obj->fIndex;
+ return this_obj->fIndex;
}
U_CAPI int32_t U_EXPORT2
ucmp8_getCount(const CompactByteArray* this_obj)
{
- return this_obj->fCount;
+ return this_obj->fCount;
}
@@ -381,12 +376,12 @@ ucmp8_set(CompactByteArray* this_obj,
UChar c,
int8_t value)
{
- if (this_obj->fCompact == TRUE)
+ if (this_obj->fCompact == TRUE)
{
- ucmp8_expand(this_obj);
- if (this_obj->fBogus) return;
+ ucmp8_expand(this_obj);
+ if (this_obj->fBogus) return;
}
- this_obj->fArray[(int32_t)c] = value;
+ this_obj->fArray[(int32_t)c] = value;
}
@@ -396,176 +391,182 @@ ucmp8_setRange(CompactByteArray* this_obj,
UChar end,
int8_t value)
{
- int32_t i;
- if (this_obj->fCompact == TRUE)
+ int32_t i;
+ if (this_obj->fCompact == TRUE)
{
- ucmp8_expand(this_obj);
- if (this_obj->fBogus) return;
+ ucmp8_expand(this_obj);
+ if (this_obj->fBogus)
+ return;
}
- for (i = start; i <= end; ++i)
+ for (i = start; i <= end; ++i)
{
- this_obj->fArray[i] = value;
+ this_obj->fArray[i] = value;
}
}
/*=======================================================*/
-
+
U_CAPI void U_EXPORT2
ucmp8_compact(CompactByteArray* this_obj,
- uint32_t cycle)
+ uint32_t cycle)
{
- if (!this_obj->fCompact)
+ if (!this_obj->fCompact)
{
- /* this_obj actually does the compaction.
- * it walks throught the contents of the expanded array, finding the
- * first block in the data that matches the contents of the current index.
- * As it works, it keeps an updated pointer to the last position,
- * so that it knows how big to make the final array
- * If the matching succeeds, then the index will point into the data
- * at some earlier position.
- * If the matching fails, then last position pointer will be bumped,
- * and the index will point to that last block of data.
- */
- UChar* tempIndex;
- int32_t tempIndexCount;
- int8_t* tempArray;
- int32_t iBlock, iIndex;
-
- /* fix cycle, must be 0 < cycle <= blockcount*/
- if (cycle < 0) cycle = 1;
- else if (cycle > (uint32_t)UCMP8_kBlockCount) cycle = UCMP8_kBlockCount;
-
- /* make temp storage, larger than we need*/
- tempIndex = (UChar*) uprv_malloc(sizeof(UChar)* UCMP8_kUnicodeCount);
- if (!tempIndex)
- {
- this_obj->fBogus = TRUE;
- return;
- }
- /* set up first block.*/
- tempIndexCount = UCMP8_kBlockCount;
- for (iIndex = 0; iIndex < UCMP8_kBlockCount; ++iIndex)
- {
- tempIndex[iIndex] = (uint16_t)iIndex;
- }; /* endfor (iIndex = 0; .....)*/
- this_obj->fIndex[0] = 0;
-
- /* for each successive block, find out its first position in the compacted array*/
- for (iBlock = 1; iBlock < UCMP8_kIndexCount; ++iBlock)
- {
- int32_t newCount, firstPosition, block;
- block = iBlock << UCMP8_kBlockShift;
- /* if (debugSmall) if (block > debugSmallLimit) break;*/
- firstPosition = findOverlappingPosition(this_obj,
- block,
- tempIndex,
- tempIndexCount,
- cycle);
-
- /* if not contained in the current list, copy the remainder
- * invariant; cumulativeHash[iBlock] = XOR of values from iBlock-kBlockCount+1 to iBlock
- * we do this_obj by XORing out cumulativeHash[iBlock-kBlockCount]
- */
- newCount = firstPosition + UCMP8_kBlockCount;
- if (newCount > tempIndexCount)
+ /* this actually does the compaction.
+ * it walks through the contents of the expanded array, finding the
+ * first block in the data that matches the contents of the current index.
+ * As it works, it keeps an updated pointer to the last position,
+ * so that it knows how big to make the final array
+ * If the matching succeeds, then the index will point into the data
+ * at some earlier position.
+ * If the matching fails, then last position pointer will be bumped,
+ * and the index will point to that last block of data.
+ */
+ UChar* tempIndex;
+ int32_t tempIndexCount;
+ int8_t* tempArray;
+ int32_t iBlock, iIndex;
+
+ /* fix cycle, must be 0 < cycle <= blockcount*/
+ if (cycle <= 0)
+ cycle = 1;
+ else if (cycle > (uint32_t)UCMP8_kBlockCount)
+ cycle = UCMP8_kBlockCount;
+
+ /* make temp storage, larger than we need*/
+ tempIndex = (UChar*) uprv_malloc(sizeof(UChar)* UCMP8_kUnicodeCount);
+ if (!tempIndex)
{
- for (iIndex = tempIndexCount; iIndex < newCount; ++iIndex)
+ this_obj->fBogus = TRUE;
+ return;
+ }
+ /* set up first block.*/
+ tempIndexCount = UCMP8_kBlockCount;
+ for (iIndex = 0; iIndex < UCMP8_kBlockCount; ++iIndex)
{
- tempIndex[iIndex] = (uint16_t)(iIndex - firstPosition + block);
- } /* endfor (iIndex = tempIndexCount....)*/
+ tempIndex[iIndex] = (uint16_t)iIndex;
+ } /* endfor (iIndex = 0; .....)*/
+ this_obj->fIndex[0] = 0;
+
+ /* for each successive block, find out its first position in the compacted array*/
+ for (iBlock = 1; iBlock < UCMP8_kIndexCount; ++iBlock)
+ {
+ int32_t newCount, firstPosition, block;
+ block = iBlock << UCMP8_kBlockShift;
+ /* if (debugSmall) if (block > debugSmallLimit) break;*/
+ firstPosition = findOverlappingPosition(this_obj,
+ block,
+ tempIndex,
+ tempIndexCount,
+ cycle);
+
+ /* if not contained in the current list, copy the remainder
+ * invariant; cumulativeHash[iBlock] = XOR of values from iBlock-kBlockCount+1 to iBlock
+ * we do this by XORing out cumulativeHash[iBlock-kBlockCount]
+ */
+ newCount = firstPosition + UCMP8_kBlockCount;
+ if (newCount > tempIndexCount)
+ {
+ for (iIndex = tempIndexCount; iIndex < newCount; ++iIndex)
+ {
+ tempIndex[iIndex] = (uint16_t)(iIndex - firstPosition + block);
+ } /* endfor (iIndex = tempIndexCount....)*/
tempIndexCount = newCount;
} /* endif (newCount > tempIndexCount)*/
- this_obj->fIndex[iBlock] = (uint16_t)firstPosition;
+ this_obj->fIndex[iBlock] = (uint16_t)firstPosition;
} /* endfor (iBlock = 1.....)*/
-
- /* now allocate and copy the items into the array*/
- tempArray = (int8_t*) uprv_malloc(tempIndexCount * sizeof(int8_t));
- if (!tempArray)
- {
- this_obj->fBogus = TRUE;
- uprv_free(tempIndex);
- return;
+
+ /* now allocate and copy the items into the array*/
+ tempArray = (int8_t*) uprv_malloc(tempIndexCount * sizeof(int8_t));
+ if (!tempArray)
+ {
+ this_obj->fBogus = TRUE;
+ uprv_free(tempIndex);
+ return;
}
- for (iIndex = 0; iIndex < tempIndexCount; ++iIndex)
- {
- tempArray[iIndex] = this_obj->fArray[tempIndex[iIndex]];
+ for (iIndex = 0; iIndex < tempIndexCount; ++iIndex)
+ {
+ tempArray[iIndex] = this_obj->fArray[tempIndex[iIndex]];
}
- uprv_free(this_obj->fArray);
- this_obj->fArray = tempArray;
- this_obj->fCount = tempIndexCount;
-
-
- /* free up temp storage*/
- uprv_free(tempIndex);
- this_obj->fCompact = TRUE;
+ uprv_free(this_obj->fArray);
+ this_obj->fArray = tempArray;
+ this_obj->fCount = tempIndexCount;
+
+
+ /* free up temp storage*/
+ uprv_free(tempIndex);
+ this_obj->fCompact = TRUE;
} /* endif (!this_obj->fCompact)*/
}
-U_CAPI uint32_t U_EXPORT2 ucmp8_flattenMem (const CompactByteArray* array, UMemoryStream *MS)
+#define MEMORY_WRITE(destAddr, source, sizeSoFar, len) \
+ if (destAddr) {\
+ uprv_memcpy(destAddr+sizeSoFar, source, len);\
+ }\
+ sizeSoFar += (len)
+
+U_CAPI uint32_t U_EXPORT2 ucmp8_flattenMem (const CompactByteArray* array, uint8_t *MS)
{
- int32_t size = 0;
-
- uprv_mstrm_write32(MS, ICU_UCMP8_VERSION);
- size += 4;
-
- uprv_mstrm_write32(MS, array->fCount);
- size += 4;
-
- uprv_mstrm_writeBlock(MS, array->fIndex, sizeof(array->fIndex[0])*UCMP8_kIndexCount);
- size += sizeof(array->fIndex[0])*UCMP8_kIndexCount;
-
- uprv_mstrm_writeBlock(MS, array->fArray, sizeof(array->fArray[0])*array->fCount);
- size += sizeof(array->fArray[0])*array->fCount;
-
- while(size%4) /* end padding */
- {
- uprv_mstrm_writePadding(MS, 1); /* Pad total so far to even size */
- size += 1;
- }
-
- return size;
+ int32_t size = 0;
+ static const int32_t version = ICU_UCMP8_VERSION;
+
+ MEMORY_WRITE(MS, &version, size, 4);
+
+ MEMORY_WRITE(MS, &array->fCount, size, 4);
+
+ MEMORY_WRITE(MS, array->fIndex, size, sizeof(array->fIndex[0])*UCMP8_kIndexCount);
+
+ MEMORY_WRITE(MS, array->fArray, size, sizeof(array->fArray[0])*array->fCount);
+
+ while(size%4) /* end padding */
+ {
+ uint8_t pad = 0;
+ MEMORY_WRITE(MS, &pad, size, 1);
+ }
+
+ return size;
}
/* We use sizeof(*array), etc so that this code can be as portable as
- possible between the ucmpX_ family.
+ possible between the ucmpX_ family.
*/
U_CAPI void U_EXPORT2 ucmp8_initFromData(CompactByteArray *this_obj, const uint8_t **source, UErrorCode *status)
{
- uint32_t i;
- const uint8_t *oldSource = *source;
-
- if(U_FAILURE(*status))
- return;
-
- this_obj->fArray = NULL;
- this_obj->fIndex = NULL;
- this_obj->fBogus = FALSE;
- this_obj->fStructSize = sizeof(CompactByteArray);
- this_obj->fCompact = TRUE;
- this_obj->fAlias = TRUE;
- this_obj->fIAmOwned = TRUE;
-
- i = * ((const uint32_t*) *source);
- (*source) += 4;
-
- if(i != ICU_UCMP8_VERSION)
- {
- *status = U_INVALID_FORMAT_ERROR;
- return;
- }
-
- this_obj->fCount = * ((const uint32_t*)*source);
- (*source) += 4;
-
- this_obj->fIndex = (uint16_t*) *source;
- (*source) += sizeof(this_obj->fIndex[0])*UCMP8_kIndexCount;
-
- this_obj->fArray = (int8_t*) *source;
- (*source) += sizeof(this_obj->fArray[0])*this_obj->fCount;
-
- /* eat up padding */
- while((*source-(oldSource))%4)
- (*source)++;
+ uint32_t i;
+ const uint8_t *oldSource = *source;
+
+ if(U_FAILURE(*status))
+ return;
+
+ this_obj->fArray = NULL;
+ this_obj->fIndex = NULL;
+ this_obj->fBogus = FALSE;
+ this_obj->fStructSize = sizeof(CompactByteArray);
+ this_obj->fCompact = TRUE;
+ this_obj->fAlias = TRUE;
+ this_obj->fIAmOwned = TRUE;
+
+ i = * ((const uint32_t*) *source);
+ (*source) += 4;
+
+ if(i != ICU_UCMP8_VERSION)
+ {
+ *status = U_INVALID_FORMAT_ERROR;
+ return;
+ }
+
+ this_obj->fCount = * ((const uint32_t*)*source);
+ (*source) += 4;
+
+ this_obj->fIndex = (uint16_t*) *source;
+ (*source) += sizeof(this_obj->fIndex[0])*UCMP8_kIndexCount;
+
+ this_obj->fArray = (int8_t*) *source;
+ (*source) += sizeof(this_obj->fArray[0])*this_obj->fCount;
+
+ /* eat up padding */
+ while((*source-(oldSource))%4)
+ (*source)++;
}
diff --git a/icuSources/common/ucmp8.h b/icuSources/common/ucmp8.h
index 6d46c321..b4951825 100644
--- a/icuSources/common/ucmp8.h
+++ b/icuSources/common/ucmp8.h
@@ -1,7 +1,7 @@
/*
********************************************************************
* COPYRIGHT:
- * Copyright (c) 1996-2001, International Business Machines Corporation and
+ * Copyright (c) 1996-2004, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************
*/
@@ -16,7 +16,6 @@
*/
#define ICU_UCMP8_VERSION 0x01260000
-#include "umemstrm.h"
#include "unicode/utypes.h"
/*====================================
@@ -227,12 +226,17 @@ U_CAPI const uint16_t* U_EXPORT2 ucmp8_getIndex(const CompactByteArray* array);
U_CAPI void U_EXPORT2 ucmp8_compact(CompactByteArray* array,
uint32_t cycle);
-/** Expanded takes the array back to a 65536 element array*/
-/* @param array The CompactByteArray to be expanded*/
+/** Expanded takes the array back to a 65536 element array
+ * @param array The CompactByteArray to be expanded
+ */
U_CAPI void U_EXPORT2 ucmp8_expand(CompactByteArray* array);
-/** (more) INTERNAL USE ONLY **/
-U_CAPI uint32_t U_EXPORT2 ucmp8_flattenMem (const CompactByteArray* array, UMemoryStream *MS);
+/**
+ * Flatten into a memory structure. Pass in NULL to pre-flight to get the required size.
+ * @internal
+ */
+U_CAPI uint32_t U_EXPORT2 ucmp8_flattenMem(const CompactByteArray* array, uint8_t *MS);
+
/* initializes an existing CBA from memory. Will cause ucmp8_close() to not deallocate anything. */
U_CAPI void U_EXPORT2 ucmp8_initFromData(CompactByteArray* array, const uint8_t **source, UErrorCode *status);
diff --git a/icuSources/common/ucnv.c b/icuSources/common/ucnv.c
index 3cf75768..635c78fa 100644
--- a/icuSources/common/ucnv.c
+++ b/icuSources/common/ucnv.c
@@ -1,7 +1,7 @@
/*
******************************************************************************
*
-* Copyright (C) 1998-2003, International Business Machines
+* Copyright (C) 1998-2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@@ -20,78 +20,24 @@
*/
#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
#include "unicode/ustring.h"
-#include "unicode/ures.h"
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
#include "unicode/uset.h"
+#include "putilimp.h"
#include "cmemory.h"
#include "cstring.h"
+#include "uassert.h"
+#include "utracimp.h"
#include "ustr_imp.h"
#include "ucnv_imp.h"
#include "ucnv_io.h"
#include "ucnv_cnv.h"
#include "ucnv_bld.h"
-#if 0
-/* debugging for converters */
-# include
-void UCNV_DEBUG_LOG(const char *what, const char *who, const void *p, int l)
-{
- static FILE *f = NULL;
- if(f==NULL)
- {
- /* stderr, or open another file */
- f = stderr;
- /* f = fopen("c:\\UCNV_DEBUG_LOG.txt", "w"); */
- }
- if (!what) {
- what = "(null)";
- }
- if (!who) {
- who = "(null)";
- }
- if (!p) {
- p = "(null)";
- }
-
- fprintf(f, "%p\t:%d\t%-20s\t%-10s\n",
- p, l, who, what);
-
- fflush(f);
-}
-
-
-/* dump the contents of a converter */
-static void UCNV_DEBUG_CNV(const UConverter *c, int line)
-{
- UErrorCode err = U_ZERO_ERROR;
- fprintf(stderr, "%p\t:%d\t", c, line);
- if(c!=NULL) {
- const char *name = ucnv_getName(c, &err);
- if (!name) {
- name = "(null)";
- }
- fprintf(stderr, "%s\t", name);
-
- fprintf(stderr, "shr=%p, ref=%x\n",
- c->sharedData,
- c->sharedData->referenceCounter);
- } else {
- fprintf(stderr, "DEMISED\n");
- }
-}
-
-# define UCNV_DEBUG 1
-# define UCNV_DEBUG_LOG(x,y,z) UCNV_DEBUG_LOG(x,y,z,__LINE__)
-# define UCNV_DEBUG_CNV(c) UCNV_DEBUG_CNV(c, __LINE__)
-#else
-# define UCNV_DEBUG_LOG(x,y,z)
-# define UCNV_DEBUG_CNV(c)
-#endif
-
-
-
/* size of intermediate and preflighting buffers in ucnv_convert() */
#define CHUNK_SIZE 1024
@@ -103,6 +49,7 @@ typedef struct UAmbiguousConverter {
static const UAmbiguousConverter ambiguousConverters[]={
{ "ibm-942_P120-1999", 0xa5 },
{ "ibm-943_P130-1999", 0xa5 },
+ { "ibm-897_P100-1995", 0xa5 },
{ "ibm-33722_P120-1999", 0xa5 },
{ "ibm-949_P110-1999", 0x20a9 },
{ "ibm-1363_P110-1997", 0x20a9 },
@@ -128,13 +75,10 @@ ucnv_open (const char *name,
UConverter *r;
if (err == NULL || U_FAILURE (*err)) {
- UCNV_DEBUG_LOG("open", name, NULL);
return NULL;
}
r = ucnv_createConverter(NULL, name, err);
- UCNV_DEBUG_LOG("open", name, r);
- UCNV_DEBUG_CNV(r);
return r;
}
@@ -215,34 +159,26 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U
NULL
};
+ UTRACE_ENTRY_OC(UTRACE_UCNV_CLONE);
+
if (status == NULL || U_FAILURE(*status)){
+ UTRACE_EXIT_STATUS(status? *status: U_ILLEGAL_ARGUMENT_ERROR);
return 0;
}
if (!pBufferSize || !cnv){
- *status = U_ILLEGAL_ARGUMENT_ERROR;
+ *status = U_ILLEGAL_ARGUMENT_ERROR;
+ UTRACE_EXIT_STATUS(*status);
return 0;
}
- UCNV_DEBUG_LOG("cloning FROM", ucnv_getName(cnv,status), cnv);
- UCNV_DEBUG_LOG("cloning WITH", "memory", stackBuffer);
- UCNV_DEBUG_CNV(cnv);
+ UTRACE_DATA3(UTRACE_OPEN_CLOSE, "clone converter %s at %p into stackBuffer %p",
+ ucnv_getName(cnv, status), cnv, stackBuffer);
- /* Pointers on 64-bit platforms need to be aligned
- * on a 64-bit boundry in memory.
- */
- if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
- int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
- *pBufferSize -= offsetUp;
- stackBufferChars += offsetUp;
- }
-
- stackBuffer = (void *)stackBufferChars;
-
if (cnv->sharedData->impl->safeClone != NULL) {
/* call the custom safeClone function for sizing */
bufferSizeNeeded = 0;
- cnv->sharedData->impl->safeClone(cnv, stackBuffer, &bufferSizeNeeded, status);
+ cnv->sharedData->impl->safeClone(cnv, NULL, &bufferSizeNeeded, status);
}
else
{
@@ -252,10 +188,27 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U
if (*pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
*pBufferSize = bufferSizeNeeded;
+ UTRACE_EXIT_VALUE(bufferSizeNeeded);
return 0;
}
+ /* Pointers on 64-bit platforms need to be aligned
+ * on a 64-bit boundary in memory.
+ */
+ if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
+ int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
+ if(*pBufferSize > offsetUp) {
+ *pBufferSize -= offsetUp;
+ stackBufferChars += offsetUp;
+ } else {
+ /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
+ *pBufferSize = 1;
+ }
+ }
+
+ stackBuffer = (void *)stackBufferChars;
+
/* Now, see if we must allocate any memory */
if (*pBufferSize < bufferSizeNeeded || stackBuffer == NULL)
{
@@ -264,6 +217,7 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U
if(localConverter == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
+ UTRACE_EXIT_STATUS(*status);
return NULL;
}
@@ -279,6 +233,8 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U
allocatedConverter = NULL;
}
+ uprv_memset(localConverter, 0, bufferSizeNeeded);
+
/* Copy initial state */
uprv_memcpy(localConverter, cnv, sizeof(UConverter));
localConverter->isCopyLocal = localConverter->isExtraLocal = FALSE;
@@ -291,6 +247,7 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U
if(localConverter==NULL || U_FAILURE(*status)) {
uprv_free(allocatedConverter);
+ UTRACE_EXIT_STATUS(*status);
return NULL;
}
@@ -307,21 +264,8 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U
if(localConverter == (UConverter*)stackBuffer) {
/* we're using user provided data - set to not destroy */
localConverter->isCopyLocal = TRUE;
-#ifdef UCNV_DEBUG
- fprintf(stderr, "%p\t:%d\t\t==stackbuffer %p, isCopyLocal TRUE\n",
- localConverter, __LINE__, stackBuffer);
-#endif
-
- } else {
-#ifdef UCNV_DEBUG
- fprintf(stderr, "%p\t:%d\t\t!=stackbuffer %p, isCopyLocal left at %s\n",
- localConverter, __LINE__, stackBuffer,
- localConverter->isCopyLocal?"TRUE":"FALSE");
-#endif
}
- localConverter->isExtraLocal = localConverter->isCopyLocal;
-
/* allow callback functions to handle any memory allocation */
toUArgs.converter = fromUArgs.converter = localConverter;
cbErr = U_ZERO_ERROR;
@@ -329,11 +273,7 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U
cbErr = U_ZERO_ERROR;
cnv->fromUCharErrorBehaviour(cnv->fromUContext, &fromUArgs, NULL, 0, 0, UCNV_CLONE, &cbErr);
- UCNV_DEBUG_LOG("cloning TO", ucnv_getName(localConverter,status), localConverter);
- UCNV_DEBUG_CNV(localConverter);
- UCNV_DEBUG_CNV(cnv);
-
-
+ UTRACE_EXIT_PTR_STATUS(localConverter, *status);
return localConverter;
}
@@ -368,13 +308,16 @@ ucnv_close (UConverter * converter)
};
UErrorCode errorCode = U_ZERO_ERROR;
+ UTRACE_ENTRY_OC(UTRACE_UCNV_CLOSE);
+
if (converter == NULL)
{
+ UTRACE_EXIT();
return;
}
- UCNV_DEBUG_LOG("close", ucnv_getName(converter, &errorCode), converter);
- UCNV_DEBUG_CNV(converter);
+ UTRACE_DATA3(UTRACE_OPEN_CLOSE, "close converter %s at %p, isCopyLocal=%b",
+ ucnv_getName(converter, &errorCode), converter, converter->isCopyLocal);
toUArgs.converter = fromUArgs.converter = converter;
@@ -382,25 +325,10 @@ ucnv_close (UConverter * converter)
errorCode = U_ZERO_ERROR;
converter->fromUCharErrorBehaviour(converter->fromUContext, &fromUArgs, NULL, 0, 0, UCNV_CLOSE, &errorCode);
- UCNV_DEBUG_CNV(converter);
-
if (converter->sharedData->impl->close != NULL) {
converter->sharedData->impl->close(converter);
}
-#ifdef UCNV_DEBUG
- {
- char c[4];
- c[0]='0'+converter->sharedData->referenceCounter;
- c[1]=0;
- UCNV_DEBUG_LOG("close--", c, converter);
- if((converter->sharedData->referenceCounter == 0)&&(converter->sharedData->sharedDataCached == FALSE)) {
- UCNV_DEBUG_CNV(converter);
- UCNV_DEBUG_LOG("close:delDead", "??", converter);
- }
- }
-#endif
-
/*
Checking whether it's an algorithic converter is okay
in multithreaded applications because the value never changes.
@@ -411,10 +339,10 @@ ucnv_close (UConverter * converter)
}
if(!converter->isCopyLocal){
- UCNV_DEBUG_LOG("close:free", "", converter);
uprv_free (converter);
}
- return;
+
+ UTRACE_EXIT();
}
/*returns a single Name from the list, will return NULL if out of bounds
@@ -516,130 +444,94 @@ ucnv_setSubstChars (UConverter * converter,
return;
}
-U_CAPI int32_t U_EXPORT2
-ucnv_getDisplayName(const UConverter *cnv,
- const char *displayLocale,
- UChar *displayName, int32_t displayNameCapacity,
- UErrorCode *pErrorCode) {
- UResourceBundle *rb;
- const UChar *name;
- int32_t length;
-
- /* check arguments */
- if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
- return 0;
- }
-
- if(cnv==NULL || displayNameCapacity<0 || (displayNameCapacity>0 && displayName==NULL)) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
-
- /* open the resource bundle and get the display name string */
- rb=ures_open(NULL, displayLocale, pErrorCode);
- if(U_FAILURE(*pErrorCode)) {
- return 0;
- }
-
- /* use the internal name as the key */
- name=ures_getStringByKey(rb, cnv->sharedData->staticData->name, &length, pErrorCode);
- ures_close(rb);
-
- if(U_SUCCESS(*pErrorCode)) {
- /* copy the string */
- u_memcpy(displayName, name, uprv_min(length, displayNameCapacity)*U_SIZEOF_UCHAR);
- } else {
- /* convert the internal name into a Unicode string */
- *pErrorCode=U_ZERO_ERROR;
- length=uprv_strlen(cnv->sharedData->staticData->name);
- u_charsToUChars(cnv->sharedData->staticData->name, displayName, uprv_min(length, displayNameCapacity));
- }
- return u_terminateUChars(displayName, displayNameCapacity, length, pErrorCode);
-}
-
/*resets the internal states of a converter
*goal : have the same behaviour than a freshly created converter
*/
-static void _reset(UConverter *converter, UConverterResetChoice choice) {
- /* first, notify the callback functions that the converter is reset */
- UConverterToUnicodeArgs toUArgs = {
- sizeof(UConverterToUnicodeArgs),
- TRUE,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL
- };
- UConverterFromUnicodeArgs fromUArgs = {
- sizeof(UConverterFromUnicodeArgs),
- TRUE,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL
- };
- UErrorCode errorCode;
-
+static void _reset(UConverter *converter, UConverterResetChoice choice,
+ UBool callCallback) {
if(converter == NULL) {
return;
}
- toUArgs.converter = fromUArgs.converter = converter;
- if(choice<=UCNV_RESET_TO_UNICODE) {
- errorCode = U_ZERO_ERROR;
- converter->fromCharErrorBehaviour(converter->toUContext, &toUArgs, NULL, 0, UCNV_RESET, &errorCode);
- }
- if(choice!=UCNV_RESET_TO_UNICODE) {
- errorCode = U_ZERO_ERROR;
- converter->fromUCharErrorBehaviour(converter->fromUContext, &fromUArgs, NULL, 0, 0, UCNV_RESET, &errorCode);
+ if(callCallback) {
+ /* first, notify the callback functions that the converter is reset */
+ UConverterToUnicodeArgs toUArgs = {
+ sizeof(UConverterToUnicodeArgs),
+ TRUE,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL
+ };
+ UConverterFromUnicodeArgs fromUArgs = {
+ sizeof(UConverterFromUnicodeArgs),
+ TRUE,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL
+ };
+ UErrorCode errorCode;
+
+ toUArgs.converter = fromUArgs.converter = converter;
+ if(choice<=UCNV_RESET_TO_UNICODE) {
+ errorCode = U_ZERO_ERROR;
+ converter->fromCharErrorBehaviour(converter->toUContext, &toUArgs, NULL, 0, UCNV_RESET, &errorCode);
+ }
+ if(choice!=UCNV_RESET_TO_UNICODE) {
+ errorCode = U_ZERO_ERROR;
+ converter->fromUCharErrorBehaviour(converter->fromUContext, &fromUArgs, NULL, 0, 0, UCNV_RESET, &errorCode);
+ }
}
/* now reset the converter itself */
if(choice<=UCNV_RESET_TO_UNICODE) {
converter->toUnicodeStatus = converter->sharedData->toUnicodeStatus;
+ converter->mode = 0;
converter->toULength = 0;
converter->invalidCharLength = converter->UCharErrorBufferLength = 0;
+ converter->preToULength = 0;
}
if(choice!=UCNV_RESET_TO_UNICODE) {
converter->fromUnicodeStatus = 0;
- converter->fromUSurrogateLead = 0;
+ converter->fromUChar32 = 0;
converter->invalidUCharLength = converter->charErrorBufferLength = 0;
+ converter->preFromUFirstCP = U_SENTINEL;
+ converter->preFromULength = 0;
}
if (converter->sharedData->impl->reset != NULL) {
/* call the custom reset function */
converter->sharedData->impl->reset(converter, choice);
- } else if(choice<=UCNV_RESET_TO_UNICODE) {
- converter->mode = UCNV_SI;
}
}
U_CAPI void U_EXPORT2
ucnv_reset(UConverter *converter)
{
- _reset(converter, UCNV_RESET_BOTH);
+ _reset(converter, UCNV_RESET_BOTH, TRUE);
}
U_CAPI void U_EXPORT2
ucnv_resetToUnicode(UConverter *converter)
{
- _reset(converter, UCNV_RESET_TO_UNICODE);
+ _reset(converter, UCNV_RESET_TO_UNICODE, TRUE);
}
U_CAPI void U_EXPORT2
ucnv_resetFromUnicode(UConverter *converter)
{
- _reset(converter, UCNV_RESET_FROM_UNICODE);
+ _reset(converter, UCNV_RESET_FROM_UNICODE, TRUE);
}
U_CAPI int8_t U_EXPORT2
ucnv_getMaxCharSize (const UConverter * converter)
{
- return converter->sharedData->staticData->maxBytesPerChar;
+ return converter->maxBytesPerUChar;
}
@@ -663,14 +555,27 @@ ucnv_getName (const UConverter * converter, UErrorCode * err)
return converter->sharedData->staticData->name;
}
-U_CAPI int32_t U_EXPORT2
-ucnv_getCCSID (const UConverter * converter,
- UErrorCode * err)
+U_CAPI int32_t U_EXPORT2
+ucnv_getCCSID(const UConverter * converter,
+ UErrorCode * err)
{
+ int32_t ccsid;
if (U_FAILURE (*err))
return -1;
- return converter->sharedData->staticData->codepage;
+ ccsid = converter->sharedData->staticData->codepage;
+ if (ccsid == 0) {
+ /* Rare case. This is for cases like gb18030,
+ which doesn't have an IBM canonical name, but does have an IBM alias. */
+ const char *standardName = ucnv_getStandardName(ucnv_getName(converter, err), "IBM", err);
+ if (U_SUCCESS(*err) && standardName) {
+ const char *ccsidStr = uprv_strchr(standardName, '-');
+ if (ccsidStr) {
+ ccsid = (int32_t)atol(ccsidStr+1); /* +1 to skip '-' */
+ }
+ }
+ }
+ return ccsid;
}
@@ -684,33 +589,6 @@ ucnv_getPlatform (const UConverter * converter,
return (UConverterPlatform)converter->sharedData->staticData->platform;
}
-U_CAPI void U_EXPORT2
-ucnv_getUnicodeSet(const UConverter *cnv,
- USet *set,
- UConverterUnicodeSet which,
- UErrorCode *pErrorCode) {
- /* argument checking */
- if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
- return;
- }
- if(cnv==NULL || set==NULL || whichsharedData->impl->getUnicodeSet==NULL) {
- *pErrorCode=U_UNSUPPORTED_ERROR;
- return;
- }
-
- /* empty the set */
- uset_clear(set);
-
- /* call the converter to add the code points it supports */
- cnv->sharedData->impl->getUnicodeSet(cnv, set, which, pErrorCode);
-}
-
U_CAPI void U_EXPORT2
ucnv_getToUCallBack (const UConverter * converter,
UConverterToUCallback *action,
@@ -761,203 +639,868 @@ ucnv_setFromUCallBack (UConverter * converter,
converter->fromUContext = newContext;
}
-U_CAPI void U_EXPORT2
-ucnv_fromUnicode (UConverter * _this,
- char **target,
- const char *targetLimit,
- const UChar ** source,
- const UChar * sourceLimit,
- int32_t* offsets,
- UBool flush,
- UErrorCode * err)
-{
- UConverterFromUnicodeArgs args;
- const char *t;
+static void
+_updateOffsets(int32_t *offsets, int32_t length,
+ int32_t sourceIndex, int32_t errorInputLength) {
+ int32_t *limit;
+ int32_t delta, offset;
+
+ if(sourceIndex>=0) {
+ /*
+ * adjust each offset by adding the previous sourceIndex
+ * minus the length of the input sequence that caused an
+ * error, if any
+ */
+ delta=sourceIndex-errorInputLength;
+ } else {
+ /*
+ * set each offset to -1 because this conversion function
+ * does not handle offsets
+ */
+ delta=-1;
+ }
+
+ limit=offsets+length;
+ if(delta==0) {
+ /* most common case, nothing to do */
+ } else if(delta>0) {
+ /* add the delta to each offset (but not if the offset is <0) */
+ while(offsets=0) {
+ *offsets=offset+delta;
+ }
+ ++offsets;
+ }
+ } else /* delta<0 */ {
+ /*
+ * set each offset to -1 because this conversion function
+ * does not handle offsets
+ * or the error input sequence started in a previous buffer
+ */
+ while(offsetsconverter;
+ s=pArgs->source;
+ t=pArgs->target;
+ offsets=pArgs->offsets;
+
+ /* get the converter implementation function */
+ sourceIndex=0;
+ if(offsets==NULL) {
+ fromUnicode=cnv->sharedData->impl->fromUnicode;
+ } else {
+ fromUnicode=cnv->sharedData->impl->fromUnicodeWithOffsets;
+ if(fromUnicode==NULL) {
+ /* there is no WithOffsets implementation */
+ fromUnicode=cnv->sharedData->impl->fromUnicode;
+ /* we will write -1 for each offset */
+ sourceIndex=-1;
+ }
+ }
+
+ if(cnv->preFromULength>=0) {
+ /* normal mode */
+ realSource=NULL;
+
+ /* avoid compiler warnings - not otherwise necessary, and the values do not matter */
+ realSourceLimit=NULL;
+ realFlush=FALSE;
+ realSourceIndex=0;
+ } else {
+ /*
+ * Previous m:n conversion stored source units from a partial match
+ * and failed to consume all of them.
+ * We need to "replay" them from a temporary buffer and convert them first.
+ */
+ realSource=pArgs->source;
+ realSourceLimit=pArgs->sourceLimit;
+ realFlush=pArgs->flush;
+ realSourceIndex=sourceIndex;
+
+ uprv_memcpy(replay, cnv->preFromU, -cnv->preFromULength*U_SIZEOF_UCHAR);
+ pArgs->source=replay;
+ pArgs->sourceLimit=replay-cnv->preFromULength;
+ pArgs->flush=FALSE;
+ sourceIndex=-1;
+
+ cnv->preFromULength=0;
+ }
/*
- * Check parameters in for all conversions
- */
- if (err == NULL || U_FAILURE (*err)) {
+ * loop for conversion and error handling
+ *
+ * loop {
+ * convert
+ * loop {
+ * update offsets
+ * handle end of input
+ * handle errors/call callback
+ * }
+ * }
+ */
+ for(;;) {
+ /* convert */
+ fromUnicode(pArgs, err);
+
+ /*
+ * set a flag for whether the converter
+ * successfully processed the end of the input
+ *
+ * need not check cnv->preFromULength==0 because a replay (<0) will cause
+ * sflush && pArgs->source==pArgs->sourceLimit &&
+ cnv->fromUChar32==0);
+
+ /* no callback called yet for this iteration */
+ calledCallback=FALSE;
+
+ /* no sourceIndex adjustment for conversion, only for callback output */
+ errorInputLength=0;
+
+ /*
+ * loop for offsets and error handling
+ *
+ * iterates at most 3 times:
+ * 1. to clean up after the conversion function
+ * 2. after the callback
+ * 3. after the callback again if there was truncated input
+ */
+ for(;;) {
+ /* update offsets if we write any */
+ if(offsets!=NULL) {
+ int32_t length=(int32_t)(pArgs->target-t);
+ if(length>0) {
+ _updateOffsets(offsets, length, sourceIndex, errorInputLength);
+
+ /*
+ * if a converter handles offsets and updates the offsets
+ * pointer at the end, then pArgs->offsets should not change
+ * here;
+ * however, some converters do not handle offsets at all
+ * (sourceIndex<0) or may not update the offsets pointer
+ */
+ pArgs->offsets=offsets+=length;
+ }
+
+ if(sourceIndex>=0) {
+ sourceIndex+=(int32_t)(pArgs->source-s);
+ }
+ }
+
+ if(cnv->preFromULength<0) {
+ /*
+ * switch the source to new replay units (cannot occur while replaying)
+ * after offset handling and before end-of-input and callback handling
+ */
+ if(realSource==NULL) {
+ realSource=pArgs->source;
+ realSourceLimit=pArgs->sourceLimit;
+ realFlush=pArgs->flush;
+ realSourceIndex=sourceIndex;
+
+ uprv_memcpy(replay, cnv->preFromU, -cnv->preFromULength*U_SIZEOF_UCHAR);
+ pArgs->source=replay;
+ pArgs->sourceLimit=replay-cnv->preFromULength;
+ pArgs->flush=FALSE;
+ if((sourceIndex+=cnv->preFromULength)<0) {
+ sourceIndex=-1;
+ }
+
+ cnv->preFromULength=0;
+ } else {
+ /* see implementation note before _fromUnicodeWithCallback() */
+ U_ASSERT(realSource==NULL);
+ *err=U_INTERNAL_PROGRAM_ERROR;
+ }
+ }
+
+ /* update pointers */
+ s=pArgs->source;
+ t=pArgs->target;
+
+ if(U_SUCCESS(*err)) {
+ if(ssourceLimit) {
+ /*
+ * continue with the conversion loop while there is still input left
+ * (continue converting by breaking out of only the inner loop)
+ */
+ break;
+ } else if(realSource!=NULL) {
+ /* switch back from replaying to the real source and continue */
+ pArgs->source=realSource;
+ pArgs->sourceLimit=realSourceLimit;
+ pArgs->flush=realFlush;
+ sourceIndex=realSourceIndex;
+
+ realSource=NULL;
+ break;
+ } else if(pArgs->flush && cnv->fromUChar32!=0) {
+ /*
+ * the entire input stream is consumed
+ * and there is a partial, truncated input sequence left
+ */
+
+ /* inject an error and continue with callback handling */
+ *err=U_TRUNCATED_CHAR_FOUND;
+ calledCallback=FALSE; /* new error condition */
+ } else {
+ /* input consumed */
+ if(pArgs->flush) {
+ /*
+ * return to the conversion loop once more if the flush
+ * flag is set and the conversion function has not
+ * successfully processed the end of the input yet
+ *
+ * (continue converting by breaking out of only the inner loop)
+ */
+ if(!converterSawEndOfInput) {
+ break;
+ }
+
+ /* reset the converter without calling the callback function */
+ _reset(cnv, UCNV_RESET_FROM_UNICODE, FALSE);
+ }
+
+ /* done successfully */
+ return;
+ }
+ }
+
+ /* U_FAILURE(*err) */
+ {
+ UErrorCode e;
+
+ if( calledCallback ||
+ (e=*err)==U_BUFFER_OVERFLOW_ERROR ||
+ (e!=U_INVALID_CHAR_FOUND &&
+ e!=U_ILLEGAL_CHAR_FOUND &&
+ e!=U_TRUNCATED_CHAR_FOUND)
+ ) {
+ /*
+ * the callback did not or cannot resolve the error:
+ * set output pointers and return
+ *
+ * the check for buffer overflow is redundant but it is
+ * a high-runner case and hopefully documents the intent
+ * well
+ *
+ * if we were replaying, then the replay buffer must be
+ * copied back into the UConverter
+ * and the real arguments must be restored
+ */
+ if(realSource!=NULL) {
+ int32_t length;
+
+ U_ASSERT(cnv->preFromULength==0);
+
+ length=(int32_t)(pArgs->sourceLimit-pArgs->source);
+ if(length>0) {
+ uprv_memcpy(cnv->preFromU, pArgs->source, length*U_SIZEOF_UCHAR);
+ cnv->preFromULength=(int8_t)-length;
+ }
+
+ pArgs->source=realSource;
+ pArgs->sourceLimit=realSourceLimit;
+ pArgs->flush=realFlush;
+ }
+
+ return;
+ }
+ }
+
+ /* callback handling */
+ {
+ UChar32 codePoint;
+
+ /* get and write the code point */
+ codePoint=cnv->fromUChar32;
+ errorInputLength=0;
+ U16_APPEND_UNSAFE(cnv->invalidUCharBuffer, errorInputLength, codePoint);
+ cnv->invalidUCharLength=(int8_t)errorInputLength;
+
+ /* set the converter state to deal with the next character */
+ cnv->fromUChar32=0;
+
+ /* call the callback function */
+ cnv->fromUCharErrorBehaviour(cnv->fromUContext, pArgs,
+ cnv->invalidUCharBuffer, errorInputLength, codePoint,
+ *err==U_INVALID_CHAR_FOUND ? UCNV_UNASSIGNED : UCNV_ILLEGAL,
+ err);
+ }
+
+ /*
+ * loop back to the offset handling
+ *
+ * this flag will indicate after offset handling
+ * that a callback was called;
+ * if the callback did not resolve the error, then we return
+ */
+ calledCallback=TRUE;
+ }
+ }
+}
+
+U_CAPI void U_EXPORT2
+ucnv_fromUnicode(UConverter *cnv,
+ char **target, const char *targetLimit,
+ const UChar **source, const UChar *sourceLimit,
+ int32_t *offsets,
+ UBool flush,
+ UErrorCode *err) {
+ UConverterFromUnicodeArgs args;
+ const UChar *s;
+ char *t;
+
+ /* check parameters */
+ if(err==NULL || U_FAILURE(*err)) {
return;
}
- if (_this == NULL || target == NULL || source == NULL) {
- *err = U_ILLEGAL_ARGUMENT_ERROR;
+ if(cnv==NULL || target==NULL || source==NULL) {
+ *err=U_ILLEGAL_ARGUMENT_ERROR;
return;
}
- t = *target;
- if (targetLimit < t || sourceLimit < *source)
- {
- *err = U_ILLEGAL_ARGUMENT_ERROR;
+ s=*source;
+ t=*target;
+ if(sourceLimit (size_t)0x7fffffff && targetLimit > t)
- {
- targetLimit = t + 0x7fffffff;
+ * Make sure that the buffer sizes do not exceed the number range for
+ * int32_t because some functions use the size (in units or bytes)
+ * rather than comparing pointers, and because offsets are int32_t values.
+ *
+ * size_t is guaranteed to be unsigned and large enough for the job.
+ *
+ * Return with an error instead of adjusting the limits because we would
+ * not be able to maintain the semantics that either the source must be
+ * consumed or the target filled (unless an error occurs).
+ * An adjustment would be targetLimit=t+0x7fffffff; for example.
+ */
+ if(
+ ((size_t)(sourceLimit-s)>(size_t)0x3fffffff && sourceLimit>s) ||
+ ((size_t)(targetLimit-t)>(size_t)0x7fffffff && targetLimit>t)
+ ) {
+ *err=U_ILLEGAL_ARGUMENT_ERROR;
+ return;
}
- /*
- * Deal with stored carry over data. This is done in the common location
- * to avoid doing it for each conversion.
- */
- if (_this->charErrorBufferLength > 0)
- {
- int32_t myTargetIndex = 0;
-
- ucnv_flushInternalCharBuffer (_this,
- (char *)t,
- &myTargetIndex,
- targetLimit - *target,
- offsets?&offsets:NULL,
- err);
- *target += myTargetIndex;
- if (U_FAILURE (*err))
- return;
+ /* flush the target overflow buffer */
+ if(cnv->charErrorBufferLength>0) {
+ char *overflow;
+ int32_t i, length;
+
+ overflow=(char *)cnv->charErrorBuffer;
+ length=cnv->charErrorBufferLength;
+ i=0;
+ do {
+ if(t==targetLimit) {
+ /* the overflow buffer contains too much, keep the rest */
+ int32_t j=0;
+
+ do {
+ overflow[j++]=overflow[i++];
+ } while(icharErrorBufferLength=(int8_t)j;
+ *target=t;
+ *err=U_BUFFER_OVERFLOW_ERROR;
+ return;
+ }
+
+ /* copy the overflow contents to the target */
+ *t++=overflow[i++];
+ if(offsets!=NULL) {
+ *offsets++=-1; /* no source index available for old output */
+ }
+ } while(icharErrorBufferLength=0;
}
- if(!flush && *source == sourceLimit) {
+ if(!flush && s==sourceLimit && cnv->preFromULength>=0) {
/* the overflow buffer is emptied and there is no new input: we are done */
+ *target=t;
return;
}
- args.converter = _this;
- args.flush = flush;
- args.offsets = offsets;
- args.source = *source;
- args.sourceLimit = sourceLimit;
- args.target = *target;
- args.targetLimit = targetLimit;
- args.size = sizeof(args);
- if (offsets)
- {
- if (_this->sharedData->impl->fromUnicodeWithOffsets != NULL)
- {
- _this->sharedData->impl->fromUnicodeWithOffsets(&args, err);
- *source = args.source;
- *target = args.target;
- return;
+ /*
+ * Do not simply return with a buffer overflow error if
+ * !flush && t==targetLimit
+ * because it is possible that the source will not generate any output.
+ * For example, the skip callback may be called;
+ * it does not output anything.
+ */
+
+ /* prepare the converter arguments */
+ args.converter=cnv;
+ args.flush=flush;
+ args.offsets=offsets;
+ args.source=s;
+ args.sourceLimit=sourceLimit;
+ args.target=t;
+ args.targetLimit=targetLimit;
+ args.size=sizeof(args);
+
+ _fromUnicodeWithCallback(&args, err);
+
+ *source=args.source;
+ *target=args.target;
+}
+
+/* ucnv_toUnicode() --------------------------------------------------------- */
+
+static void
+_toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
+ UConverterToUnicode toUnicode;
+ UConverter *cnv;
+ const char *s;
+ UChar *t;
+ int32_t *offsets;
+ int32_t sourceIndex;
+ int32_t errorInputLength;
+ UBool converterSawEndOfInput, calledCallback;
+
+ /* variables for m:n conversion */
+ char replay[UCNV_EXT_MAX_BYTES];
+ const char *realSource, *realSourceLimit;
+ int32_t realSourceIndex;
+ UBool realFlush;
+
+ cnv=pArgs->converter;
+ s=pArgs->source;
+ t=pArgs->target;
+ offsets=pArgs->offsets;
+
+ /* get the converter implementation function */
+ sourceIndex=0;
+ if(offsets==NULL) {
+ toUnicode=cnv->sharedData->impl->toUnicode;
+ } else {
+ toUnicode=cnv->sharedData->impl->toUnicodeWithOffsets;
+ if(toUnicode==NULL) {
+ /* there is no WithOffsets implementation */
+ toUnicode=cnv->sharedData->impl->toUnicode;
+ /* we will write -1 for each offset */
+ sourceIndex=-1;
+ }
+ }
+
+ if(cnv->preToULength>=0) {
+ /* normal mode */
+ realSource=NULL;
+
+ /* avoid compiler warnings - not otherwise necessary, and the values do not matter */
+ realSourceLimit=NULL;
+ realFlush=FALSE;
+ realSourceIndex=0;
+ } else {
+ /*
+ * Previous m:n conversion stored source units from a partial match
+ * and failed to consume all of them.
+ * We need to "replay" them from a temporary buffer and convert them first.
+ */
+ realSource=pArgs->source;
+ realSourceLimit=pArgs->sourceLimit;
+ realFlush=pArgs->flush;
+ realSourceIndex=sourceIndex;
+
+ uprv_memcpy(replay, cnv->preToU, -cnv->preToULength);
+ pArgs->source=replay;
+ pArgs->sourceLimit=replay-cnv->preToULength;
+ pArgs->flush=FALSE;
+ sourceIndex=-1;
+
+ cnv->preToULength=0;
+ }
+
+ /*
+ * loop for conversion and error handling
+ *
+ * loop {
+ * convert
+ * loop {
+ * update offsets
+ * handle end of input
+ * handle errors/call callback
+ * }
+ * }
+ */
+ for(;;) {
+ if(U_SUCCESS(*err)) {
+ /* convert */
+ toUnicode(pArgs, err);
+
+ /*
+ * set a flag for whether the converter
+ * successfully processed the end of the input
+ *
+ * need not check cnv->preToULength==0 because a replay (<0) will cause
+ * sflush && pArgs->source==pArgs->sourceLimit &&
+ cnv->toULength==0);
+ } else {
+ /* handle error from getNextUChar() */
+ converterSawEndOfInput=FALSE;
}
- else {
- /* there is no implementation that sets offsets, set them all to -1 */
- int32_t i, targetSize = targetLimit - *target;
-
- for (i=0; itarget-t);
+ if(length>0) {
+ _updateOffsets(offsets, length, sourceIndex, errorInputLength);
+
+ /*
+ * if a converter handles offsets and updates the offsets
+ * pointer at the end, then pArgs->offsets should not change
+ * here;
+ * however, some converters do not handle offsets at all
+ * (sourceIndex<0) or may not update the offsets pointer
+ */
+ pArgs->offsets=offsets+=length;
+ }
+
+ if(sourceIndex>=0) {
+ sourceIndex+=(int32_t)(pArgs->source-s);
+ }
+ }
+
+ if(cnv->preToULength<0) {
+ /*
+ * switch the source to new replay units (cannot occur while replaying)
+ * after offset handling and before end-of-input and callback handling
+ */
+ if(realSource==NULL) {
+ realSource=pArgs->source;
+ realSourceLimit=pArgs->sourceLimit;
+ realFlush=pArgs->flush;
+ realSourceIndex=sourceIndex;
+
+ uprv_memcpy(replay, cnv->preToU, -cnv->preToULength);
+ pArgs->source=replay;
+ pArgs->sourceLimit=replay-cnv->preToULength;
+ pArgs->flush=FALSE;
+ if((sourceIndex+=cnv->preToULength)<0) {
+ sourceIndex=-1;
+ }
+
+ cnv->preToULength=0;
+ } else {
+ /* see implementation note before _fromUnicodeWithCallback() */
+ U_ASSERT(realSource==NULL);
+ *err=U_INTERNAL_PROGRAM_ERROR;
+ }
+ }
+
+ /* update pointers */
+ s=pArgs->source;
+ t=pArgs->target;
+
+ if(U_SUCCESS(*err)) {
+ if(ssourceLimit) {
+ /*
+ * continue with the conversion loop while there is still input left
+ * (continue converting by breaking out of only the inner loop)
+ */
+ break;
+ } else if(realSource!=NULL) {
+ /* switch back from replaying to the real source and continue */
+ pArgs->source=realSource;
+ pArgs->sourceLimit=realSourceLimit;
+ pArgs->flush=realFlush;
+ sourceIndex=realSourceIndex;
+
+ realSource=NULL;
+ break;
+ } else if(pArgs->flush && cnv->toULength>0) {
+ /*
+ * the entire input stream is consumed
+ * and there is a partial, truncated input sequence left
+ */
+
+ /* inject an error and continue with callback handling */
+ *err=U_TRUNCATED_CHAR_FOUND;
+ calledCallback=FALSE; /* new error condition */
+ } else {
+ /* input consumed */
+ if(pArgs->flush) {
+ /*
+ * return to the conversion loop once more if the flush
+ * flag is set and the conversion function has not
+ * successfully processed the end of the input yet
+ *
+ * (continue converting by breaking out of only the inner loop)
+ */
+ if(!converterSawEndOfInput) {
+ break;
+ }
+
+ /* reset the converter without calling the callback function */
+ _reset(cnv, UCNV_RESET_TO_UNICODE, FALSE);
+ }
+
+ /* done successfully */
+ return;
+ }
}
+
+ /* U_FAILURE(*err) */
+ {
+ UErrorCode e;
+
+ if( calledCallback ||
+ (e=*err)==U_BUFFER_OVERFLOW_ERROR ||
+ (e!=U_INVALID_CHAR_FOUND &&
+ e!=U_ILLEGAL_CHAR_FOUND &&
+ e!=U_TRUNCATED_CHAR_FOUND &&
+ e!=U_ILLEGAL_ESCAPE_SEQUENCE &&
+ e!=U_UNSUPPORTED_ESCAPE_SEQUENCE)
+ ) {
+ /*
+ * the callback did not or cannot resolve the error:
+ * set output pointers and return
+ *
+ * the check for buffer overflow is redundant but it is
+ * a high-runner case and hopefully documents the intent
+ * well
+ *
+ * if we were replaying, then the replay buffer must be
+ * copied back into the UConverter
+ * and the real arguments must be restored
+ */
+ if(realSource!=NULL) {
+ int32_t length;
+
+ U_ASSERT(cnv->preToULength==0);
+
+ length=(int32_t)(pArgs->sourceLimit-pArgs->source);
+ if(length>0) {
+ uprv_memcpy(cnv->preToU, pArgs->source, length);
+ cnv->preToULength=(int8_t)-length;
+ }
+
+ pArgs->source=realSource;
+ pArgs->sourceLimit=realSourceLimit;
+ pArgs->flush=realFlush;
+ }
+
+ return;
+ }
+ }
+
+ /* copy toUBytes[] to invalidCharBuffer[] */
+ errorInputLength=cnv->invalidCharLength=cnv->toULength;
+ if(errorInputLength>0) {
+ uprv_memcpy(cnv->invalidCharBuffer, cnv->toUBytes, errorInputLength);
+ }
+
+ /* set the converter state to deal with the next character */
+ cnv->toULength=0;
+
+ /* call the callback function */
+ cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs,
+ cnv->invalidCharBuffer, errorInputLength,
+ (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ?
+ UCNV_UNASSIGNED : UCNV_ILLEGAL,
+ err);
+
+ /*
+ * loop back to the offset handling
+ *
+ * this flag will indicate after offset handling
+ * that a callback was called;
+ * if the callback did not resolve the error, then we return
+ */
+ calledCallback=TRUE;
}
}
-
- /*calls the specific conversion routines */
- _this->sharedData->impl->fromUnicode(&args, err);
- *source = args.source;
- *target = args.target;
}
-
-
-U_CAPI void U_EXPORT2
-ucnv_toUnicode (UConverter * _this,
- UChar ** target,
- const UChar * targetLimit,
- const char **source,
- const char *sourceLimit,
- int32_t* offsets,
- UBool flush,
- UErrorCode * err)
-{
+U_CAPI void U_EXPORT2
+ucnv_toUnicode(UConverter *cnv,
+ UChar **target, const UChar *targetLimit,
+ const char **source, const char *sourceLimit,
+ int32_t *offsets,
+ UBool flush,
+ UErrorCode *err) {
UConverterToUnicodeArgs args;
- const UChar *t;
+ const char *s;
+ UChar *t;
- /*
- * Check parameters in for all conversions
- */
- if (err == NULL || U_FAILURE (*err)) {
+ /* check parameters */
+ if(err==NULL || U_FAILURE(*err)) {
return;
}
- if (_this == NULL || target == NULL || source == NULL) {
- *err = U_ILLEGAL_ARGUMENT_ERROR;
+ if(cnv==NULL || target==NULL || source==NULL) {
+ *err=U_ILLEGAL_ARGUMENT_ERROR;
return;
}
- t = *target;
- if (targetLimit < t || sourceLimit < *source) {
- *err = U_ILLEGAL_ARGUMENT_ERROR;
+ s=*source;
+ t=*target;
+ if(sourceLimit (size_t)0x3fffffff && targetLimit > t) {
- targetLimit = t + 0x3fffffff;
+ * Make sure that the buffer sizes do not exceed the number range for
+ * int32_t because some functions use the size (in units or bytes)
+ * rather than comparing pointers, and because offsets are int32_t values.
+ *
+ * size_t is guaranteed to be unsigned and large enough for the job.
+ *
+ * Return with an error instead of adjusting the limits because we would
+ * not be able to maintain the semantics that either the source must be
+ * consumed or the target filled (unless an error occurs).
+ * An adjustment would be sourceLimit=s+0x7fffffff; for example.
+ */
+ if(
+ ((size_t)(sourceLimit-s)>(size_t)0x7fffffff && sourceLimit>s) ||
+ ((size_t)(targetLimit-t)>(size_t)0x3fffffff && targetLimit>t)
+ ) {
+ *err=U_ILLEGAL_ARGUMENT_ERROR;
+ return;
}
+
+ /* flush the target overflow buffer */
+ if(cnv->UCharErrorBufferLength>0) {
+ UChar *overflow;
+ int32_t i, length;
+
+ overflow=cnv->UCharErrorBuffer;
+ length=cnv->UCharErrorBufferLength;
+ i=0;
+ do {
+ if(t==targetLimit) {
+ /* the overflow buffer contains too much, keep the rest */
+ int32_t j=0;
+
+ do {
+ overflow[j++]=overflow[i++];
+ } while(iUCharErrorBufferLength=(int8_t)j;
+ *target=t;
+ *err=U_BUFFER_OVERFLOW_ERROR;
+ return;
+ }
- /*
- * Deal with stored carry over data. This is done in the common location
- * to avoid doing it for each conversion.
- */
- if (_this->UCharErrorBufferLength > 0)
- {
- int32_t myTargetIndex = 0;
+ /* copy the overflow contents to the target */
+ *t++=overflow[i++];
+ if(offsets!=NULL) {
+ *offsets++=-1; /* no source index available for old output */
+ }
+ } while(iUCharErrorBufferLength=0;
}
- if(!flush && *source == sourceLimit) {
+ if(!flush && s==sourceLimit && cnv->preToULength>=0) {
/* the overflow buffer is emptied and there is no new input: we are done */
+ *target=t;
return;
}
- args.converter = _this;
- args.flush = flush;
- args.offsets = offsets;
- args.source = (char *) *source;
- args.sourceLimit = sourceLimit;
- args.target = *target;
- args.targetLimit = targetLimit;
- args.size = sizeof(args);
- if (offsets) {
- if (_this->sharedData->impl->toUnicodeWithOffsets != NULL) {
- _this->sharedData->impl->toUnicodeWithOffsets(&args, err);
- *source = args.source;
- *target = args.target;
- return;
- } else {
- /* there is no implementation that sets offsets, set them all to -1 */
- int32_t i, targetSize = targetLimit - *target;
-
- for (i=0; isharedData->impl->toUnicode(&args, err);
+ /* prepare the converter arguments */
+ args.converter=cnv;
+ args.flush=flush;
+ args.offsets=offsets;
+ args.source=s;
+ args.sourceLimit=sourceLimit;
+ args.target=t;
+ args.targetLimit=targetLimit;
+ args.size=sizeof(args);
- *source = args.source;
- *target = args.target;
- return;
+ _toUnicodeWithCallback(&args, err);
+
+ *source=args.source;
+ *target=args.target;
}
+/* ucnv_to/fromUChars() ----------------------------------------------------- */
+
U_CAPI int32_t U_EXPORT2
ucnv_fromUChars(UConverter *cnv,
char *dest, int32_t destCapacity,
@@ -1080,65 +1623,213 @@ ucnv_toUChars(UConverter *cnv,
return u_terminateUChars(originalDest, destCapacity, destLength, pErrorCode);
}
-U_CAPI UChar32 U_EXPORT2
-ucnv_getNextUChar(UConverter * converter,
- const char **source,
- const char *sourceLimit,
- UErrorCode * err)
-{
+/* ucnv_getNextUChar() ------------------------------------------------------ */
+
+U_CAPI UChar32 U_EXPORT2
+ucnv_getNextUChar(UConverter *cnv,
+ const char **source, const char *sourceLimit,
+ UErrorCode *err) {
UConverterToUnicodeArgs args;
- UChar32 ch;
+ UChar buffer[U16_MAX_LENGTH];
+ const char *s;
+ UChar32 c;
+ int32_t i, length;
- if(err == NULL || U_FAILURE(*err)) {
+ /* check parameters */
+ if(err==NULL || U_FAILURE(*err)) {
return 0xffff;
}
- if(converter == NULL || source == NULL || sourceLimit < *source) {
- *err = U_ILLEGAL_ARGUMENT_ERROR;
+ if(cnv==NULL || source==NULL) {
+ *err=U_ILLEGAL_ARGUMENT_ERROR;
return 0xffff;
}
- /* In case internal data had been stored
- * we return the first UChar32 in the internal buffer,
- * and update the internal state accordingly
- */
- if (converter->UCharErrorBufferLength > 0)
- {
- int32_t i = 0;
- UChar32 myUChar;
- UTF_NEXT_CHAR(converter->UCharErrorBuffer, i, sizeof(converter->UCharErrorBuffer), myUChar);
- /*In this memmove we update the internal buffer by
- *popping the first character.
- *Note that in the call itself we decrement
- *UCharErrorBufferLength
- */
- uprv_memmove (converter->UCharErrorBuffer,
- converter->UCharErrorBuffer + i,
- (converter->UCharErrorBufferLength - i) * sizeof (UChar));
- converter->UCharErrorBufferLength -= (int8_t)i;
- return myUChar;
- }
- /*calls the specific conversion routines */
- /*as dictated in a code review, avoids a switch statement */
- args.converter = converter;
- args.flush = TRUE;
- args.offsets = NULL;
- args.source = *source;
- args.sourceLimit = sourceLimit;
- args.target = NULL;
- args.targetLimit = NULL;
- args.size = sizeof(args);
- if (converter->sharedData->impl->getNextUChar != NULL)
- {
- ch = converter->sharedData->impl->getNextUChar(&args, err);
+ s=*source;
+ if(sourceLimit(size_t)0x7fffffff && sourceLimit>s)) {
+ *err=U_ILLEGAL_ARGUMENT_ERROR;
+ return 0xffff;
+ }
+
+ c=U_SENTINEL;
+
+ /* flush the target overflow buffer */
+ if(cnv->UCharErrorBufferLength>0) {
+ UChar *overflow;
+
+ overflow=cnv->UCharErrorBuffer;
+ i=0;
+ length=cnv->UCharErrorBufferLength;
+ U16_NEXT(overflow, i, length, c);
+
+ /* move the remaining overflow contents up to the beginning */
+ if((cnv->UCharErrorBufferLength=(int8_t)(length-i))>0) {
+ uprv_memmove(cnv->UCharErrorBuffer, cnv->UCharErrorBuffer+i,
+ cnv->UCharErrorBufferLength*U_SIZEOF_UCHAR);
+ }
+
+ if(!U16_IS_LEAD(c) || itoULength==0 && cnv->sharedData->impl->getNextUChar!=NULL) {
+ c=cnv->sharedData->impl->getNextUChar(&args, err);
+ *source=s=args.source;
+ if(*err==U_INDEX_OUTOFBOUNDS_ERROR) {
+ /* reset the converter without calling the callback function */
+ _reset(cnv, UCNV_RESET_TO_UNICODE, FALSE);
+ return 0xffff; /* no output */
+ } else if(U_SUCCESS(*err) && c>=0) {
+ return c;
+ /*
+ * else fall through to use _toUnicode() because
+ * UCNV_GET_NEXT_UCHAR_USE_TO_U: the native function did not want to handle it after all
+ * U_FAILURE: call _toUnicode() for callback handling (do not output c)
+ */
+ }
+ }
+
+ /* convert to one UChar in buffer[0], or handle getNextUChar() errors */
+ _toUnicodeWithCallback(&args, err);
+
+ if(*err==U_BUFFER_OVERFLOW_ERROR) {
+ *err=U_ZERO_ERROR;
+ }
+
+ i=0;
+ length=(int32_t)(args.target-buffer);
+ } else {
+ /* write the lead surrogate from the overflow buffer */
+ buffer[0]=(UChar)c;
+ args.target=buffer+1;
+ i=0;
+ length=1;
+ }
+
+ /* buffer contents start at i and end before length */
+
+ if(U_FAILURE(*err)) {
+ c=0xffff; /* no output */
+ } else if(length==0) {
+ /* no input or only state changes */
+ *err=U_INDEX_OUTOFBOUNDS_ERROR;
+ /* no need to reset explicitly because _toUnicodeWithCallback() did it */
+ c=0xffff; /* no output */
} else {
- /* default implementation */
- ch = ucnv_getNextUCharFromToUImpl(&args, converter->sharedData->impl->toUnicode, FALSE, err);
+ c=buffer[0];
+ i=1;
+ if(!U16_IS_LEAD(c)) {
+ /* consume c=buffer[0], done */
+ } else {
+ /* got a lead surrogate, see if a trail surrogate follows */
+ UChar c2;
+
+ if(cnv->UCharErrorBufferLength>0) {
+ /* got overflow output from the conversion */
+ if(U16_IS_TRAIL(c2=cnv->UCharErrorBuffer[0])) {
+ /* got a trail surrogate, too */
+ c=U16_GET_SUPPLEMENTARY(c, c2);
+
+ /* move the remaining overflow contents up to the beginning */
+ if((--cnv->UCharErrorBufferLength)>0) {
+ uprv_memmove(cnv->UCharErrorBuffer, cnv->UCharErrorBuffer+1,
+ cnv->UCharErrorBufferLength*U_SIZEOF_UCHAR);
+ }
+ } else {
+ /* c is an unpaired lead surrogate, just return it */
+ }
+ } else if(args.sourceUCharErrorBufferLength)>0) {
+ uprv_memmove(cnv->UCharErrorBuffer+delta, cnv->UCharErrorBuffer,
+ length*U_SIZEOF_UCHAR);
+ }
+ cnv->UCharErrorBufferLength=(int8_t)(length+delta);
+
+ cnv->UCharErrorBuffer[0]=buffer[i++];
+ if(delta>1) {
+ cnv->UCharErrorBuffer[1]=buffer[i];
+ }
}
- *source = args.source;
- return ch;
+
+ *source=args.source;
+ return c;
}
+/* ucnv_convert() and siblings ---------------------------------------------- */
+
U_CAPI void U_EXPORT2
ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,
char **target, const char *targetLimit,
@@ -1463,7 +2154,7 @@ ucnv_getType(const UConverter* converter)
int8_t type = converter->sharedData->staticData->conversionType;
#if !UCONFIG_NO_LEGACY_CONVERSION
if(type == UCNV_MBCS) {
- return _MBCSGetType(converter);
+ return ucnv_MBCSGetType(converter);
}
#endif
return (UConverterType)type;
@@ -1682,13 +2373,19 @@ ucnv_detectUnicodeSignature( const char* source,
*signatureLength=4;
return "UTF-7";
}
+ }else if(start[0]=='\xDD' && start[1]== '\x73'&& start[2]=='\x66' && start[3]=='\x73'){
+ *signatureLength=4;
+ return "UTF-EBCDIC";
}
+
/* no known Unicode signature byte sequence recognized */
*signatureLength=0;
return NULL;
}
+#endif
+
/*
* Hey, Emacs, please set the following:
*
diff --git a/icuSources/common/ucnv2022.c b/icuSources/common/ucnv2022.c
index 5c083e40..cc8159a2 100644
--- a/icuSources/common/ucnv2022.c
+++ b/icuSources/common/ucnv2022.c
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 2000-2003, International Business Machines
+* Copyright (C) 2000-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv2022.c
@@ -28,22 +28,55 @@
#include "unicode/utypes.h"
-#if !UCONFIG_NO_LEGACY_CONVERSION
+#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
#include "unicode/ucnv.h"
#include "unicode/uset.h"
#include "unicode/ucnv_err.h"
#include "unicode/ucnv_cb.h"
+#include "ucnv_imp.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "ucnvmbcs.h"
#include "cstring.h"
#include "cmemory.h"
-static const char UCNV_SS2[] = "\x1B\x4E";
-static const char UCNV_SS3[] = "\x1B\x4F";
-#define UCNV_SS2_LEN 2
-#define UCNV_SS3_LEN 2
+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+
+#ifdef U_ENABLE_GENERIC_ISO_2022
+/*
+ * I am disabling the generic ISO-2022 converter after proposing to do so on
+ * the icu mailing list two days ago.
+ *
+ * Reasons:
+ * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
+ * its designation sequences, single shifts with return to the previous state,
+ * switch-with-no-return to UTF-16BE or similar, etc.
+ * This is unlike the language-specific variants like ISO-2022-JP which
+ * require a much smaller repertoire of ISO-2022 features.
+ * These variants continue to be supported.
+ * 2. I believe that no one is really using the generic ISO-2022 converter
+ * but rather always one of the language-specific variants.
+ * Note that ICU's generic ISO-2022 converter has always output one escape
+ * sequence followed by UTF-8 for the whole stream.
+ * 3. Switching between subcharsets is extremely slow, because each time
+ * the previous converter is closed and a new one opened,
+ * without any kind of caching, least-recently-used list, etc.
+ * 4. The code is currently buggy, and given the above it does not seem
+ * reasonable to spend the time on maintenance.
+ * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
+ * This means, for example, that when ISO-8859-7 is designated, the following
+ * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
+ * The ICU ISO-2022 converter does not handle this - and has no information
+ * about which subconverter would have to be shifted vs. which is designed
+ * for 7-bit ISO-2022.
+ *
+ * Markus Scherer 2003-dec-03
+ */
+#endif
+
+static const char SHIFT_IN_STR[] = "\x0F";
+static const char SHIFT_OUT_STR[] = "\x0E";
#define CR 0x0D
#define LF 0x0A
@@ -51,9 +84,16 @@ static const char UCNV_SS3[] = "\x1B\x4F";
#define V_TAB 0x0B
#define SPACE 0x20
-/* for ISO-2022JP implementation*/
+/* for ISO-2022-JP and -CN implementations */
typedef enum {
+ /* shared values */
+ INVALID_STATE=-1,
ASCII = 0,
+
+ SS2_STATE=0x10,
+ SS3_STATE,
+
+ /* JP */
ISO8859_1 = 1 ,
ISO8859_7 = 2 ,
JISX201 = 3,
@@ -62,113 +102,91 @@ typedef enum {
GB2312 =6,
KSC5601 =7,
HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */
- INVALID_STATE=-1
+ /* CN */
+ /* the first few enum constants must keep their values because they are used as indexes into myConverterArray[] */
+ GB2312_1=1,
+ ISO_IR_165=2,
+ CNS_11643=3,
+
+ /*
+ * these are used in StateEnum and ISO2022State variables,
+ * but CNS_11643 must be used to index into myConverterArray[]
+ */
+ CNS_11643_0=0x20,
+ CNS_11643_1,
+ CNS_11643_2,
+ CNS_11643_3,
+ CNS_11643_4,
+ CNS_11643_5,
+ CNS_11643_6,
+ CNS_11643_7
} StateEnum;
+/* is the StateEnum charset value for a DBCS charset? */
+#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
+
+#define CSM(cs) ((uint16_t)1<<(cs))
+/*
+ * Each of these charset masks (at index x) has a bit set for exactly those charsets
+ * that are used in the corresponding version x of ISO_2022,locale=ja,version=x
+ *
+ * Note: The converter uses some leniency:
+ * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
+ * all versions, not just JIS7 and JIS8.
+ * - ICU does not distinguish between different versions of JIS X 0208.
+ */
+static const uint16_t jpCharsetMasks[5]={
+ CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
+ CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
+ CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
+ CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
+ CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
+};
typedef enum {
ASCII1=0,
LATIN1,
SBCS,
DBCS,
- MBCS
-
+ MBCS,
+ HWKANA
}Cnv2022Type;
+typedef struct ISO2022State {
+ int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
+ int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
+ int8_t prevG; /* g before single shift (SS2 or SS3) */
+} ISO2022State;
+
#define UCNV_OPTIONS_VERSION_MASK 0xf
#define UCNV_2022_MAX_CONVERTERS 10
typedef struct{
UConverter *currentConverter;
- UConverter *fromUnicodeConverter;
+#ifdef U_ENABLE_GENERIC_ISO_2022
UBool isFirstBuffer;
- StateEnum toUnicodeCurrentState;
- StateEnum fromUnicodeCurrentState;
- StateEnum toUnicodeSaveState;
+#endif
Cnv2022Type currentType;
- int plane;
- UConverter* myConverterArray[UCNV_2022_MAX_CONVERTERS];
- UBool isEscapeAppended;
- UBool isShiftAppended;
- UBool isLocaleSpecified;
+ ISO2022State toU2022State, fromU2022State;
+ UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
uint32_t key;
uint32_t version;
char locale[3];
char name[30];
}UConverterDataISO2022;
+/* Protos */
/* ISO-2022 ----------------------------------------------------------------- */
/*Forward declaration */
U_CFUNC void
-T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
- UErrorCode * err);
+ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
+ UErrorCode * err);
U_CFUNC void
-T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
- UErrorCode * err);
-U_CFUNC void
-_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
- UErrorCode *pErrorCode);
-U_CFUNC void
-_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
- UErrorCode *pErrorCode);
-
-/* Protos */
-/***************** ISO-2022 ********************************/
-static void
-_ISO_2022_GetUnicodeSet(const UConverter *cnv,
- USet *set,
- UConverterUnicodeSet which,
- UErrorCode *pErrorCode);
-
-static void
-T_UConverter_toUnicode_ISO_2022(UConverterToUnicodeArgs * args,
- UErrorCode * err);
-static void
-T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
- UErrorCode * err);
-
-static UChar32
-T_UConverter_getNextUChar_ISO_2022 (UConverterToUnicodeArgs * args,
- UErrorCode * err);
-
-/***************** ISO-2022-JP ********************************/
-
-static void
-UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args,
- UErrorCode* err);
-
-static void
-UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
- UErrorCode* err);
-
-/***************** ISO-2022-KR ********************************/
-
-static void
-UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args,
- UErrorCode* err);
-
-static void
-UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
- UErrorCode* err);
-/* Special function for getting output from IBM-25546 code page*/
-static void
-UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
- UErrorCode* err);
-static void
-UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args,
- UErrorCode* err);
-/***************** ISO-2022-CN ********************************/
-
-static void
-UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args,
- UErrorCode* err);
-
-static void
-UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
- UErrorCode* err);
+ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
+ UErrorCode * err);
#define ESC_2022 0x1B /*ESC*/
@@ -177,10 +195,7 @@ typedef enum
INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
- VALID_MAYBE_TERMINAL_2022 = 2, /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
- VALID_SS2_SEQUENCE=3,
- VALID_SS3_SEQUENCE=4
-
+ VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
} UCNV_TableStates_2022;
/*
@@ -251,6 +266,25 @@ static const int8_t normalize_esq_chars_2022[256] = {
,0 ,0 ,0 ,0 ,0 ,0
};
+#ifdef U_ENABLE_GENERIC_ISO_2022
+/*
+ * When the generic ISO-2022 converter is completely removed, not just disabled
+ * per #ifdef, then the following state table and the associated tables that are
+ * dimensioned with MAX_STATES_2022 should be trimmed.
+ *
+ * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
+ * the associated escape sequences starting with ESC ( B should be removed.
+ * This includes the ones with key values 1097 and all of the ones above 1000000.
+ *
+ * For the latter, the tables can simply be truncated.
+ * For the former, since the tables must be kept parallel, it is probably best
+ * to simply duplicate an adjacent table cell, parallel in all tables.
+ *
+ * It may make sense to restructure the tables, especially by using small search
+ * tables for the variants instead of indexing them parallel to the table here.
+ */
+#endif
+
#define MAX_STATES_2022 74
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
/* 0 1 2 3 4 5 6 7 8 9 */
@@ -265,12 +299,13 @@ static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
,35947631 ,35947635 ,35947636 ,35947638
};
+#ifdef U_ENABLE_GENERIC_ISO_2022
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
/* 0 1 2 3 4 5 6 7 8 9 */
NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
- ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX-201" ,"JISX-201" ,"latin1"
+ ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
@@ -279,9 +314,11 @@ static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
};
+#endif
+
static const UCNV_TableStates_2022 escSeqStateTable_Value_2022[MAX_STATES_2022] = {
/* 0 1 2 3 4 5 6 7 8 9 */
- VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_SS2_SEQUENCE ,VALID_SS3_SEQUENCE ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
+ VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
@@ -292,38 +329,16 @@ static const UCNV_TableStates_2022 escSeqStateTable_Value_2022[MAX_STATES_2022]
};
-
-/*for 2022 looks ahead in the stream
- *to determine the longest possible convertible
- *data stream
- */
-static const char* getEndOfBuffer_2022(const char** source,
- const char* sourceLimit,
- UBool flush);
/* Type def for refactoring changeState_2022 code*/
typedef enum{
+#ifdef U_ENABLE_GENERIC_ISO_2022
ISO_2022=0,
+#endif
ISO_2022_JP=1,
ISO_2022_KR=2,
ISO_2022_CN=3
} Variant2022;
-/*runs through a state machine to determine the escape sequence - codepage correspondance
- *changes the pointer pointed to be _this->extraInfo
- */
-static void
-changeState_2022(UConverter* _this,
- const char** source,
- const char* sourceLimit,
- UBool flush,Variant2022 var,int* plane,
- UErrorCode* err);
-
-
-static UCNV_TableStates_2022
-getKey_2022(char source,
- int32_t* key,
- int32_t* offset);
-
/*********** ISO 2022 Converter Protos ***********/
static void
_ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode);
@@ -343,310 +358,148 @@ _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorC
static UConverter *
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
-/************ protos of functions for setting the initial state *********************/
+#ifdef U_ENABLE_GENERIC_ISO_2022
static void
-setInitialStateToUnicodeJPCN(UConverter* converter,UConverterDataISO2022 *myConverterData);
+T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
+#endif
-static void
-setInitialStateFromUnicodeJPCN(UConverter* converter,UConverterDataISO2022 *myConverterData);
+/*const UConverterSharedData _ISO2022Data;*/
+static const UConverterSharedData _ISO2022JPData;
+static const UConverterSharedData _ISO2022KRData;
+static const UConverterSharedData _ISO2022CNData;
-static void
-setInitialStateToUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData);
+/*************** Converter implementations ******************/
static void
-setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData);
+setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
+ if(myConverterData->version == 1) {
+ UConverter *cnv = myConverterData->currentConverter;
-/*************** Converter implemenations ******************/
-static const UConverterImpl _ISO2022Impl={
- UCNV_ISO_2022,
+ cnv->toUnicodeStatus=0; /* offset */
+ cnv->mode=0; /* state */
+ cnv->toULength=0; /* byteIndex */
+ }
+}
- NULL,
- NULL,
+static void
+setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
+ /* in ISO-2022-KR the designator sequence appears only once
+ * in a file so we append it only once
+ */
+ if( converter->charErrorBufferLength==0){
- _ISO2022Open,
- _ISO2022Close,
- _ISO2022Reset,
+ converter->charErrorBufferLength = 4;
+ converter->charErrorBuffer[0] = 0x1b;
+ converter->charErrorBuffer[1] = 0x24;
+ converter->charErrorBuffer[2] = 0x29;
+ converter->charErrorBuffer[3] = 0x43;
+ }
+ if(myConverterData->version == 1) {
+ UConverter *cnv = myConverterData->currentConverter;
- T_UConverter_toUnicode_ISO_2022,
- T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
- T_UConverter_fromUnicode_UTF8,
- T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC,
- T_UConverter_getNextUChar_ISO_2022,
+ cnv->fromUChar32=0;
+ cnv->fromUnicodeStatus=1; /* prevLength */
+ }
+}
- NULL,
- _ISO2022getName,
- _ISO_2022_WriteSub,
- _ISO_2022_SafeClone,
- _ISO_2022_GetUnicodeSet
-};
-static const UConverterStaticData _ISO2022StaticData={
- sizeof(UConverterStaticData),
- "ISO_2022",
- 2022,
- UCNV_IBM,
- UCNV_ISO_2022,
- 1,
- 4,
- { 0x1a, 0, 0, 0 },
- 1,
- FALSE,
- FALSE,
- 0,
- 0,
- { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
-};
-const UConverterSharedData _ISO2022Data={
- sizeof(UConverterSharedData),
- ~((uint32_t) 0),
- NULL,
- NULL,
- &_ISO2022StaticData,
- FALSE,
- &_ISO2022Impl,
- 0
-};
+static void
+_ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
-/*************JP****************/
-static const UConverterImpl _ISO2022JPImpl={
- UCNV_ISO_2022,
+ char myLocale[6]={' ',' ',' ',' ',' ',' '};
- NULL,
- NULL,
+ cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
+ if(cnv->extraInfo != NULL) {
+ UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
+ uint32_t version;
- _ISO2022Open,
- _ISO2022Close,
- _ISO2022Reset,
+ uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
+ myConverterData->currentConverter = NULL;
+ myConverterData->currentType = ASCII1;
+ myConverterData->key =0;
+#ifdef U_ENABLE_GENERIC_ISO_2022
+ myConverterData->isFirstBuffer = TRUE;
+#endif
+ cnv->fromUnicodeStatus =FALSE;
+ if(locale){
+ uprv_strncpy(myLocale, locale, sizeof(myLocale));
+ }
+ myConverterData->version= 0;
+ version = options & UCNV_OPTIONS_VERSION_MASK;
+ if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
+ (myLocale[2]=='_' || myLocale[2]=='\0')){
+ int len=0;
+ /* open the required converters and cache them */
+ if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
+ myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
+ }
+ myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode);
+ myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode);
+ if(jpCharsetMasks[version]&CSM(JISX212)) {
+ myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode);
+ }
+ if(jpCharsetMasks[version]&CSM(GB2312)) {
+ myConverterData->myConverterArray[GB2312] = ucnv_loadSharedData("ibm-5478", NULL, errorCode); /* gb_2312_80-1 */
+ }
+ if(jpCharsetMasks[version]&CSM(KSC5601)) {
+ myConverterData->myConverterArray[KSC5601] = ucnv_loadSharedData("ksc_5601", NULL, errorCode);
+ }
- UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
- UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
- UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
- UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
- NULL,
+ /* set the function pointers to appropriate functions */
+ cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
+ uprv_strcpy(myConverterData->locale,"ja");
- NULL,
- _ISO2022getName,
- _ISO_2022_WriteSub,
- _ISO_2022_SafeClone,
- _ISO_2022_GetUnicodeSet
-};
-static const UConverterStaticData _ISO2022JPStaticData={
- sizeof(UConverterStaticData),
- "ISO_2022_JP",
- 0,
- UCNV_IBM,
- UCNV_ISO_2022,
- 1,
- 6,
- { 0x1a, 0, 0, 0 },
- 1,
- FALSE,
- FALSE,
- 0,
- 0,
- { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
-};
-const UConverterSharedData _ISO2022JPData={
- sizeof(UConverterSharedData),
- ~((uint32_t) 0),
- NULL,
- NULL,
- &_ISO2022JPStaticData,
- FALSE,
- &_ISO2022JPImpl,
- 0
-};
+ myConverterData->version = version;
+ uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
+ len = uprv_strlen(myConverterData->name);
+ myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
+ myConverterData->name[len+1]='\0';
+ }
+ else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
+ (myLocale[2]=='_' || myLocale[2]=='\0')){
-/************* KR ***************/
-static const UConverterImpl _ISO2022KRImpl={
- UCNV_ISO_2022,
+ if ((options & UCNV_OPTIONS_VERSION_MASK)==1){
+ myConverterData->version = 1;
+ myConverterData->currentConverter=
+ ucnv_open("icu-internal-25546",errorCode);
- NULL,
- NULL,
+ if (U_FAILURE(*errorCode)) {
+ _ISO2022Close(cnv);
+ return;
+ }
- _ISO2022Open,
- _ISO2022Close,
- _ISO2022Reset,
+ uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
+ uprv_memcpy(cnv->subChar, myConverterData->currentConverter->subChar, 4);
+ cnv->subCharLen = myConverterData->currentConverter->subCharLen;
+ }else{
+ myConverterData->currentConverter=ucnv_open("ibm-949",errorCode);
- UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
- UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
- UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
- UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
- NULL,
+ if (U_FAILURE(*errorCode)) {
+ _ISO2022Close(cnv);
+ return;
+ }
- NULL,
- _ISO2022getName,
- _ISO_2022_WriteSub,
- _ISO_2022_SafeClone,
- _ISO_2022_GetUnicodeSet
-};
-static const UConverterStaticData _ISO2022KRStaticData={
- sizeof(UConverterStaticData),
- "ISO_2022_KR",
- 0,
- UCNV_IBM,
- UCNV_ISO_2022,
- 1,
- 3,
- { 0x1a, 0, 0, 0 },
- 1,
- FALSE,
- FALSE,
- 0,
- 0,
- { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
-};
-const UConverterSharedData _ISO2022KRData={
- sizeof(UConverterSharedData),
- ~((uint32_t) 0),
- NULL,
- NULL,
- &_ISO2022KRStaticData,
- FALSE,
- &_ISO2022KRImpl,
- 0
-};
+ myConverterData->version = 0;
+ uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
+ }
-/*************** CN ***************/
-static const UConverterImpl _ISO2022CNImpl={
-
- UCNV_ISO_2022,
-
- NULL,
- NULL,
-
- _ISO2022Open,
- _ISO2022Close,
- _ISO2022Reset,
-
- UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
- UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
- UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
- UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
- NULL,
-
- NULL,
- _ISO2022getName,
- _ISO_2022_WriteSub,
- _ISO_2022_SafeClone,
- _ISO_2022_GetUnicodeSet
-};
-static const UConverterStaticData _ISO2022CNStaticData={
- sizeof(UConverterStaticData),
- "ISO_2022_CN",
- 0,
- UCNV_IBM,
- UCNV_ISO_2022,
- 2,
- 8,
- { 0x1a, 0, 0, 0 },
- 1,
- FALSE,
- FALSE,
- 0,
- 0,
- { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
-};
-const UConverterSharedData _ISO2022CNData={
- sizeof(UConverterSharedData),
- ~((uint32_t) 0),
- NULL,
- NULL,
- &_ISO2022CNStaticData,
- FALSE,
- &_ISO2022CNImpl,
- 0
-};
-
-
-/**********/
-
-static void
-_ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
-
- char myLocale[6]={' ',' ',' ',' ',' ',' '};
-
- cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
- if(cnv->extraInfo != NULL) {
- UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
- myConverterData->currentConverter = NULL;
- myConverterData->fromUnicodeConverter = NULL;
- myConverterData->currentType= ASCII1;
- myConverterData->plane = -1;
- myConverterData->key =0;
- myConverterData->isFirstBuffer = TRUE;
- cnv->fromUnicodeStatus =FALSE;
- if(locale){
- uprv_strncpy(myLocale, locale, sizeof(myLocale));
- myConverterData->isLocaleSpecified = TRUE;
- }
- myConverterData->version= 0;
- myConverterData->myConverterArray[0] =NULL;
- if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
- (myLocale[2]=='_' || myLocale[2]=='\0')){
- int len=0;
- /* open the required converters and cache them */
- myConverterData->myConverterArray[0]= ucnv_open("ASCII", errorCode );
- myConverterData->myConverterArray[1]= ucnv_open("ISO8859_1", errorCode);
- myConverterData->myConverterArray[2]= ucnv_open("ISO8859_7", errorCode);
- myConverterData->myConverterArray[3]= ucnv_open("jisx-201", errorCode);
- myConverterData->myConverterArray[4]= ucnv_open("jisx-208", errorCode);
- myConverterData->myConverterArray[5]= ucnv_open("jisx-212", errorCode);
- myConverterData->myConverterArray[6]= ucnv_open("ibm-5478", errorCode); /* gb_2312_80-1 */
- myConverterData->myConverterArray[7]= ucnv_open("ksc_5601", errorCode);
- myConverterData->myConverterArray[8]= ucnv_open("jisx-201", errorCode);
- myConverterData->myConverterArray[9]= NULL;
-
- /* initialize the state variables */
- setInitialStateToUnicodeJPCN(cnv, myConverterData);
- setInitialStateFromUnicodeJPCN(cnv,myConverterData);
-
- /* set the function pointers to appropriate funtions */
- cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
- uprv_strcpy(myConverterData->locale,"ja");
-
- myConverterData->version =options & UCNV_OPTIONS_VERSION_MASK;
- uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
- len=strlen(myConverterData->name);
- myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
- myConverterData->name[len+1]='\0';
- }
- else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
- (myLocale[2]=='_' || myLocale[2]=='\0')){
-
- /* initialize the state variables */
- setInitialStateToUnicodeKR(cnv, myConverterData);
- setInitialStateFromUnicodeKR(cnv,myConverterData);
-
- if ((options & UCNV_OPTIONS_VERSION_MASK)==1){
- myConverterData->version = 1;
- myConverterData->currentConverter=myConverterData->fromUnicodeConverter=
- ucnv_open("icu-internal-25546",errorCode);
- uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
- }else{
- myConverterData->currentConverter=myConverterData->fromUnicodeConverter = ucnv_open("ibm-949",errorCode);
- myConverterData->version = 0;
- uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
- }
+ /* initialize the state variables */
+ setInitialStateToUnicodeKR(cnv, myConverterData);
+ setInitialStateFromUnicodeKR(cnv,myConverterData);
/* set the function pointers to appropriate funtions */
cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
- cnv->mode=UCNV_SI;
uprv_strcpy(myConverterData->locale,"ko");
}
else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
(myLocale[2]=='_' || myLocale[2]=='\0')){
/* open the required converters and cache them */
- myConverterData->myConverterArray[0] = ucnv_open("ASCII",errorCode);
- myConverterData->myConverterArray[1] = ucnv_open("ibm-5478",errorCode); /* gb_2312_80-1 */
- myConverterData->myConverterArray[2] = ucnv_open("iso-ir-165",errorCode);
- myConverterData->myConverterArray[3] = ucnv_open("cns-11643-1992",errorCode);
- myConverterData->myConverterArray[4] = NULL;
-
+ myConverterData->myConverterArray[GB2312_1] = ucnv_loadSharedData("ibm-5478", NULL, errorCode);
+ if(version==1) {
+ myConverterData->myConverterArray[ISO_IR_165] = ucnv_loadSharedData("iso-ir-165", NULL, errorCode);
+ }
+ myConverterData->myConverterArray[CNS_11643] = ucnv_loadSharedData("cns-11643-1992", NULL, errorCode);
- /*initialize the state variables*/
- setInitialStateToUnicodeJPCN(cnv, myConverterData);
- setInitialStateFromUnicodeJPCN(cnv,myConverterData);
/* set the function pointers to appropriate funtions */
cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
@@ -661,6 +514,7 @@ _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t opti
}
}
else{
+#ifdef U_ENABLE_GENERIC_ISO_2022
/* append the UTF-8 escape sequence */
cnv->charErrorBufferLength = 3;
cnv->charErrorBuffer[0] = 0x1b;
@@ -669,36 +523,43 @@ _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t opti
cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
/* initialize the state variables */
- myConverterData->isLocaleSpecified=FALSE;
uprv_strcpy(myConverterData->name,"ISO_2022");
+#else
+ *errorCode = U_UNSUPPORTED_ERROR;
+ return;
+#endif
}
+ cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
+
+ if(U_FAILURE(*errorCode)) {
+ _ISO2022Close(cnv);
+ }
} else {
*errorCode = U_MEMORY_ALLOCATION_ERROR;
}
-
}
static void
_ISO2022Close(UConverter *converter) {
- UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
- UConverter **array = myData->myConverterArray;
+ UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
+ UConverterSharedData **array = myData->myConverterArray;
+ int32_t i;
if (converter->extraInfo != NULL) {
/*close the array of converter pointers and free the memory*/
- while(*array!=NULL){
- if(*array==myData->currentConverter){
- myData->currentConverter=NULL;
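+ for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
+ if(array[i]!=NULL) {
+ ucnv_unloadSharedDataIfReady(array[i]);
 }
- ucnv_close(*array++);
 }
- ucnv_close(myData->currentConverter); /* if not closed above */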
+ ucnv_close(myData->currentConverter);
if(!converter->isExtraLocal){
uprv_free (converter->extraInfo);
+ converter->extraInfo = NULL;
}
}
}
@@ -706,9 +567,18 @@ _ISO2022Close(UConverter *converter) {
static void
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
- if(! myConverterData->isLocaleSpecified){
+ if(choice<=UCNV_RESET_TO_UNICODE) {
+ uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
+ myConverterData->key = 0;
+ }
+ if(choice!=UCNV_RESET_TO_UNICODE) {
+ uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
+ }
+#ifdef U_ENABLE_GENERIC_ISO_2022
+ if(myConverterData->locale[0] == 0){
if(choice<=UCNV_RESET_TO_UNICODE) {
myConverterData->isFirstBuffer = TRUE;
+ myConverterData->key = 0;
if (converter->mode == UCNV_SO){
ucnv_close (myConverterData->currentConverter);
myConverterData->currentConverter=NULL;
@@ -723,17 +593,11 @@ _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
converter->charErrorBuffer[2] = 0x42;
}
}
- else {
+ else
+#endif
+ {
/* reset the state variables */
- if(myConverterData->locale[0] == 'j' || myConverterData->locale[0] == 'c'){
- if(choice<=UCNV_RESET_TO_UNICODE) {
- setInitialStateToUnicodeJPCN(converter, myConverterData);
- }
- if(choice!=UCNV_RESET_TO_UNICODE) {
- setInitialStateFromUnicodeJPCN(converter,myConverterData);
- }
- }
- else if(myConverterData->locale[0] == 'k'){
+ if(myConverterData->locale[0] == 'k'){
if(choice<=UCNV_RESET_TO_UNICODE) {
setInitialStateToUnicodeKR(converter, myConverterData);
}
@@ -753,165 +617,405 @@ _ISO2022getName(const UConverter* cnv){
return NULL;
}
-static void
-setInitialStateToUnicodeJPCN(UConverter* converter,UConverterDataISO2022 *myConverterData ){
- myConverterData->toUnicodeCurrentState =ASCII;
- myConverterData->currentConverter = NULL;
- myConverterData->isFirstBuffer = TRUE;
- myConverterData->toUnicodeSaveState = INVALID_STATE;
- converter->mode = UCNV_SI;
-}
+/*************** to unicode *******************/
+/****************************************************************************
+ * Recognized escape sequences are
+ * (B ASCII
+ * .A ISO-8859-1
+ * .F ISO-8859-7
+ * (J JISX-201
+ * (I JISX-201
+ * $B JISX-208
+ * $@ JISX-208
+ * $(D JISX-212
+ * $A GB2312
+ * $(C KSC5601
+ */
+static const StateEnum nextStateToUnicodeJP[MAX_STATES_2022]= {
+/* 0 1 2 3 4 5 6 7 8 9 */
+ INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
+ ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
+ ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
+ ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
+ ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
+ ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
+ ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
+ ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
+};
-static void
-setInitialStateFromUnicodeJPCN(UConverter* converter,UConverterDataISO2022 *myConverterData){
- myConverterData->fromUnicodeCurrentState= ASCII;
- myConverterData->isEscapeAppended=FALSE;
- myConverterData->isShiftAppended=FALSE;
- myConverterData->isLocaleSpecified=TRUE;
- myConverterData->currentType = ASCII1;
- converter->fromUnicodeStatus = FALSE;
+/*************** to unicode *******************/
+static const StateEnum nextStateToUnicodeCN[MAX_STATES_2022]= {
+/* 0 1 2 3 4 5 6 7 8 9 */
+ INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
+ ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
+ ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
+ ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
+ ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
+ ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
+ ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
+ ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
+};
-}
-static void
-setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
+static UCNV_TableStates_2022
+getKey_2022(char c,int32_t* key,int32_t* offset){
+ int32_t togo;
+ int32_t low = 0;
+ int32_t hi = MAX_STATES_2022;
+ int32_t oldmid=0;
- myConverterData->isLocaleSpecified=TRUE;
- converter->mode = UCNV_SI;
- myConverterData->currentConverter = myConverterData->fromUnicodeConverter;
+ togo = normalize_esq_chars_2022[(uint8_t)c];
+ if(togo == 0) {
+ /* not a valid character anywhere in an escape sequence */
+ *key = 0;
+ *offset = 0;
+ return INVALID_2022;
+ }
+ togo = (*key << 5) + togo;
-}
+ while (hi != low) /*binary search*/{
-static void
-setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
- /* in ISO-2022-KR the desginator sequence appears only once
- * in a file so we append it only once
- */
- if( converter->charErrorBufferLength==0){
+ register int32_t mid = (hi+low) >> 1; /*Finds median*/
+
+ if (mid == oldmid)
+ break;
+
+ if (escSeqStateTable_Key_2022[mid] > togo){
+ hi = mid;
+ }
+ else if (escSeqStateTable_Key_2022[mid] < togo){
+ low = mid;
+ }
+ else /*we found it*/{
+ *key = togo;
+ *offset = mid;
+ return escSeqStateTable_Value_2022[mid];
+ }
+ oldmid = mid;
- converter->charErrorBufferLength = 4;
- converter->charErrorBuffer[0] = 0x1b;
- converter->charErrorBuffer[1] = 0x24;
- converter->charErrorBuffer[2] = 0x29;
- converter->charErrorBuffer[3] = 0x43;
}
- myConverterData->isLocaleSpecified=TRUE;
- myConverterData->isShiftAppended=FALSE;
+ *key = 0;
+ *offset = 0;
+ return INVALID_2022;
}
+/*runs through a state machine to determine the escape sequence - codepage correspondence
+ */
+static void
+changeState_2022(UConverter* _this,
+ const char** source,
+ const char* sourceLimit,
+ Variant2022 var,
+ UErrorCode* err){
+ UCNV_TableStates_2022 value;
+ UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
+ uint32_t key = myData2022->key;
+ int32_t offset;
+ char c;
+
+ value = VALID_NON_TERMINAL_2022;
+ while (*source < sourceLimit) {
+ c = *(*source)++;
+ _this->toUBytes[_this->toULength++]=(uint8_t)c;
+ value = getKey_2022(c,(int32_t *) &key, &offset);
+
+ switch (value){
-static U_INLINE void
-CONCAT_ESCAPE_EX(UConverterFromUnicodeArgs* args,
- const UChar* source,
- unsigned char** target,
- const unsigned char* targetLimit,
- int32_t** offsets,
- const char* strToAppend,
- int len,
- UErrorCode* err);
+ case VALID_NON_TERMINAL_2022 :
+ /* continue with the loop */
+ break;
-static U_INLINE void
-MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
- UChar32 c,
- uint32_t* value,
- UBool useFallback,
- int* length,
- int outputType);
+ case VALID_TERMINAL_2022:
+ key = 0;
+ goto DONE;
-static U_INLINE void
-MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
- UChar32 c,
- uint32_t* retval,
- UBool useFallback);
+ case INVALID_2022:
+ goto DONE;
-static U_INLINE void
-CONCAT_ESCAPE_EX(UConverterFromUnicodeArgs* args,
- const UChar* source,
- unsigned char** target,
- const unsigned char* targetLimit,
- int32_t** offsets,
- const char* strToAppend,
- int len,
- UErrorCode* err)
-{
+ case VALID_MAYBE_TERMINAL_2022:
+#ifdef U_ENABLE_GENERIC_ISO_2022
+ /* ESC ( B is ambiguous only for ISO_2022 itself */
+ if(var == ISO_2022) {
+ /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
+ _this->toULength = 0;
- unsigned char* myTarget = *target;
- int32_t* myOffsets = *offsets;
- while(len-->0){
- if(myTarget < targetLimit){
- *(myTarget++) = (unsigned char) *(strToAppend++);
- if(myOffsets){
- *(myOffsets++) = source - args->source -1;
+ /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
+
+ /* continue with the loop */
+ value = VALID_NON_TERMINAL_2022;
+ break;
+ } else
+#endif
+ {
+ /* not ISO_2022 itself, finish here */
+ value = VALID_TERMINAL_2022;
+ key = 0;
+ goto DONE;
}
}
- else{
- args->converter->charErrorBuffer[(int)args->converter->charErrorBufferLength++] = (unsigned char) *(strToAppend++);
- *err =U_BUFFER_OVERFLOW_ERROR;
- }
}
- *target = myTarget;
- *offsets = myOffsets;
-}
-/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
- * any future change in _MBCSFromUChar32() function should be reflected in
- * this macro
- */
-static U_INLINE void
-MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
- UChar32 c,
- uint32_t* value,
- UBool useFallback,
- int* length,
- int outputType)
-{
+DONE:
+ myData2022->key = key;
- const uint16_t *table=sharedData->table->mbcs.fromUnicodeTable;
- uint32_t stage2Entry;
- uint32_t myValue=0;
- const uint8_t *p;
- /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
- if(c<0x10000 || (sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
- stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
- /* get the bytes and the length for the output */
- if(outputType==MBCS_OUTPUT_2){
- myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->table->mbcs.fromUnicodeBytes, stage2Entry, c);
- if(myValue<=0xff) {
- *length=1;
- } else {
- *length=2;
+ if (value == VALID_NON_TERMINAL_2022) {
+ /* indicate that the escape sequence is incomplete: key!=0 */
+ return;
+ } else if (value == INVALID_2022 ) {
+ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+ return;
+ } else /* value == VALID_TERMINAL_2022 */ {
+ switch(var){
+#ifdef U_ENABLE_GENERIC_ISO_2022
+ case ISO_2022:
+ {
+ const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
+ if(chosenConverterName == NULL) {
+ /* SS2 or SS3 */
+ *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
+ return;
}
- }else if(outputType==MBCS_OUTPUT_3){
- p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->table->mbcs.fromUnicodeBytes, stage2Entry, c);
- myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
- if(myValue<=0xff) {
- *length=1;
- } else if(myValue<=0xffff) {
- *length=2;
- } else {
- *length=3;
+
+ _this->mode = UCNV_SI;
+ ucnv_close(myData2022->currentConverter);
+ myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
+ if(U_SUCCESS(*err)) {
+ myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
+ _this->mode = UCNV_SO;
+ }
+ break;
+ }
+#endif
+ case ISO_2022_JP:
+ {
+ StateEnum tempState=nextStateToUnicodeJP[offset];
+ switch(tempState) {
+ case INVALID_STATE:
+ *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
+ break;
+ case SS2_STATE:
+ if(myData2022->toU2022State.cs[2]!=0) {
+ if(myData2022->toU2022State.g<2) {
+ myData2022->toU2022State.prevG=myData2022->toU2022State.g;
+ }
+ myData2022->toU2022State.g=2;
+ } else {
+ /* illegal to have SS2 before a matching designator */
+ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+ }
+ break;
+ /* case SS3_STATE: not used in ISO-2022-JP-x */
+ case ISO8859_1:
+ case ISO8859_7:
+ if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
+ *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
+ } else {
+ /* G2 charset for SS2 */
+ myData2022->toU2022State.cs[2]=(int8_t)tempState;
+ }
+ break;
+ default:
+ if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
+ *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
+ } else {
+ /* G0 charset */
+ myData2022->toU2022State.cs[0]=(int8_t)tempState;
+ }
+ break;
+ }
+ }
+ break;
+ case ISO_2022_CN:
+ {
+ StateEnum tempState=nextStateToUnicodeCN[offset];
+ switch(tempState) {
+ case INVALID_STATE:
+ *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
+ break;
+ case SS2_STATE:
+ if(myData2022->toU2022State.cs[2]!=0) {
+ if(myData2022->toU2022State.g<2) {
+ myData2022->toU2022State.prevG=myData2022->toU2022State.g;
+ }
+ myData2022->toU2022State.g=2;
+ } else {
+ /* illegal to have SS2 before a matching designator */
+ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+ }
+ break;
+ case SS3_STATE:
+ if(myData2022->toU2022State.cs[3]!=0) {
+ if(myData2022->toU2022State.g<2) {
+ myData2022->toU2022State.prevG=myData2022->toU2022State.g;
+ }
+ myData2022->toU2022State.g=3;
+ } else {
+ /* illegal to have SS3 before a matching designator */
+ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+ }
+ break;
+ case ISO_IR_165:
+ if(myData2022->version==0) {
+ *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
+ break;
+ }
+ case GB2312_1:
+ case CNS_11643_1:
+ myData2022->toU2022State.cs[1]=(int8_t)tempState;
+ break;
+ case CNS_11643_2:
+ myData2022->toU2022State.cs[2]=(int8_t)tempState;
+ break;
+ default:
+ /* other CNS 11643 planes */
+ if(myData2022->version==0) {
+ *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
+ } else {
+ myData2022->toU2022State.cs[3]=(int8_t)tempState;
+ }
+ break;
+ }
+ }
+ break;
+ case ISO_2022_KR:
+ if(offset==0x30){
+ /* nothing to be done, just accept this one escape sequence */
+ } else {
+ *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
+ }
+ break;
+
+ default:
+ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+ break;
+ }
+ }
+ if(U_SUCCESS(*err)) {
+ _this->toULength = 0;
+ }
+}
+
+/*Checks the characters of the buffer against valid 2022 escape sequences
+*if they match we return a pointer to the initial start of the sequence, otherwise
+*we return sourceLimit
+*/
+/*for 2022 looks ahead in the stream
+ *to determine the longest possible convertible
+ *data stream
+ */
+static U_INLINE const char*
+getEndOfBuffer_2022(const char** source,
+ const char* sourceLimit,
+ UBool flush){
+
+ const char* mySource = *source;
+
+#ifdef U_ENABLE_GENERIC_ISO_2022
+ if (*source >= sourceLimit)
+ return sourceLimit;
+
+ do{
+
+ if (*mySource == ESC_2022){
+ int8_t i;
+ int32_t key = 0;
+ int32_t offset;
+ UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
+
+ /* Kludge: I could not
+ * figure out the reason for validating an escape sequence
+ * twice - once here and once in changeState_2022().
+ * is it possible to have an ESC character in a ISO2022
+ * byte stream which is valid in a code page? Is it legal?
+ */
+ for (i=0;
+ (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
+ i++) {
+ value = getKey_2022(*(mySource+i), &key, &offset);
+ }
+ if (value > 0 || *mySource==ESC_2022)
+ return mySource;
+
+ if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
+ return sourceLimit;
+ }
+ }while (++mySource < sourceLimit);
+
+ return sourceLimit;
+#else
+ while(mySource < sourceLimit && *mySource != ESC_2022) {
+ ++mySource;
+ }
+ return mySource;
+#endif
+}
+
+
+/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
+ * any future change in _MBCSFromUChar32() function should be reflected in
+ * this macro
+ */
+static U_INLINE void
+MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
+ UChar32 c,
+ uint32_t* value,
+ UBool useFallback,
+ int32_t *length,
+ int outputType)
+{
+ const int32_t *cx;
+ const uint16_t *table;
+ uint32_t stage2Entry;
+ uint32_t myValue;
+ const uint8_t *p;
+ /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
+ if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
+ table=sharedData->mbcs.fromUnicodeTable;
+ stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
+ /* get the bytes and the length for the output */
+ if(outputType==MBCS_OUTPUT_2){
+ myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
+ if(myValue<=0xff) {
+ *length=1;
+ } else {
+ *length=2;
+ }
+ } else /* outputType==MBCS_OUTPUT_3 */ {
+ p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
+ myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
+ if(myValue<=0xff) {
+ *length=1;
+ } else if(myValue<=0xffff) {
+ *length=2;
+ } else {
+ *length=3;
}
}
/* is this code point assigned, or do we use fallbacks? */
if( (stage2Entry&(1<<(16+(c&0xf))))!=0 ||
- (FROM_U_USE_FALLBACK(useFallback, c) && (myValue!=0 || c==0))
+ (FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0)
) {
/*
- * We allow a 0 byte output if the Unicode code point is
- * U+0000 and also if the "assigned" bit is set for this entry.
+ * We allow a 0 byte output if the "assigned" bit is set for this entry.
* There is no way with this data structure for fallback output
- * for other than U+0000 to be a zero byte.
+ * to be a zero byte.
*/
/* assigned */
*value=myValue;
- } else {
- *length=0;
+ return;
}
- }else{
- *length=0;
}
+
+ cx=sharedData->mbcs.extIndexes;
+ if(cx!=NULL) {
+ *length=ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
+ return;
+ }
+
+ /* unassigned */
+ *length=0;
}
/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
@@ -927,13 +1031,14 @@ MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
const uint16_t *table;
int32_t value;
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
- if(c>=0x10000 && !(sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
- value= -1;
+ if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
+ *retval=(uint16_t)-1;
+ return;
}
/* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
- table=sharedData->table->mbcs.fromUnicodeTable;
+ table=sharedData->mbcs.fromUnicodeTable;
/* get the byte for the output */
- value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->table->mbcs.fromUnicodeBytes, c);
+ value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
/* is this code point assigned, or do we use fallbacks? */
if(useFallback ? value>=0x800 : value>=0xc00) {
value &=0xff;
@@ -943,473 +1048,143 @@ MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
*retval=(uint16_t) value;
}
+#ifdef U_ENABLE_GENERIC_ISO_2022
+
/**********************************************************************************
* ISO-2022 Converter
*
*
*/
-static UChar32
-T_UConverter_getNextUChar_ISO_2022(UConverterToUnicodeArgs* args,
- UErrorCode* err){
- const char* mySourceLimit;
- int plane=0; /*dummy variable*/
- UConverterDataISO2022* myData =((UConverterDataISO2022*)(args->converter->extraInfo));
- /*Arguments Check*/
- if (args->sourceLimit < args->source){
- *err = U_ILLEGAL_ARGUMENT_ERROR;
- return 0xffff;
- }
-
- while(1){
-
- mySourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, TRUE);
- /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
- if (args->converter->mode == UCNV_SO && mySourceLimit!=args->source)
- /*Already doing some conversion*/{
-
- return ucnv_getNextUChar(myData->currentConverter,
- &(args->source),
- mySourceLimit,
- err);
- }
- /*-Done with buffer with entire buffer
- *-Error while converting
- */
- changeState_2022(args->converter,
- &(args->source),
- args->sourceLimit,
- TRUE,
- ISO_2022,
- &plane,
- err);
- if(args->source >= args->sourceLimit){
- *err = U_INDEX_OUTOFBOUNDS_ERROR;
- break;
- }
- }
-
- if( (args->source == args->sourceLimit) && args->flush){
- _ISO2022Reset(args->converter,UCNV_RESET_TO_UNICODE);
- }
- return 0xffff;
-}
-
-static void
-T_UConverter_toUnicode_ISO_2022(UConverterToUnicodeArgs *args,
- UErrorCode* err){
-
- const char *mySourceLimit;
- char const* sourceStart;
- UConverter *saveThis;
- int plane =0; /*dummy variable*/
- UConverterDataISO2022* myData;
-
- if ((args->converter == NULL) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)){
- *err = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
- myData= ((UConverterDataISO2022*)(args->converter->extraInfo));
- while (args->source < args->sourceLimit) {
-
- /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
- mySourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
-
- if (args->converter->mode == UCNV_SO) /*Already doing some conversion*/{
-
- saveThis = args->converter;
- args->offsets = NULL;
- args->converter = myData->currentConverter;
- ucnv_toUnicode(args->converter,
- &args->target,
- args->targetLimit,
- &args->source,
- mySourceLimit,
- args->offsets,
- args->flush,
- err);
- args->converter = saveThis;
- myData->isFirstBuffer = FALSE;
- }
- if((myData->isFirstBuffer) && (args->source[0]!=(char)ESC_2022)
- && (myData->currentConverter==NULL)){
-
-
- saveThis = args->converter;
- args->offsets = NULL;
- myData->currentConverter = ucnv_open("ASCII",err);
-
- if(U_FAILURE(*err)){
- break;
- }
-
- args->converter = myData->currentConverter;
- ucnv_toUnicode(args->converter,
- &args->target,
- args->targetLimit,
- &args->source,
- mySourceLimit,
- args->offsets,
- args->flush,
- err);
- args->converter = saveThis;
- args->converter->mode = UCNV_SO;
- myData->isFirstBuffer=FALSE;
-
- }
-
- /*-Done with buffer with entire buffer
- -Error while converting
- */
-
- if (U_FAILURE(*err) || (args->source == args->sourceLimit))
- return;
-
- sourceStart = args->source;
- changeState_2022(args->converter,
- &(args->source),
- args->sourceLimit,
- TRUE,
- ISO_2022,
- &plane,
- err);
- /* args->source = sourceStart; */
-
-
- }
-
- myData->isFirstBuffer=FALSE;
- if( (args->source == args->sourceLimit) && args->flush){
- _ISO2022Reset(args->converter,UCNV_RESET_FROM_UNICODE);
- }
-
-}
-
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
UErrorCode* err){
-
- int32_t myOffset=0;
- int32_t base = 0;
- const char* mySourceLimit;
- char const* sourceStart;
+ const char* mySourceLimit, *realSourceLimit;
+ const char* sourceStart;
+ const UChar* myTargetStart;
UConverter* saveThis;
- int plane =0;/*dummy variable*/
UConverterDataISO2022* myData;
+ int8_t length;
+
+ saveThis = args->converter;
+ myData=((UConverterDataISO2022*)(saveThis->extraInfo));
+
+ realSourceLimit = args->sourceLimit;
+ while (args->source < realSourceLimit) {
+ if(myData->key == 0) { /* are we in the middle of an escape sequence? */
+ /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
+ mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
+
+ if(args->source < mySourceLimit) {
+ if(myData->currentConverter==NULL) {
+ myData->currentConverter = ucnv_open("ASCII",err);
+ if(U_FAILURE(*err)){
+ return;
+ }
- if ((args->converter == NULL) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)){
- *err = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
-
- myData=((UConverterDataISO2022*)(args->converter->extraInfo));
-
- while (args->source < args->sourceLimit) {
- mySourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
- /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
-
- if (args->converter->mode == UCNV_SO) /*Already doing some conversion*/{
- const UChar* myTargetStart = args->target;
-
- saveThis = args->converter;
- args->converter = myData->currentConverter;
- ucnv_toUnicode(args->converter,
- &(args->target),
- args->targetLimit,
- &(args->source),
- mySourceLimit,
- args->offsets,
- args->flush,
- err);
-
- myData->isFirstBuffer = FALSE;
-
- args->converter = saveThis;
- {
- int32_t lim = args->target - myTargetStart;
- int32_t i = 0;
- for (i=base; i < lim;i++){
- args->offsets[i] += myOffset;
+ myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
+ saveThis->mode = UCNV_SO;
}
- base += lim;
- }
- }
- if(myData->isFirstBuffer && args->source[0]!=ESC_2022
- && (myData->currentConverter==NULL)){
-
- const UChar* myTargetStart = args->target;
- saveThis = args->converter;
- args->offsets = NULL;
- myData->currentConverter = ucnv_open("ASCII",err);
-
- if(U_FAILURE(*err)){
- break;
- }
+ /* convert to before the ESC or until the end of the buffer */
+ myData->isFirstBuffer=FALSE;
+ sourceStart = args->source;
+ myTargetStart = args->target;
+ args->converter = myData->currentConverter;
+ ucnv_toUnicode(args->converter,
+ &args->target,
+ args->targetLimit,
+ &args->source,
+ mySourceLimit,
+ args->offsets,
+ (UBool)(args->flush && mySourceLimit == realSourceLimit),
+ err);
+ args->converter = saveThis;
+
+ if (*err == U_BUFFER_OVERFLOW_ERROR) {
+ /* move the overflow buffer */
+ length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
+ myData->currentConverter->UCharErrorBufferLength = 0;
+ if(length > 0) {
+ uprv_memcpy(saveThis->UCharErrorBuffer,
+ myData->currentConverter->UCharErrorBuffer,
+ length*U_SIZEOF_UCHAR);
+ }
+ return;
+ }
- args->converter = myData->currentConverter;
- ucnv_toUnicode(args->converter,
- &args->target,
- args->targetLimit,
- &args->source,
- mySourceLimit,
- args->offsets,
- args->flush,
- err);
- args->converter = saveThis;
- args->converter->mode = UCNV_SO;
- myData->isFirstBuffer=FALSE;
-/* args->converter = saveThis;*/
- {
- int32_t lim = args->target - myTargetStart;
- int32_t i = 0;
- for (i=base; i < lim;i++){
- args->offsets[i] += myOffset;
+ /*
+ * At least one of:
+ * -Error while converting
+ * -Done with entire buffer
+ * -Need to write offsets or update the current offset
+ * (leave that up to the code in ucnv.c)
+ *
+ * or else we just stopped at an ESC byte and continue with changeState_2022()
+ */
+ if (U_FAILURE(*err) ||
+ (args->source == realSourceLimit) ||
+ (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
+ (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
+ ) {
+ /* copy partial or error input for truncated detection and error handling */
+ if(U_FAILURE(*err)) {
+ length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
+ if(length > 0) {
+ uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
+ }
+ } else {
+ length = saveThis->toULength = myData->currentConverter->toULength;
+ if(length > 0) {
+ uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
+ if(args->source < mySourceLimit) {
+ *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
+ }
+ }
+ }
+ return;
}
- base += lim;
}
}
- /*-Done with buffer with entire buffer
- -Error while converting
- */
-
- if (U_FAILURE(*err) || (args->source == args->sourceLimit))
- return;
sourceStart = args->source;
changeState_2022(args->converter,
&(args->source),
- args->sourceLimit,
- TRUE,
+ realSourceLimit,
ISO_2022,
- &plane,
err);
- myOffset += args->source - sourceStart;
-
- }
- if( (args->source == args->sourceLimit) && args->flush){
- _ISO2022Reset(args->converter,UCNV_RESET_TO_UNICODE);
- }
-}
-
-static UCNV_TableStates_2022
-getKey_2022(char c,int32_t* key,int32_t* offset){
- int32_t togo = *key;
- int32_t low = 0;
- int32_t hi = MAX_STATES_2022;
- int32_t oldmid=0;
-
- if (*key == 0){
- togo = (int8_t)normalize_esq_chars_2022[(int)c];
- }
- else{
- togo <<= 5;
- togo += (int8_t)normalize_esq_chars_2022[(int)c];
- }
-
- while (hi != low) /*binary search*/{
-
- register int32_t mid = (hi+low) >> 1; /*Finds median*/
-
- if (mid == oldmid)
- break;
-
- if (escSeqStateTable_Key_2022[mid] > togo){
- hi = mid;
- }
- else if (escSeqStateTable_Key_2022[mid] < togo){
- low = mid;
- }
- else /*we found it*/{
- *key = togo;
- *offset = mid;
- return escSeqStateTable_Value_2022[mid];
+ if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
+ /* let the ucnv.c code update its current offset */
+ return;
}
- oldmid = mid;
-
}
-
- *key = 0;
- *offset = 0;
- return INVALID_2022;
-}
-
-
-
-/*Checks the characters of the buffer against valid 2022 escape sequences
-*if the match we return a pointer to the initial start of the sequence otherwise
-*we return sourceLimit
-*/
-static const char*
-getEndOfBuffer_2022(const char** source,
- const char* sourceLimit,
- UBool flush){
-
- const char* mySource = *source;
-
- if (*source >= sourceLimit)
- return sourceLimit;
-
- do{
-
- if (*mySource == ESC_2022){
- int8_t i;
- int32_t key = 0;
- int32_t offset;
- UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
-
- /* Kludge: I could not
- * figure out the reason for validating an escape sequence
- * twice - once here and once in changeState_2022().
- * is it possible to have an ESC character in a ISO2022
- * byte stream which is valid in a code page? Is it legal?
- */
- for (i=0;
- (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
- i++) {
- value = getKey_2022(*(mySource+i), &key, &offset);
- }
- if (value > 0 || *mySource==ESC_2022)
- return mySource;
-
- if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
- return sourceLimit;
- }
- }while (++mySource < sourceLimit);
-
- return sourceLimit;
}
-/*
- * From Unicode Callback helper function
- */
-static void
-fromUnicodeCallback(UConverterFromUnicodeArgs* args,const UChar32 sourceChar,const UChar** pSource,
- unsigned char** pTarget,int32_t** pOffsets,UConverterCallbackReason reason, UErrorCode* err){
-
- /*variables for callback */
- const UChar* saveSource =NULL;
- char* saveTarget =NULL;
- int32_t* saveOffsets =NULL;
- int currentOffset =0;
- int saveIndex =0;
- int32_t* offsets = *pOffsets;
- const UChar* source = *pSource;
- unsigned char* target = *pTarget;
-
- args->converter->invalidUCharLength = 0;
-
- if(sourceChar>0xffff){
- args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(uint16_t)(((sourceChar)>>10)+0xd7c0);
- args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(uint16_t)(((sourceChar)&0x3ff)|0xdc00);
- }
- else{
- args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(UChar)sourceChar;
- }
- if(offsets)
- currentOffset = *(offsets-1)+1;
-
- saveSource = args->source;
- saveTarget = args->target;
- saveOffsets = args->offsets;
- args->target = (char*)target;
- args->source = source;
- args->offsets = offsets;
-
- /*copies current values for the ErrorFunctor to update */
- /*Calls the ErrorFunctor */
- args->converter->fromUCharErrorBehaviour ( args->converter->fromUContext,
- args,
- args->converter->invalidUCharBuffer,
- args->converter->invalidUCharLength,
- (UChar32) (sourceChar),
- reason,
- err);
-
- saveIndex = args->target - (char*)target;
- if(args->offsets){
- args->offsets = saveOffsets;
- while(saveIndex-->0){
- *offsets = currentOffset;
- offsets++;
- }
- }
- target = (unsigned char*)args->target;
- *pTarget=target;
- *pOffsets=offsets;
- args->source=saveSource;
- args->target=saveTarget;
- args->offsets=saveOffsets;
- args->converter->fromUSurrogateLead=0x00;
-
-}
+#endif
/*
* To Unicode Callback helper function
*/
static void
-toUnicodeCallback(UConverterToUnicodeArgs* args, const uint32_t sourceChar,const char** pSource,
- const uint32_t targetUniChar,UChar** pTarget,UErrorCode* err){
-
- const char *saveSource = args->source;
- UChar *saveTarget = args->target;
- const char* source = *pSource;
- UChar* target = *pTarget;
- int32_t *saveOffsets = NULL;
- UConverterCallbackReason reason;
- int32_t currentOffset;
- int32_t saveIndex = target - args->target;
-
- args->converter->invalidCharLength=0;
-
+toUnicodeCallback(UConverter *cnv,
+ const uint32_t sourceChar, const uint32_t targetUniChar,
+ UErrorCode* err){
if(sourceChar>0xff){
- currentOffset= source - args->source - 2;
- args->converter->invalidCharBuffer[args->converter->invalidCharLength++] = (char)(sourceChar>>8);
- args->converter->invalidCharBuffer[args->converter->invalidCharLength++] = (char)sourceChar;
+ cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
+ cnv->toUBytes[1] = (uint8_t)sourceChar;
+ cnv->toULength = 2;
}
else{
-
- currentOffset= source - args->source -1;
- args->converter->invalidCharBuffer[args->converter->invalidCharLength++] =(char) sourceChar;
+ cnv->toUBytes[0] =(char) sourceChar;
+ cnv->toULength = 1; /* single-byte input: only toUBytes[0] was stored, so report exactly one byte */
}
if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
- reason = UCNV_UNASSIGNED;
*err = U_INVALID_CHAR_FOUND;
}
else{
- reason = UCNV_ILLEGAL;
*err = U_ILLEGAL_CHAR_FOUND;
}
-
- if(args->offsets){
- saveOffsets=args->offsets;
- args->offsets = args->offsets+(target - args->target);
- }
-
- args->target =target;
- target =saveTarget;
- args->source = source;
-
- args->converter->fromCharErrorBehaviour (
- args->converter->toUContext,
- args,
- args->converter->invalidCharBuffer,
- args->converter->invalidCharLength,
- reason,
- err);
-
- if(args->offsets){
- args->offsets = saveOffsets;
-
- for (;saveIndex < (args->target - target);saveIndex++) {
- args->offsets[saveIndex] += currentOffset;
- }
- }
- target=args->target;
- *pTarget=target;
- args->source = saveSource;
- args->target = saveTarget;
}
/**************************************ISO-2022-JP*************************************************/
@@ -1453,28 +1228,21 @@ toUnicodeCallback(UConverterToUnicodeArgs* args, const uint32_t sourceChar,const
* ISO-8859-1 : Algorithmic implemented as LATIN1 case
* ISO-8859-7 : alisas to ibm-9409 mapping table
*/
-#define MAX_VALID_CP_JP 9
-static const Cnv2022Type myConverterType[MAX_VALID_CP_JP]={
- ASCII1,
- LATIN1,
- SBCS,
- SBCS,
- DBCS,
- DBCS,
- DBCS,
- DBCS,
- SBCS,
+/* preference order of JP charsets */
+static const StateEnum jpCharsetPref[]={
+ ASCII,
+ JISX201,
+ ISO8859_1,
+ ISO8859_7,
+ JISX208,
+ JISX212,
+ GB2312,
+ KSC5601,
+ HWKANA_7BIT
};
-static const StateEnum nextStateArray[5][MAX_VALID_CP_JP]= {
- {JISX201 ,INVALID_STATE,INVALID_STATE,JISX208,ASCII,INVALID_STATE,INVALID_STATE,INVALID_STATE,INVALID_STATE},
- {JISX201,INVALID_STATE,INVALID_STATE,JISX208,JISX212,ASCII,INVALID_STATE,INVALID_STATE,INVALID_STATE},
- {ISO8859_1,ISO8859_7,JISX201,JISX208,JISX212,GB2312,KSC5601,ASCII,INVALID_STATE},
- {JISX201,INVALID_STATE,INVALID_STATE,JISX208,JISX212,HWKANA_7BIT,INVALID_STATE,INVALID_STATE,ASCII},
- {JISX201,INVALID_STATE,INVALID_STATE,JISX208,JISX212,ASCII,INVALID_STATE,INVALID_STATE,INVALID_STATE},
-};
-static const char escSeqChars[MAX_VALID_CP_JP][6] ={
+static const char escSeqChars[][6] ={
"\x1B\x28\x42", /* (B ASCII */
"\x1B\x2E\x41", /* .A ISO-8859-1 */
"\x1B\x2E\x46", /* .F ISO-8859-7 */
@@ -1486,8 +1254,8 @@ static const char escSeqChars[MAX_VALID_CP_JP][6] ={
"\x1B\x28\x49" /* (I HWKANA_7BIT */
};
-static const int32_t escSeqCharsLen[MAX_VALID_CP_JP] ={
- 3, /* length of (B ASCII */
+static const int32_t escSeqCharsLen[] ={
+ 3, /* length of (B ASCII */
3, /* length of .A ISO-8859-1 */
3, /* length of .F ISO-8859-7 */
3, /* length of (J JISX-201 */
@@ -1516,237 +1284,275 @@ static const int32_t escSeqCharsLen[MAX_VALID_CP_JP] ={
*/
static void
-UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
-
+UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
UConverterDataISO2022 *converterData;
- unsigned char* target = (unsigned char*) args->target;
- const unsigned char* targetLimit = (const unsigned char*) args->targetLimit;
+ ISO2022State *pFromU2022State;
+ uint8_t *target = (uint8_t *) args->target;
+ const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
const UChar* source = args->source;
const UChar* sourceLimit = args->sourceLimit;
int32_t* offsets = args->offsets;
- int32_t offset = 0;
- uint32_t targetByteUnit = missingCharMarker;
- UChar32 sourceChar =0x0000;
- const char* escSeq = NULL;
- int len =0; /*length of escSeq chars*/
- UConverterCallbackReason reason;
- UConverterSharedData* sharedData=NULL;
- UBool useFallback;
-
- /* state variables*/
- StateEnum* currentState;
- StateEnum initIterState;
- UConverter** currentConverter;
- Cnv2022Type* currentType;
- UConverter** convArray;
-
- /* arguments check*/
- if ((args->converter == NULL) || (targetLimit < target) || (sourceLimit < source)){
- *err = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
+ UChar32 sourceChar;
+ char buffer[8];
+ int32_t len, outLen;
+ int8_t choices[10];
+ int32_t choiceCount;
+ uint32_t targetValue;
+ UBool useFallback;
+
+ int32_t i;
+ int8_t cs, g;
+
+ /* set up the state */
+ converterData = (UConverterDataISO2022*)args->converter->extraInfo;
+ pFromU2022State = &converterData->fromU2022State;
+ useFallback = args->converter->useFallback;
+
+ choiceCount = 0;
- /* Initialize */
- converterData = (UConverterDataISO2022*)args->converter->extraInfo;
- useFallback = args->converter->useFallback;
- currentState = &converterData->fromUnicodeCurrentState;
- initIterState = ASCII;
- currentConverter = &converterData->fromUnicodeConverter;
- convArray = converterData->myConverterArray;
- initIterState = *currentState;
- currentType = &converterData->currentType;
-
/* check if the last codepoint of previous buffer was a lead surrogate*/
- if(args->converter->fromUSurrogateLead!=0 && target< targetLimit) {
+ if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
goto getTrail;
}
-
- *currentConverter = convArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
- sharedData= (*currentConverter)->sharedData;
-
- while( source < sourceLimit){
- targetByteUnit = missingCharMarker;
+ while(source < sourceLimit) {
+ if(target < targetLimit) {
- if(target < targetLimit){
sourceChar = *(source++);
- if(sourceChar > SPACE) {
- do{
- switch (*currentType){
- /* most common case*/
- case DBCS:
- {
- uint32_t value=0;
- int length=0;
- /*if(2 == _MBCSFromUChar32(sharedData,sourceChar, &value, useFallback)) {
- targetByteUnit = (uint16_t)value;
- }*/
- MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&value,useFallback,&length,MBCS_OUTPUT_2);
- if(length==2){
- targetByteUnit = value;
- }
- }
- break;
- case ASCII1:
- if(sourceChar < 0x7f){
- targetByteUnit = sourceChar;
- }
- break;
-
- case SBCS:
- MBCS_SINGLE_FROM_UCHAR32(sharedData,sourceChar,&targetByteUnit,useFallback);
- /*targetByteUnit=(uint16_t)_MBCSSingleFromUChar32(sharedData,sourceChar,useFallback);*/
- /*
- * If mySourceChar is unassigned, then _MBCSSingleFromUChar32() returns -1
- * which becomes the same as missingCharMarker with the cast to uint16_t.
- */
- /* Check if the sourceChar is in the HW Kana range*/
- if(0xFF9F-sourceChar<=(0xFF9F-0xFF61)){
- if( converterData->version==3){
- /*we get a1-df from _MBCSSingleFromUChar32 so subtract 0x80*/
- targetByteUnit-=0x80;
- *currentState = HWKANA_7BIT;
- }
- else if( converterData->version==4){
- *currentState = JISX201;
- }
- else{
- targetByteUnit=missingCharMarker;
- }
- *currentConverter = convArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
- *currentType = (Cnv2022Type) myConverterType[*currentState];
- }
- break;
-
- case LATIN1:
- if(sourceChar <= 0x00FF){
- targetByteUnit = sourceChar;
+ /*check if the char is a First surrogate*/
+ if(UTF_IS_SURROGATE(sourceChar)) {
+ if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
+getTrail:
+ /*look ahead to find the trail surrogate*/
+ if(source < sourceLimit) {
+ /* test the following code unit */
+ UChar trail=(UChar) *source;
+ if(UTF_IS_SECOND_SURROGATE(trail)) {
+ source++;
+ sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
+ args->converter->fromUChar32=0x00;
+ /* convert this supplementary code point */
+ /* exit this condition tree */
+ } else {
+ /* this is an unmatched lead code unit (1st surrogate) */
+ /* callback(illegal) */
+ *err=U_ILLEGAL_CHAR_FOUND;
+ args->converter->fromUChar32=sourceChar;
+ break;
}
-
- break;
- default:
- /*not expected */
+ } else {
+ /* no more input */
+ args->converter->fromUChar32=sourceChar;
break;
}
- if(targetByteUnit==missingCharMarker){
- *currentState = nextStateArray[converterData->version][*currentState];
- *currentConverter = convArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
- *currentType = (Cnv2022Type) myConverterType[*currentState];
- sharedData= (*currentConverter)->sharedData;
- }
- else
- /*got the mapping so break from while loop*/
- break;
-
- }while(initIterState != *currentState);
-
- }
- else{
- targetByteUnit = sourceChar;
- *currentState = ASCII;
- *currentType = (Cnv2022Type) myConverterType[*currentState];
+ } else {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ *err=U_ILLEGAL_CHAR_FOUND;
+ args->converter->fromUChar32=sourceChar;
+ break;
+ }
}
- if(targetByteUnit != missingCharMarker){
+ /* do the conversion */
- if( *currentState != initIterState){
+ if(choiceCount == 0) {
+ uint16_t csm;
- escSeq = escSeqChars[(int)*currentState];
- len = escSeqCharsLen[(int)*currentState];
+ /*
+ * The csm variable keeps track of which charsets are allowed
+ * and not used yet while building the choices[].
+ */
+ csm = jpCharsetMasks[converterData->version];
+ choiceCount = 0;
+
+ /* JIS7/8: try single-byte half-width Katakana before JISX208 */
+ if(converterData->version == 3 || converterData->version == 4) {
+ choices[choiceCount++] = cs = (int8_t)HWKANA_7BIT;
+ csm &= ~CSM(cs);
+ }
- CONCAT_ESCAPE_EX(args,source, &target,targetLimit, &offsets, escSeq,len,err);
+ /* try the current G0 charset */
+ choices[choiceCount++] = cs = pFromU2022State->cs[0];
+ csm &= ~CSM(cs);
- /* Append SSN for shifting to G2 */
- if(*currentState==ISO8859_1 || *currentState==ISO8859_7){
- escSeq = UCNV_SS2;
- len = UCNV_SS2_LEN;
- CONCAT_ESCAPE_EX(args, source, &target, targetLimit,&offsets, escSeq,len,err);
+ /* try the current G2 charset */
+ if((cs = pFromU2022State->cs[2]) != 0) {
+ choices[choiceCount++] = cs;
+ csm &= ~CSM(cs);
+ }
+
+ /* try all the other possible charsets */
+ for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
+ cs = (int8_t)jpCharsetPref[i];
+ if(CSM(cs) & csm) {
+ choices[choiceCount++] = cs;
+ csm &= ~CSM(cs);
}
}
- initIterState = *currentState;
- offset = source - args->source -1;
- /* write the targetByteUnit to target */
- if(targetByteUnit <= 0x00FF){
- if( target converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) targetByteUnit;
- *err = U_BUFFER_OVERFLOW_ERROR;
+ cs = g = 0;
+ len = 0;
+
+ for(i = 0; i < choiceCount && len == 0; ++i) {
+ cs = choices[i];
+ switch(cs) {
+ case ASCII:
+ if(sourceChar <= 0x7f) {
+ targetValue = (uint32_t)sourceChar;
+ len = 1;
}
- }else{
- if(target < targetLimit){
- *(target++) =(unsigned char) (targetByteUnit>>8);
- if(offsets){
- *(offsets++) = offset;
- }
- if(target < targetLimit){
- *(target++) =(unsigned char) (targetByteUnit);
- if(offsets){
- *(offsets++) = offset;
+ break;
+ case ISO8859_1:
+ if(0x80 <= sourceChar && sourceChar <= 0xff) {
+ targetValue = (uint32_t)sourceChar - 0x80;
+ len = 1;
+ g = 2;
+ }
+ break;
+ case HWKANA_7BIT:
+ if((uint32_t)(0xff9f-sourceChar)<=(0xff9f-0xff61)) {
+ targetValue = (uint32_t)(sourceChar - (0xff61 - 0x21));
+ len = 1;
+
+ if(converterData->version==3) {
+ /* JIS7: use G1 (SO) */
+ pFromU2022State->cs[1] = cs; /* do not output an escape sequence */
+ g = 1;
+ } else if(converterData->version==4) {
+ /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
+ int8_t cs0;
+
+ targetValue += 0x80;
+
+ cs0 = pFromU2022State->cs[0];
+ if(IS_JP_DBCS(cs0)) {
+ /* switch from a DBCS charset to JISX201 */
+ cs = (int8_t)JISX201;
+ } else {
+ /* stay in the current G0 charset */
+ cs = cs0;
}
-
- }else{
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
- *err = U_BUFFER_OVERFLOW_ERROR;
}
- }else{
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit>>8);
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
- *err = U_BUFFER_OVERFLOW_ERROR;
}
+ break;
+ case JISX201:
+ /* G0 SBCS */
+ MBCS_SINGLE_FROM_UCHAR32(
+ converterData->myConverterArray[cs],
+ sourceChar, &targetValue,
+ useFallback);
+ if(targetValue <= 0x7f) {
+ len = 1;
+ }
+ break;
+ case ISO8859_7:
+ /* G0 SBCS forced to 7-bit output */
+ MBCS_SINGLE_FROM_UCHAR32(
+ converterData->myConverterArray[cs],
+ sourceChar, &targetValue,
+ useFallback);
+ if(0x80 <= targetValue && targetValue <= 0xff) {
+ targetValue -= 0x80;
+ len = 1;
+ g = 2;
+ }
+ break;
+ default:
+ /* G0 DBCS */
+ MBCS_FROM_UCHAR32_ISO2022(
+ converterData->myConverterArray[cs],
+ sourceChar, &targetValue,
+ useFallback, &len, MBCS_OUTPUT_2);
+ if(len != 2) {
+ len = 0;
+ }
+ break;
}
}
- else{
- /* if we cannot find the character after checking all codepages
+ if(len > 0) {
+ outLen = 0; /* count output bytes */
+
+ /* write SI if necessary (only for JIS7) */
+ if(pFromU2022State->g == 1 && g == 0) {
+ buffer[outLen++] = UCNV_SI;
+ pFromU2022State->g = 0;
+ }
+
+ /* write the designation sequence if necessary */
+ if(cs != pFromU2022State->cs[g]) {
+ int32_t escLen = escSeqCharsLen[cs];
+ uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
+ outLen += escLen;
+ pFromU2022State->cs[g] = cs;
+
+ /* invalidate the choices[] */
+ choiceCount = 0;
+ }
+
+ /* write the shift sequence if necessary */
+ if(g != pFromU2022State->g) {
+ switch(g) {
+ /* case 0 handled before writing escapes */
+ case 1:
+ buffer[outLen++] = UCNV_SO;
+ pFromU2022State->g = 1;
+ break;
+ default: /* case 2 */
+ buffer[outLen++] = 0x1b;
+ buffer[outLen++] = 0x4e;
+ break;
+ /* no case 3: no SS3 in ISO-2022-JP-x */
+ }
+ }
+
+ /* write the output bytes */
+ if(len == 1) {
+ buffer[outLen++] = (char)targetValue;
+ } else /* len == 2 */ {
+ buffer[outLen++] = (char)(targetValue >> 8);
+ buffer[outLen++] = (char)targetValue;
+ }
+ } else {
+ /*
+ * if we cannot find the character after checking all codepages
* then this is an error
*/
- reason = UCNV_UNASSIGNED;
*err = U_INVALID_CHAR_FOUND;
-
- /*check if the char is a First surrogate*/
- if(UTF_IS_SURROGATE(sourceChar)) {
- if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
- args->converter->fromUSurrogateLead=(UChar)sourceChar;
-getTrail:
- /*look ahead to find the trail surrogate*/
- if(source < sourceLimit) {
- /* test the following code unit */
- UChar trail=(UChar) *source;
- if(UTF_IS_SECOND_SURROGATE(trail)) {
- source++;
- sourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail);
- args->converter->fromUSurrogateLead=0x00;
- reason =UCNV_UNASSIGNED;
- *err = U_INVALID_CHAR_FOUND;
- /* convert this surrogate code point */
- /* exit this condition tree */
- } else {
- /* this is an unmatched lead code unit (1st surrogate) */
- /* callback(illegal) */
- reason=UCNV_ILLEGAL;
- *err=U_ILLEGAL_CHAR_FOUND;
- }
- } else {
- /* no more input */
- *err = U_ZERO_ERROR;
- break;
- }
- } else {
- /* this is an unmatched trail code unit (2nd surrogate) */
- /* callback(illegal) */
- reason=UCNV_ILLEGAL;
- *err=U_ILLEGAL_CHAR_FOUND;
- }
+ args->converter->fromUChar32=sourceChar;
+ break;
+ }
+
+ if(sourceChar == CR || sourceChar == LF) {
+ /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
+ pFromU2022State->cs[2] = 0;
+ choiceCount = 0;
+ }
+
+ /* output outLen>0 bytes in buffer[] */
+ if(outLen == 1) {
+ *target++ = buffer[0];
+ if(offsets) {
+ *offsets++ = source - args->source - 1; /* -1: known to be ASCII */
}
- /* Call the callback function*/
- fromUnicodeCallback(args,sourceChar,&source,&target,&offsets,reason,err);
- initIterState = *currentState;
- if (U_FAILURE (*err)){
+ } else if(outLen == 2 && (target + 2) <= targetLimit) {
+ *target++ = buffer[0];
+ *target++ = buffer[1];
+ if(offsets) {
+ int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
+ *offsets++ = sourceIndex;
+ *offsets++ = sourceIndex;
+ }
+ } else {
+ ucnv_fromUWriteBytes(
+ args->converter,
+ buffer, outLen,
+ (char **)&target, (const char *)targetLimit,
+ &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
+ err);
+ if(U_FAILURE(*err)) {
break;
}
}
@@ -1758,18 +1564,62 @@ getTrail:
}/* end while(mySourceIndexconverter->fromUSurrogateLead !=0 && (source == sourceLimit) && args->flush){
- *err = U_TRUNCATED_CHAR_FOUND;
- }
- /* Reset the state of converter if we consumed
- * the source and flush is true
+ /*
+ * the end of the input stream and detection of truncated input
+ * are handled by the framework, but for ISO-2022-JP conversion
+ * we need to be in ASCII mode at the very end
+ *
+ * conditions:
+ * successful
+ * in SO mode or not in ASCII mode
+ * end of input and no truncated input
*/
- if( (source == sourceLimit) && args->flush){
- setInitialStateFromUnicodeJPCN(args->converter,converterData);
+ if( U_SUCCESS(*err) &&
+ (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
+ args->flush && source>=sourceLimit && args->converter->fromUChar32==0
+ ) {
+ int32_t sourceIndex;
+
+ outLen = 0;
+
+ if(pFromU2022State->g != 0) {
+ buffer[outLen++] = UCNV_SI;
+ pFromU2022State->g = 0;
+ }
+
+ if(pFromU2022State->cs[0] != ASCII) {
+ int32_t escLen = escSeqCharsLen[ASCII];
+ uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
+ outLen += escLen;
+ pFromU2022State->cs[0] = (int8_t)ASCII;
+ }
+
+ /* get the source index of the last input character */
+ /*
+ * TODO this would be simpler and more reliable if we used a pair
+ * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
+ * so that we could simply use the prevSourceIndex here;
+ * this code gives an incorrect result for the rare case of an unmatched
+ * trail surrogate that is alone in the last buffer of the text stream
+ */
+ sourceIndex=(int32_t)(source-args->source);
+ if(sourceIndex>0) {
+ --sourceIndex;
+ if( U16_IS_TRAIL(args->source[sourceIndex]) &&
+ (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
+ ) {
+ --sourceIndex;
+ }
+ } else {
+ sourceIndex=-1;
+ }
+
+ ucnv_fromUWriteBytes(
+ args->converter,
+ buffer, outLen,
+ (char **)&target, (const char *)targetLimit,
+ &offsets, sourceIndex,
+ err);
}
/*save the state and return */
@@ -1779,229 +1629,186 @@ getTrail:
/*************** to unicode *******************/
-/****************************************************************************
- * Recognized escape sequences are
- * (B ASCII
- * .A ISO-8859-1
- * .F ISO-8859-7
- * (J JISX-201
- * (I JISX-201
- * $B JISX-208
- * $@ JISX-208
- * $(D JISX-212
- * $A GB2312
- * $(C KSC5601
- */
-static const StateEnum nextStateToUnicodeJP[5][MAX_STATES_2022]= {
- {
-/* 0 1 2 3 4 5 6 7 8 9 */
- INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,JISX208 ,INVALID_STATE ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- },
- {
-/* 0 1 2 3 4 5 6 7 8 9 */
- INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,JISX208 ,INVALID_STATE ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX212 ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- },
- {
-/* 0 1 2 3 4 5 6 7 8 9 */
- INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- },
- {
-/* 0 1 2 3 4 5 6 7 8 9 */
- INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- },
- {
-/* 0 1 2 3 4 5 6 7 8 9 */
- INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- }
-};
-
static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
- UErrorCode* err){
- char tempBuf[2];
- const char *mySource = ( char *) args->source;
+ UErrorCode* err){
+ char tempBuf[3];
+ const char *mySource = (char *) args->source;
UChar *myTarget = args->target;
const char *mySourceLimit = args->sourceLimit;
uint32_t targetUniChar = 0x0000;
uint32_t mySourceChar = 0x0000;
UConverterDataISO2022* myData;
- StateEnum* currentState;
- uint32_t* toUnicodeStatus;
- int plane = 0; /*dummy variable*/
+ ISO2022State *pToU2022State;
+ StateEnum cs;
- if ((args->converter == NULL) || (myTarget < args->target) || (mySource < args->source)){
- *err = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
myData=(UConverterDataISO2022*)(args->converter->extraInfo);
- currentState = &myData->toUnicodeCurrentState;
- toUnicodeStatus = &args->converter->toUnicodeStatus;
- while(mySource< args->sourceLimit){
+ pToU2022State = &myData->toU2022State;
- targetUniChar = missingCharMarker;
+ if(myData->key != 0) {
+ /* continue with a partial escape sequence */
+ goto escape;
+ } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
+ /* continue with a partial double-byte character */
+ mySourceChar = args->converter->toUBytes[0];
+ args->converter->toULength = 0;
+ cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
+ goto getTrailByte;
+ }
+
+ while(mySource < mySourceLimit){
+
+ targetUniChar =missingCharMarker;
if(myTarget < args->targetLimit){
mySourceChar= (unsigned char) *mySource++;
-
- /* Consume the escape sequences and ascertain the state */
- if(mySourceChar==UCNV_SI){
- if(myData->version==3 && *toUnicodeStatus==0x00){
- if(myData->toUnicodeSaveState!=INVALID_STATE){
- *currentState = (StateEnum) myData->toUnicodeSaveState;
- continue;
- }
- else{
- *err =U_ILLEGAL_CHAR_FOUND;
- goto CALLBACK;
- }
-
- }
- else{
- goto CALLBACK;
- }
- }else if(mySourceChar==UCNV_SO){
- if(myData->version==3 && *toUnicodeStatus==0x00){
- myData->toUnicodeSaveState= (int) *currentState;
- *currentState = HWKANA_7BIT;
+
+ switch(mySourceChar) {
+ case UCNV_SI:
+ if(myData->version==3) {
+ pToU2022State->g=0;
continue;
+ } else {
+ /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
+ break;
}
- else{
- goto CALLBACK;
- }
- }else if(mySourceChar==ESC_2022 || myData->key!=0){
- if(*toUnicodeStatus== 0x00){
- mySource--;
- changeState_2022(args->converter,&(mySource),
- args->sourceLimit, args->flush,ISO_2022_JP,&plane, err);
- /*Invalid or illegal escape sequence */
- if(U_SUCCESS(*err)){
- continue;
-
- }
- else{
- args->target = myTarget;
- args->source = mySource;
- return;
- }
- }
- else{
- goto CALLBACK;
- }
- }
- switch(myConverterType[*currentState]){
- case DBCS:
- if(*toUnicodeStatus== 0x00){
- *toUnicodeStatus= (UChar) mySourceChar;
+ case UCNV_SO:
+ if(myData->version==3) {
+ /* JIS7: switch to G1 half-width Katakana */
+ pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
+ pToU2022State->g=1;
continue;
+ } else {
+ /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
+ break;
}
- else{
- const char *pBuf;
-
- tempBuf[0] = (char) args->converter->toUnicodeStatus;
- tempBuf[1] = (char) mySourceChar;
- mySourceChar+= (args->converter->toUnicodeStatus)<<8;
- *toUnicodeStatus= 0;
- pBuf = tempBuf;
- targetUniChar = _MBCSSimpleGetNextUChar(myData->currentConverter->sharedData, &pBuf, tempBuf+2, args->converter->useFallback);
- }
- break;
+ case ESC_2022:
+ mySource--;
+escape:
+ changeState_2022(args->converter,&(mySource),
+ mySourceLimit, ISO_2022_JP,err);
- case ASCII1:
- if( mySourceChar < 0x7F){
- targetUniChar = (UChar) mySourceChar;
- }
- else if((uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4) {
- /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
- targetUniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myData->myConverterArray[JISX201]->sharedData, mySourceChar);
+ /* invalid or illegal escape sequence */
+ if(U_FAILURE(*err)){
+ args->target = myTarget;
+ args->source = mySource;
+ return;
}
+ continue;
- break;
+ /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
- case SBCS:
- if((uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4) {
- /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
- targetUniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myData->myConverterArray[JISX201]->sharedData, mySourceChar);
- }
- else if(*currentState==HWKANA_7BIT){
- targetUniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myData->myConverterArray[JISX201]->sharedData, mySourceChar+0x80);
- }
- else {
- targetUniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myData->currentConverter->sharedData, mySourceChar);
+ case CR:
+ /*falls through*/
+ case LF:
+ /* automatically reset to single-byte mode */
+ if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
+ pToU2022State->cs[0] = (int8_t)ASCII;
}
-
- break;
-
- case LATIN1:
-
- targetUniChar = (UChar) mySourceChar;
- break;
-
- case INVALID_STATE:
- *err = U_ILLEGAL_ESCAPE_SEQUENCE;
- args->target = myTarget;
- args->source = mySource;
- return;
-
+ pToU2022State->cs[2] = 0;
+ pToU2022State->g = 0;
+ /* falls through */
default:
- /* For non-valid state MBCS and others */
+ /* convert one or two bytes */
+ cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
+ if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
+ !IS_JP_DBCS(cs)
+ ) {
+ /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
+ targetUniChar = mySourceChar + (0xff61 - 0xa1);
+
+ /* return from a single-shift state to the previous one */
+ if(pToU2022State->g >= 2) {
+ pToU2022State->g=pToU2022State->prevG;
+ }
+ } else switch(cs) {
+ case ASCII:
+ if(mySourceChar <= 0x7f) {
+ targetUniChar = mySourceChar;
+ }
+ break;
+ case ISO8859_1:
+ if(mySourceChar <= 0x7f) {
+ targetUniChar = mySourceChar + 0x80;
+ }
+ /* return from a single-shift state to the previous one */
+ pToU2022State->g=pToU2022State->prevG;
+ break;
+ case ISO8859_7:
+ if(mySourceChar <= 0x7f) {
+ /* convert mySourceChar+0x80 to use a normal 8-bit table */
+ targetUniChar =
+ _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
+ myData->myConverterArray[cs],
+ mySourceChar + 0x80);
+ }
+ /* return from a single-shift state to the previous one */
+ pToU2022State->g=pToU2022State->prevG;
+ break;
+ case JISX201:
+ if(mySourceChar <= 0x7f) {
+ targetUniChar =
+ _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
+ myData->myConverterArray[cs],
+ mySourceChar);
+ }
+ break;
+ case HWKANA_7BIT:
+ if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
+ /* 7-bit halfwidth Katakana */
+ targetUniChar = mySourceChar + (0xff61 - 0x21);
+ }
+ break;
+ default:
+ /* G0 DBCS */
+ if(mySource < mySourceLimit) {
+ char trailByte;
+getTrailByte:
+ tempBuf[0] = (char) (mySourceChar);
+ tempBuf[1] = trailByte = *mySource++;
+ mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
+ } else {
+ args->converter->toUBytes[0] = (uint8_t)mySourceChar;
+ args->converter->toULength = 1;
+ goto endloop;
+ }
+ }
break;
}
if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
if(args->offsets){
- args->offsets[myTarget - args->target]= mySource - args->source - 2
- +(myConverterType[*currentState] <= SBCS);
-
+ args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
}
*(myTarget++)=(UChar)targetUniChar;
- targetUniChar=missingCharMarker;
}
- else{
-CALLBACK:
+ else if(targetUniChar > missingCharMarker){
+ /* disassemble the surrogate pair and write to output*/
+ targetUniChar-=0x0010000;
+ *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
+ if(args->offsets){
+ args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
+ }
+ ++myTarget;
+ if(myTarget< args->targetLimit){
+ *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
+ if(args->offsets){
+ args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
+ }
+ ++myTarget;
+ }else{
+ args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
+ (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
+ }
+ }
+ else{
/* Call the callback function*/
- toUnicodeCallback(args,mySourceChar,&mySource,targetUniChar,&myTarget,err);
- /*args->offsets = saveOffsets;*/
- if(U_FAILURE(*err))
- break;
-
+ toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
+ break;
}
}
else{
@@ -2009,25 +1816,12 @@ CALLBACK:
break;
}
}
- if((args->flush==TRUE)
- && (mySource == mySourceLimit)
- && ( *toUnicodeStatus!=0x00)){
-
- *err = U_TRUNCATED_CHAR_FOUND;
- *toUnicodeStatus= 0x00;
- }
- /* Reset the state of converter if we consumed
- * the source and flush is true
- */
- if( (mySource == mySourceLimit) && args->flush){
- setInitialStateToUnicodeJPCN(args->converter,myData);
- }
+endloop:
args->target = myTarget;
args->source = mySource;
}
-
/***************************************************************
* Rules for ISO-2022-KR encoding
* i) The KSC5601 designator sequence should appear only once in a file,
@@ -2039,25 +1833,25 @@ CALLBACK:
static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
- UConverter* saveConv = args->converter;
- UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)args->converter->extraInfo;
- args->converter=myConverterData->currentConverter;
- _MBCSFromUnicodeWithOffsets(args,err);
- if(U_FAILURE(*err)){
- if(args->converter->charErrorBufferLength!=0){
- uprv_memcpy(saveConv->charErrorBuffer, args->converter->charErrorBuffer,
- args->converter->charErrorBufferLength);
- saveConv->charErrorBufferLength=args->converter->charErrorBufferLength;
- args->converter->charErrorBufferLength=0;
- }
- if(args->converter->invalidUCharLength!=0){
- uprv_memcpy(saveConv->invalidUCharBuffer, args->converter->invalidUCharBuffer,
- args->converter->invalidUCharLength);
- saveConv->invalidUCharLength=args->converter->invalidUCharLength;
- args->converter->invalidCharLength=0;
- }
- }
- args->converter=saveConv;
+ UConverter* saveConv = args->converter;
+ UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
+ args->converter=myConverterData->currentConverter;
+
+ myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
+ ucnv_MBCSFromUnicodeWithOffsets(args,err);
+ saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
+
+ if(*err == U_BUFFER_OVERFLOW_ERROR) {
+ if(myConverterData->currentConverter->charErrorBufferLength > 0) {
+ uprv_memcpy(
+ saveConv->charErrorBuffer,
+ myConverterData->currentConverter->charErrorBuffer,
+ myConverterData->currentConverter->charErrorBufferLength);
+ }
+ saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
+ myConverterData->currentConverter->charErrorBufferLength = 0;
+ }
+ args->converter=saveConv;
}
static void
@@ -2073,21 +1867,11 @@ UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
UBool isTargetByteDBCS;
UBool oldIsTargetByteDBCS;
UConverterDataISO2022 *converterData;
- UConverterCallbackReason reason;
UConverterSharedData* sharedData;
UBool useFallback;
int32_t length =0;
- if ((args->converter == NULL) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)){
- *err = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
- /* initialize data */
converterData=(UConverterDataISO2022*)args->converter->extraInfo;
- sharedData = converterData->fromUnicodeConverter->sharedData;
- useFallback = args->converter->useFallback;
- isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
- oldIsTargetByteDBCS = isTargetByteDBCS;
/* if the version is 1 then the user is requesting
* conversion with ibm-25546 pass the arguments to
* MBCS converter and return
@@ -2096,9 +1880,15 @@ UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
return;
}
+
+ /* initialize data */
+ sharedData = converterData->currentConverter->sharedData;
+ useFallback = args->converter->useFallback;
+ isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
+ oldIsTargetByteDBCS = isTargetByteDBCS;
isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
- if(args->converter->fromUSurrogateLead!=0 && target converter->fromUChar32)!=0 && target targetLimit){
sourceChar = *source++;
- /* length= _MBCSFromUChar32(converterData->fromUnicodeConverter->sharedData,
+ /* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData,
sourceChar,&targetByteUnit,args->converter->useFallback);*/
- MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,(int*)&length,MBCS_OUTPUT_2);
+ MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,&length,MBCS_OUTPUT_2);
/* only DBCS or SBCS characters are expected*/
- /* DB haracters with high bit set to 1 are expected */
+ /* DB characters with high bit set to 1 are expected */
if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
targetByteUnit=missingCharMarker;
}
@@ -2168,13 +1958,10 @@ UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
/* oops.. the code point is unassingned
* set the error and reason
*/
- reason =UCNV_UNASSIGNED;
- *err =U_INVALID_CHAR_FOUND;
/*check if the char is a First surrogate*/
if(UTF_IS_SURROGATE(sourceChar)) {
if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
- args->converter->fromUSurrogateLead=(UChar)sourceChar;
getTrail:
/*look ahead to find the trail surrogate*/
if(source < sourceLimit) {
@@ -2182,38 +1969,32 @@ getTrail:
UChar trail=(UChar) *source;
if(UTF_IS_SECOND_SURROGATE(trail)) {
source++;
- sourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail);
- args->converter->fromUSurrogateLead=0x00;
+ sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
*err = U_INVALID_CHAR_FOUND;
- reason =UCNV_UNASSIGNED;
/* convert this surrogate code point */
/* exit this condition tree */
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
- reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
}
} else {
/* no more input */
*err = U_ZERO_ERROR;
- break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
- reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
}
+ } else {
+ /* callback(unassigned) for a BMP code point */
+ *err = U_INVALID_CHAR_FOUND;
}
- args->converter->fromUnicodeStatus = (int32_t)isTargetByteDBCS;
- /* Call the callback function*/
- fromUnicodeCallback(args,sourceChar,&source,&target,&offsets,reason,err);
- isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
- if (U_FAILURE (*err)){
- break;
- }
+ args->converter->fromUChar32=sourceChar;
+ args->converter->fromUnicodeStatus = (int32_t)isTargetByteDBCS;
+ break;
}
} /* end if(myTargetIndexconverter->fromUSurrogateLead !=0 && (source == sourceLimit) && args->flush){
- *err = U_TRUNCATED_CHAR_FOUND;
- }
- /* Reset the state of converter if we consumed
- * the source and flush is true
+ /*
+ * the end of the input stream and detection of truncated input
+ * are handled by the framework, but for ISO-2022-KR conversion
+ * we need to be in ASCII mode at the very end
+ *
+ * conditions:
+ * successful
+ * not in ASCII mode
+ * end of input and no truncated input
*/
- if( (source == sourceLimit) && args->flush){
- setInitialStateFromUnicodeKR(args->converter,converterData);
+ if( U_SUCCESS(*err) &&
+ isTargetByteDBCS &&
+ args->flush && source>=sourceLimit && args->converter->fromUChar32==0
+ ) {
+ int32_t sourceIndex;
+
+ /* we are switching to ASCII */
+ isTargetByteDBCS=FALSE;
+
+ /* get the source index of the last input character */
+ /*
+ * TODO this would be simpler and more reliable if we used a pair
+ * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
+ * so that we could simply use the prevSourceIndex here;
+ * this code gives an incorrect result for the rare case of an unmatched
+ * trail surrogate that is alone in the last buffer of the text stream
+ */
+ sourceIndex=(int32_t)(source-args->source);
+ if(sourceIndex>0) {
+ --sourceIndex;
+ if( U16_IS_TRAIL(args->source[sourceIndex]) &&
+ (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
+ ) {
+ --sourceIndex;
+ }
+ } else {
+ sourceIndex=-1;
+ }
+
+ ucnv_fromUWriteBytes(
+ args->converter,
+ SHIFT_IN_STR, 1,
+ (char **)&target, (const char *)targetLimit,
+ &offsets, sourceIndex,
+ err);
}
/*save the state and return */
@@ -2248,113 +2062,153 @@ getTrail:
static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
UErrorCode* err){
- const char* mySourceLimit;
char const* sourceStart;
- UConverter* saveThis;
- int plane =0; /*dummy variable */
UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
- do{
+ UConverterToUnicodeArgs subArgs;
+ int32_t minArgsSize;
+
+ /* set up the subconverter arguments */
+ if(args->size<sizeof(UConverterToUnicodeArgs)) {
+ minArgsSize = args->size;
+ } else {
+ minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
+ }
+
+ uprv_memcpy(&subArgs, args, minArgsSize);
+ subArgs.size = (uint16_t)minArgsSize;
+ subArgs.converter = myData->currentConverter;
+
+ /* remember the original start of the input for offsets */
+ sourceStart = args->source;
+
+ if(myData->key != 0) {
+ /* continue with a partial escape sequence */
+ goto escape;
+ }
+
+ while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
/*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
- mySourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
-
- if (args->converter->mode == UCNV_SO) /*Already doing some conversion*/{
- saveThis = args->converter;
- args->offsets = NULL;
- args->converter = myData->currentConverter;
- _MBCSToUnicodeWithOffsets(args,err);
- if(U_FAILURE(*err)){
- uprv_memcpy(saveThis->invalidUCharBuffer, args->converter->invalidUCharBuffer,
- args->converter->invalidUCharLength);
- saveThis->invalidUCharLength=args->converter->invalidUCharLength;
+ subArgs.source = args->source;
+ subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
+ if(subArgs.source != subArgs.sourceLimit) {
+ /*
+ * get the current partial byte sequence
+ *
+ * it needs to be moved between the public and the subconverter
+ * so that the conversion framework, which only sees the public
+ * converter, can handle truncated and illegal input etc.
+ */
+ if(args->converter->toULength > 0) {
+ uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
+ }
+ subArgs.converter->toULength = args->converter->toULength;
+
+ /*
+ * Convert up to the end of the input, or to before the next escape character.
+ * Does not handle conversion extensions because the preToU[] state etc.
+ * is not copied.
+ */
+ ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
+
+ if(args->offsets != NULL && sourceStart != args->source) {
+ /* update offsets to base them on the actual start of the input */
+ int32_t *offsets = args->offsets;
+ UChar *target = args->target;
+ int32_t delta = (int32_t)(args->source - sourceStart);
+ while(target < subArgs.target) {
+ if(*offsets >= 0) {
+ *offsets += delta;
+ }
+ ++offsets;
+ ++target;
+ }
+ }
+ args->source = subArgs.source;
+ args->target = subArgs.target;
+ args->offsets = subArgs.offsets;
+
+ /* copy input/error/overflow buffers */
+ if(subArgs.converter->toULength > 0) {
+ uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
+ }
+ args->converter->toULength = subArgs.converter->toULength;
+
+ if(*err == U_BUFFER_OVERFLOW_ERROR) {
+ if(subArgs.converter->UCharErrorBufferLength > 0) {
+ uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
+ subArgs.converter->UCharErrorBufferLength);
+ }
+ args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
+ subArgs.converter->UCharErrorBufferLength = 0;
}
- args->converter = saveThis;
}
- /*-Done with buffer with entire buffer
- -Error while converting
- */
- if (U_FAILURE(*err) || (args->source == args->sourceLimit))
+ if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
return;
+ }
- sourceStart = args->source;
+escape:
changeState_2022(args->converter,
&(args->source),
args->sourceLimit,
- TRUE,
ISO_2022_KR,
- &plane,
err);
- /* args->source = sourceStart; */
-
-
- }while(args->source < args->sourceLimit);
- /* return*/
+ }
}
static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
UErrorCode* err){
- char tempBuf[3];
- const char* pBuf;
+ char tempBuf[2];
const char *mySource = ( char *) args->source;
UChar *myTarget = args->target;
const char *mySourceLimit = args->sourceLimit;
UChar32 targetUniChar = 0x0000;
UChar mySourceChar = 0x0000;
UConverterDataISO2022* myData;
- int plane =0; /*dummy variable */
UConverterSharedData* sharedData ;
UBool useFallback;
-
- if ((args->converter == NULL) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)){
- *err = U_ILLEGAL_ARGUMENT_ERROR;
+ myData=(UConverterDataISO2022*)(args->converter->extraInfo);
+ if(myData->version==1){
+ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
return;
}
+
/* initialize state */
- myData=(UConverterDataISO2022*)(args->converter->extraInfo);
- sharedData = myData->fromUnicodeConverter->sharedData;
+ sharedData = myData->currentConverter->sharedData;
useFallback = args->converter->useFallback;
- if(myData->version==1){
- UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
- return;
+ if(myData->key != 0) {
+ /* continue with a partial escape sequence */
+ goto escape;
+ } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
+ /* continue with a partial double-byte character */
+ mySourceChar = args->converter->toUBytes[0];
+ args->converter->toULength = 0;
+ goto getTrailByte;
}
- while(mySource< args->sourceLimit){
- targetUniChar = missingCharMarker;
+ while(mySource< mySourceLimit){
if(myTarget < args->targetLimit){
mySourceChar= (unsigned char) *mySource++;
if(mySourceChar==UCNV_SI){
- myData->currentType = SBCS;
+ myData->toU2022State.g = 0;
/*consume the source */
continue;
}else if(mySourceChar==UCNV_SO){
- myData->currentType = DBCS;
+ myData->toU2022State.g = 1;
/*consume the source */
continue;
- }else if(mySourceChar==ESC_2022 || myData->key!=0){
-
- /*
- * Commented out this part to be lenient and allow for
- * more escape sequences in ISO-2022-KR byte stream
- *
- * Already doing some conversion and found escape Sequence
- * if(args->converter->mode == UCNV_SO){
- * *err = U_ILLEGAL_ESCAPE_SEQUENCE;
- * }
- * else{
- *
- */
-
- mySource--;
- changeState_2022(args->converter,&(mySource),
- args->sourceLimit, args->flush,ISO_2022_KR,&plane, err);
- /*}*/
+ }else if(mySourceChar==ESC_2022){
+ mySource--;
+escape:
+ changeState_2022(args->converter,&(mySource),
+ mySourceLimit, ISO_2022_KR, err);
if(U_FAILURE(*err)){
args->target = myTarget;
args->source = mySource;
@@ -2363,40 +2217,39 @@ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
continue;
}
- if(myData->currentType==DBCS){
- if(args->converter->toUnicodeStatus == 0x00){
- args->converter->toUnicodeStatus = (UChar) mySourceChar;
- continue;
- }
- else{
- tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80);
- tempBuf[1] = (char) (mySourceChar+0x80);
- mySourceChar = (UChar)(mySourceChar + (args->converter->toUnicodeStatus<<8));
- args->converter->toUnicodeStatus =0x00;
- pBuf = tempBuf;
- targetUniChar = _MBCSSimpleGetNextUChar(sharedData,
- &pBuf,(pBuf+2),useFallback);
+ if(myData->toU2022State.g == 1) {
+ if(mySource < mySourceLimit) {
+ char trailByte;
+getTrailByte:
+ trailByte = *mySource++;
+ tempBuf[0] = (char)(mySourceChar + 0x80);
+ tempBuf[1] = (char)(trailByte + 0x80);
+ mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
+ if((mySourceChar & 0x8080) == 0) {
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
+ } else {
+ /* illegal bytes > 0x7f */
+ targetUniChar = missingCharMarker;
+ }
+ } else {
+ args->converter->toUBytes[0] = (uint8_t)mySourceChar;
+ args->converter->toULength = 1;
+ break;
}
}
else{
- if(args->converter->fromUnicodeStatus == 0x00){
- targetUniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(sharedData, mySourceChar);
-
- }
-
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
}
- if(targetUniChar != missingCharMarker){
- if(args->offsets)
- args->offsets[myTarget - args->target]= mySource - args->source - 1-(myData->currentType==DBCS);
+ if(targetUniChar < 0xfffe){
+ if(args->offsets) {
+ args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
+ }
*(myTarget++)=(UChar)targetUniChar;
}
else {
-
/* Call the callback function*/
- toUnicodeCallback(args,mySourceChar,&mySource,targetUniChar,&myTarget,err);
- if(U_FAILURE(*err)){
- break;
- }
+ toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
+ break;
}
}
else{
@@ -2404,19 +2257,6 @@ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
break;
}
}
- if((args->flush==TRUE)
- && (mySource == mySourceLimit)
- && ( args->converter->toUnicodeStatus !=0x00)){
-
- *err = U_TRUNCATED_CHAR_FOUND;
- args->converter->toUnicodeStatus = 0x00;
- }
- /* Reset the state of converter if we consumed
- * the source and flush is true
- */
- if( (mySource == mySourceLimit) && args->flush){
- setInitialStateToUnicodeKR(args->converter,myData);
- }
args->target = myTarget;
args->source = mySource;
}
@@ -2426,21 +2266,21 @@ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
/*************************** ISO-2022-CN *********************************
*
* Rules for ISO-2022-CN Encoding:
-* i) The desinator sequence must appear once on a line before any instance
+* i) The designator sequence must appear once on a line before any instance
* of character set it designates.
* ii) If two lines contain characters from the same character set, both lines
* must include the designator sequence.
-* iii) Once the designator sequence is know, a shifting sequnce has to be found
+* iii) Once the designator sequence is known, a shifting sequence has to be found
* to invoke the shifting
* iv) All lines start in ASCII and end in ASCII.
* v) Four shifting sequences are employed for this purpose:
*
* Sequcence ASCII Eq Charsets
* ---------- ------- ---------
-* SS2 N CNS-11643-1992 Planes 3-7
-* SS3 O CNS-11643-1992 Plane 2
-* SI
-* SO CNS-11643-1992 Plane 1, GB2312,ISO-IR-165
+* SI US-ASCII
+* SO CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
+* SS2 N CNS-11643-1992 Plane 2
+* SS3 O CNS-11643-1992 Planes 3-7
*
* vi)
* SOdesignator : ESC "$" ")" finalchar_for_SO
@@ -2478,7 +2318,7 @@ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
* is a Chinese character as defined in CNS
* 11643-plane-4, until another SS3designation
* appears
-* (In English: N must preceed every 2 byte
+* (In English: O must precede every 2 byte
* sequence.)
*
* ESC $ + K Indicates the immediate two bytes following SS3
@@ -2503,9 +2343,6 @@ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
*/
/* The following are defined this way to make the strings truely readonly */
-static const char EMPTY_STR[] = "";
-static const char SHIFT_IN_STR[] = "\x0F";
-static const char SHIFT_OUT_STR[] = "\x0E";
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";
@@ -2529,124 +2366,44 @@ static const char* const escSeqCharsCN[10] ={
CNS_11643_1992_Plane_6_STR,
CNS_11643_1992_Plane_7_STR
};
-static const int escSeqCharsLenCN[10] = {
- 1, /* length of escSeq for ASCII */
- 4, /* length of escSeq for GB 2312-80 */
- 4, /* length of escSeq for ISO-IR-165 */
- 4, /* length of escSeq for CNS 11643-1992 Plane 1 */
- 4, /* length of escSeq for CNS 11643-1992 Plane 2 */
- 4, /* length of escSeq for CNS 11643-1992 Plane 3 */
- 4, /* length of escSeq for CNS 11643-1992 Plane 4 */
- 4, /* length of escSeq for CNS 11643-1992 Plane 5 */
- 4, /* length of escSeq for CNS 11643-1992 Plane 6 */
- 4 /* length of escSeq for CNS 11643-1992 Plane 7 */
-};
-static const char* const shiftSeqCharsCN[10] ={
- EMPTY_STR, /* ASCII */
- SHIFT_OUT_STR, /* GB 2312-80 */
- SHIFT_OUT_STR, /* ISO-IR-165 */
- SHIFT_OUT_STR, /* CNS 11643-1992 Plane 1 */
- UCNV_SS2, /* CNS 11643-1992 Plane 2 */
- UCNV_SS3, /* CNS 11643-1992 Plane 3 */
- UCNV_SS3, /* CNS 11643-1992 Plane 4 */
- UCNV_SS3, /* CNS 11643-1992 Plane 5 */
- UCNV_SS3, /* CNS 11643-1992 Plane 6 */
- UCNV_SS3 /* CNS 11643-1992 Plane 7 */
-};
-static const int shiftSeqCharsLenCN[10] ={
- 0, /* length of shiftSeq for ASCII */
- 1, /* length of shiftSeq for GB 2312-80 */
- 1, /* length of shiftSeq for ISO-IR-165 */
- 1, /* length of shiftSeq for CNS 11643-1992 Plane 1 */
- 2, /* length of shiftSeq for CNS 11643-1992 Plane 2 */
- 2, /* length of shiftSeq for CNS 11643-1992 Plane 3 */
- 2, /* length of shiftSeq for CNS 11643-1992 Plane 4 */
- 2, /* length of shiftSeq for CNS 11643-1992 Plane 5 */
- 2, /* length of shiftSeq for CNS 11643-1992 Plane 6 */
- 2 /* length of shiftSeq for CNS 11643-1992 Plane 7 */
-};
-
-typedef enum {
- ASCII_1=0,
- GB2312_1=1,
- ISO_IR_165=2,
- CNS_11643=3,
- INVALID_STATE_CN=-1
-} StateEnumCN;
-
-static const Cnv2022Type myConverterTypeCN[4]={
- ASCII1,
- DBCS,
- DBCS,
- MBCS
-};
-
static void
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
UConverterDataISO2022 *converterData;
- unsigned char* target = (unsigned char*) args->target;
- const unsigned char* targetLimit = (const unsigned char*) args->targetLimit;
+ ISO2022State *pFromU2022State;
+ uint8_t *target = (uint8_t *) args->target;
+ const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
const UChar* source = args->source;
const UChar* sourceLimit = args->sourceLimit;
int32_t* offsets = args->offsets;
- uint32_t targetByteUnit = missingCharMarker;
- uint32_t sourceChar =0x0000;
- const char* escSeq = NULL;
- int len =0; /*length of escSeq chars*/
- uint32_t targetValue=0;
- uint8_t planeVal=0;
- UConverterCallbackReason reason;
- UConverterSharedData* sharedData=NULL;
+ UChar32 sourceChar;
+ char buffer[8];
+ int32_t len;
+ int8_t choices[3];
+ int32_t choiceCount;
+ uint32_t targetValue;
UBool useFallback;
- /* state variables*/
- StateEnumCN* currentState;
- StateEnumCN initIterState;
- UConverter** currentConverter;
- UBool* isShiftAppended;
- UBool* isEscapeAppended;
- int* plane;
- int lPlane=0;
-
- /* arguments check*/
- if ((args->converter == NULL) || (targetLimit < target) || (sourceLimit < source)){
- *err = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
-
/* set up the state */
converterData = (UConverterDataISO2022*)args->converter->extraInfo;
+ pFromU2022State = &converterData->fromU2022State;
useFallback = args->converter->useFallback;
- currentState = (StateEnumCN*)&converterData->fromUnicodeCurrentState;
- initIterState = ASCII_1;
- currentConverter = &converterData->fromUnicodeConverter;
- isShiftAppended = &converterData->isShiftAppended;
- isEscapeAppended = &converterData->isEscapeAppended;
- plane = &converterData->plane;
- initIterState = *currentState;
- *currentConverter = converterData->myConverterArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
- sharedData = (*currentConverter)->sharedData;
+
+ choiceCount = 0;
/* check if the last codepoint of previous buffer was a lead surrogate*/
- if(args->converter->fromUSurrogateLead!=0 && target< targetLimit) {
+ if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
goto getTrail;
}
-
while( source < sourceLimit){
-
- targetByteUnit =missingCharMarker;
- lPlane =0;
-
if(target < targetLimit){
sourceChar = *(source++);
/*check if the char is a First surrogate*/
if(UTF_IS_SURROGATE(sourceChar)) {
if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
- args->converter->fromUSurrogateLead=(UChar)sourceChar;
getTrail:
/*look ahead to find the trail surrogate*/
if(source < sourceLimit) {
@@ -2654,174 +2411,204 @@ getTrail:
UChar trail=(UChar) *source;
if(UTF_IS_SECOND_SURROGATE(trail)) {
source++;
- /*(((args->converter->fromUSurrogateLead)<<10L)+(trail)-((0xd800<<10L)+0xdc00-0x10000))*/
- sourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail);
- args->converter->fromUSurrogateLead=0x00;
- /* convert this surrogate code point */
+ sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
+ args->converter->fromUChar32=0x00;
+ /* convert this supplementary code point */
/* exit this condition tree */
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
- reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
- goto callback;
+ args->converter->fromUChar32=sourceChar;
+ break;
}
} else {
/* no more input */
+ args->converter->fromUChar32=sourceChar;
break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
- reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
- goto callback;
+ args->converter->fromUChar32=sourceChar;
+ break;
}
}
/* do the conversion */
- if(sourceChar < 0x007f ){
- targetByteUnit = sourceChar;
- if(*currentState!= ASCII_1){
- *currentState = ASCII_1;
- *isEscapeAppended = FALSE;
+ if(sourceChar <= 0x007f ){
+ /* US-ASCII */
+ if(pFromU2022State->g == 0) {
+ buffer[0] = (char)sourceChar;
+ len = 1;
+ } else {
+ buffer[0] = UCNV_SI;
+ buffer[1] = (char)sourceChar;
+ len = 2;
+ pFromU2022State->g = 0;
+ choiceCount = 0;
+ }
+ if(sourceChar == CR || sourceChar == LF) {
+ /* reset the state at the end of a line */
+ uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
+ choiceCount = 0;
}
-
}
else{
+ /* convert U+0080..U+10ffff */
+ UConverterSharedData *cnv;
+ int32_t i;
+ int8_t cs, g;
+
+ if(choiceCount == 0) {
+ /* try the current SO/G1 converter first */
+ choices[0] = pFromU2022State->cs[1];
+
+ /* default to GB2312_1 if none is designated yet */
+ if(choices[0] == 0) {
+ choices[0] = GB2312_1;
+ }
- do{
- if(myConverterTypeCN[*currentState] == MBCS){
- /*len= _MBCSFromUChar32((*currentConverter)->sharedData,sourceChar,
- &targetValue,args->converter->useFallback);*/
- MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_3);
- if(len==3){
- targetByteUnit = (UChar32) targetValue;
- planeVal = (uint8_t) ((targetValue)>>16);
- if(planeVal >0x80 && planeVal<0x89){
- lPlane = (int)(planeVal - 0x80);
- targetByteUnit -= (planeVal<<16);
- }else {
- lPlane =-1;
- targetByteUnit=missingCharMarker;
- }
- if(converterData->version == 0 && lPlane >2){
- targetByteUnit = missingCharMarker;
- }
- }
- }else if(myConverterTypeCN[*currentState] == DBCS){
- MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_2);
- if(len==2){
- if(( converterData->version) == 0 && *currentState ==ISO_IR_165){
- targetByteUnit = missingCharMarker;
- }else{
- targetByteUnit = (UChar32) targetValue;
- }
+ if(converterData->version == 0) {
+ /* ISO-2022-CN */
+
+ /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
+ if(choices[0] == GB2312_1) {
+ choices[1] = (int8_t)CNS_11643_1;
+ } else {
+ choices[1] = (int8_t)GB2312_1;
}
-
- }else{
- if(sourceChar < 0x7f){
- targetByteUnit = sourceChar;
+
+ choiceCount = 2;
+ } else {
+ /* ISO-2022-CN-EXT */
+
+ /* try one of the other converters */
+ switch(choices[0]) {
+ case GB2312_1:
+ choices[1] = (int8_t)CNS_11643_1;
+ choices[2] = (int8_t)ISO_IR_165;
+ break;
+ case ISO_IR_165:
+ choices[1] = (int8_t)GB2312_1;
+ choices[2] = (int8_t)CNS_11643_1;
+ break;
+ default: /* CNS_11643_x */
+ choices[1] = (int8_t)GB2312_1;
+ choices[2] = (int8_t)ISO_IR_165;
+ break;
}
- }
- if(targetByteUnit==missingCharMarker){
-
- *currentState=(StateEnumCN)((*currentState<3)? *currentState+1:0);
- *currentConverter =converterData->myConverterArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
- targetByteUnit =missingCharMarker;
- *isEscapeAppended = FALSE;
- *isShiftAppended = FALSE;
- sharedData=(*currentConverter)->sharedData;
- }
- else
- break;
- }while(initIterState != *currentState);
- }
- if(targetByteUnit != missingCharMarker){
-
- args->converter->fromUnicodeStatus=(UBool) (*currentState > ASCII_1);
- /* Append the escpace sequence */
- if(!*isEscapeAppended ||(*plane != lPlane)){
- int temp =0;
- temp =(*currentState==CNS_11643) ? ((int)*currentState+lPlane-1):(int)*currentState ;
- escSeq = escSeqCharsCN[temp];
- len =escSeqCharsLenCN[temp];
- CONCAT_ESCAPE_EX(args,source, &target, targetLimit, &offsets, escSeq,len,err);
- *plane=lPlane;
- *isEscapeAppended=TRUE;
- *isShiftAppended=FALSE;
+ choiceCount = 3;
+ }
}
- /* Append Shift Sequences */
- if(*currentState == GB2312_1 || *currentState==ISO_IR_165){
- if(!*isShiftAppended){
- len =shiftSeqCharsLenCN[*currentState];
- escSeq = shiftSeqCharsCN[*currentState];
- CONCAT_ESCAPE_EX(args,source, &target, targetLimit, &offsets, escSeq,len,err);
- *isShiftAppended=TRUE;
- }
- }else if(*currentState!=ASCII1){
- int temp =*currentState+*plane-1;
- if(*plane ==1 && *isShiftAppended){
- temp=0;
+ cs = g = 0;
+ len = 0;
+
+ for(i = 0; i < choiceCount && len == 0; ++i) {
+ cs = choices[i];
+ if(cs > 0) {
+ if(cs > CNS_11643_0) {
+ cnv = converterData->myConverterArray[CNS_11643];
+ MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_3);
+ if(len==3) {
+ cs = (int8_t)(CNS_11643_0 + (targetValue >> 16) - 0x80);
+ len = 2;
+ if(cs == CNS_11643_1) {
+ g = 1;
+ } else if(cs == CNS_11643_2) {
+ g = 2;
+ } else /* plane 3..7 */ if(converterData->version == 1) {
+ g = 3;
+ } else {
+ /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
+ len = 0;
+ }
+ }
+ } else {
+ /* GB2312_1 or ISO-IR-165 */
+ cnv = converterData->myConverterArray[cs];
+ MBCS_FROM_UCHAR32_ISO2022(cnv,sourceChar,&targetValue,useFallback,&len,MBCS_OUTPUT_2);
+ g = 1; /* used if len == 2 */
+ }
}
- len =shiftSeqCharsLenCN[temp];
- escSeq = shiftSeqCharsCN[temp];
- CONCAT_ESCAPE_EX(args,source, &target, targetLimit, &offsets, escSeq,len,err);
- *isShiftAppended=TRUE;
}
- initIterState = *currentState;
+ if(len > 0) {
+ len = 0; /* count output bytes; it must have been len == 2 */
- /* write the targetByteUnit to target */
- if(targetByteUnit <= 0x00FF){
- if( target source-1;
+ /* write the designation sequence if necessary */
+ if(cs != pFromU2022State->cs[g]) {
+ if(cs < CNS_11643) {
+ uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
+ } else {
+ uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
}
-
- }else{
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) targetByteUnit;
- *err = U_BUFFER_OVERFLOW_ERROR;
- }
- }else{
- if(target < targetLimit){
- *(target++) =(unsigned char) (targetByteUnit>>8);
- if(offsets){
- *(offsets++) = source-args->source-1;
+ len = 4;
+ pFromU2022State->cs[g] = cs;
+ if(g == 1) {
+ /* changing the SO/G1 charset invalidates the choices[] */
+ choiceCount = 0;
}
- if(target < targetLimit){
- *(target++) =(unsigned char) (targetByteUnit);
- if(offsets){
- *(offsets++) = source-args->source-1;
- }
- }else{
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
- *err = U_BUFFER_OVERFLOW_ERROR;
+ }
+
+ /* write the shift sequence if necessary */
+ if(g != pFromU2022State->g) {
+ switch(g) {
+ case 1:
+ buffer[len++] = UCNV_SO;
+
+ /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
+ pFromU2022State->g = 1;
+ break;
+ case 2:
+ buffer[len++] = 0x1b;
+ buffer[len++] = 0x4e;
+ break;
+ default: /* case 3 */
+ buffer[len++] = 0x1b;
+ buffer[len++] = 0x4f;
+ break;
}
- }else{
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit>>8);
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
- *err = U_BUFFER_OVERFLOW_ERROR;
}
- }
+ /* write the two output bytes */
+ buffer[len++] = (char)(targetValue >> 8);
+ buffer[len++] = (char)targetValue;
+ } else {
+ /* if we cannot find the character after checking all codepages
+ * then this is an error
+ */
+ *err = U_INVALID_CHAR_FOUND;
+ args->converter->fromUChar32=sourceChar;
+ break;
+ }
}
- else{
- /* if we cannot find the character after checking all codepages
- * then this is an error
- */
- reason = UCNV_UNASSIGNED;
- *err = U_INVALID_CHAR_FOUND;
-callback:
-
- fromUnicodeCallback(args,sourceChar,&source,&target,&offsets,reason,err);
- initIterState = *currentState;
-
- if (U_FAILURE (*err)){
+ /* output len>0 bytes in buffer[] */
+ if(len == 1) {
+ *target++ = buffer[0];
+ if(offsets) {
+ *offsets++ = source - args->source - 1; /* -1: known to be ASCII */
+ }
+ } else if(len == 2 && (target + 2) <= targetLimit) {
+ *target++ = buffer[0];
+ *target++ = buffer[1];
+ if(offsets) {
+ int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
+ *offsets++ = sourceIndex;
+ *offsets++ = sourceIndex;
+ }
+ } else {
+ ucnv_fromUWriteBytes(
+ args->converter,
+ buffer, len,
+ (char **)&target, (const char *)targetLimit,
+ &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
+ err);
+ if(U_FAILURE(*err)) {
break;
}
}
@@ -2833,243 +2620,56 @@ callback:
}/* end while(mySourceIndexconverter->fromUSurrogateLead !=0 && (source == sourceLimit) && args->flush){
- *err = U_TRUNCATED_CHAR_FOUND;
- }
- /* Reset the state of converter if we consumed
- * the source and flush is true
- */
- if( (source == sourceLimit) && args->flush){
- setInitialStateFromUnicodeJPCN(args->converter,converterData);
- }
-
- /*save the state and return */
- args->source = source;
- args->target = (char*)target;
-}
-
-/*************** to unicode *******************/
-static const StateEnumCN nextStateToUnicodeCN[2][MAX_STATES_2022]= {
- {
-/* 0 1 2 3 4 5 6 7 8 9 */
- INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN
- ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN
- ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN
- ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN
- ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,GB2312_1 ,INVALID_STATE_CN ,INVALID_STATE_CN
- ,CNS_11643 ,CNS_11643 ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN
- ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN
- ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN
- },
- {
-/* 0 1 2 3 4 5 6 7 8 9 */
- INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN
- ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN
- ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN
- ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN
- ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,GB2312_1 ,INVALID_STATE_CN ,ISO_IR_165
- ,CNS_11643 ,CNS_11643 ,CNS_11643 ,CNS_11643 ,CNS_11643 ,CNS_11643 ,CNS_11643 ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN
- ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN
- ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN ,INVALID_STATE_CN
- }
-};
-
-static void
-changeState_2022(UConverter* _this,
- const char** source,
- const char* sourceLimit,
- UBool flush,Variant2022 var,
- int* plane,
- UErrorCode* err){
- UConverter* myUConverter;
- UCNV_TableStates_2022 value;
- UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
- uint32_t key = myData2022->key;
- const char* chosenConverterName = NULL;
- int32_t offset;
-
- /*In case we were in the process of consuming an escape sequence
- we need to reprocess it */
-
- do{
-
- value = getKey_2022(**source,(int32_t *) &key, &offset);
-
- switch (value){
-
- case VALID_NON_TERMINAL_2022 :
- break;
-
- case VALID_TERMINAL_2022:
- {
- (*source)++;
- chosenConverterName = escSeqStateTable_Result_2022[offset];
- key = 0;
- goto DONE;
- };
- break;
-
- case INVALID_2022:
- {
- myData2022->key = 0;
- *err = U_ILLEGAL_ESCAPE_SEQUENCE;
- return;
- }
- case VALID_SS2_SEQUENCE:
- /*falls through*/
-
- case VALID_SS3_SEQUENCE:
- {
- (*source)++;
- key = 0;
- goto DONE;
+ if( U_SUCCESS(*err) &&
+ pFromU2022State->g!=0 &&
+ args->flush && source>=sourceLimit && args->converter->fromUChar32==0
+ ) {
+ int32_t sourceIndex;
+
+ /* we are switching to ASCII */
+ pFromU2022State->g=0;
+
+ /* get the source index of the last input character */
+ /*
+ * TODO this would be simpler and more reliable if we used a pair
+ * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
+ * so that we could simply use the prevSourceIndex here;
+ * this code gives an incorrect result for the rare case of an unmatched
+ * trail surrogate that is alone in the last buffer of the text stream
+ */
+ sourceIndex=(int32_t)(source-args->source);
+ if(sourceIndex>0) {
+ --sourceIndex;
+ if( U16_IS_TRAIL(args->source[sourceIndex]) &&
+ (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
+ ) {
+ --sourceIndex;
}
-
- case VALID_MAYBE_TERMINAL_2022:
- {
- const char* mySource = (*source+1);
- int32_t myKey = key;
- UCNV_TableStates_2022 myValue = value;
- int32_t myOffset=0;
- if(*mySource==ESC_2022){
- while ((mySource < sourceLimit) &&
- ((myValue == VALID_MAYBE_TERMINAL_2022)||(myValue == VALID_NON_TERMINAL_2022))){
- myValue = getKey_2022(*(mySource++), &myKey, &myOffset);
- }
- }
- else{
- (*source)++;
- myValue=(UCNV_TableStates_2022) 1;
- myOffset = 8;
- }
-
- switch (myValue){
- case INVALID_2022:
- {
- /*Backs off*/
- chosenConverterName = escSeqStateTable_Result_2022[offset];
- value = VALID_TERMINAL_2022;
- goto DONE;
- };
- break;
-
- case VALID_TERMINAL_2022:
- {
- /*uses longer escape sequence*/
- chosenConverterName = escSeqStateTable_Result_2022[myOffset];
- key = 0;
- value = VALID_TERMINAL_2022;
- goto DONE;
- };
- break;
-
- /* Not expected. Added to make the gcc happy */
- case VALID_SS2_SEQUENCE:
- /*falls through*/
- /* Not expected. Added to make the gcc happy */
- case VALID_SS3_SEQUENCE:
- {
- (*source)++;
- key = 0;
- goto DONE;
- }
-
- case VALID_NON_TERMINAL_2022:
- /*falls through*/
- case VALID_MAYBE_TERMINAL_2022:
- {
- if (flush){
- /*Backs off*/
- chosenConverterName = escSeqStateTable_Result_2022[offset];
- value = VALID_TERMINAL_2022;
- key = 0;
- goto DONE;
- }
- else{
- key = myKey;
- value = VALID_NON_TERMINAL_2022;
- }
- };
- break;
- };
- break;
- };
- break;
+ } else {
+ sourceIndex=-1;
}
- }while (++(*source) < sourceLimit);
-
-DONE:
- myData2022->key = key;
- if(offset<57 && offset>49){
- *plane = offset-49;
- }
- if ((value == VALID_NON_TERMINAL_2022) || (value == VALID_MAYBE_TERMINAL_2022)) {
- return;
+ ucnv_fromUWriteBytes(
+ args->converter,
+ SHIFT_IN_STR, 1,
+ (char **)&target, (const char *)targetLimit,
+ &offsets, sourceIndex,
+ err);
}
- else if (value != INVALID_2022 ) {
- if(value==3 || value==4 ){
- _this->mode = UCNV_SI;
- myUConverter =myData2022->currentConverter;
- }
- else{
- switch(var){
- case ISO_2022:
- _this->mode = UCNV_SI;
- ucnv_close(myData2022->currentConverter);
- myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
- break;
- case ISO_2022_JP:
- {
- StateEnum tempState=nextStateToUnicodeJP[myData2022->version][offset];
- _this->mode = UCNV_SI;
- myData2022->currentConverter = myUConverter =
- (tempState!=INVALID_STATE)? myData2022->myConverterArray[tempState]:NULL;
- myData2022->toUnicodeCurrentState = tempState;
- *err= (tempState==INVALID_STATE)?U_ILLEGAL_ESCAPE_SEQUENCE :U_ZERO_ERROR;
- }
- break;
- case ISO_2022_CN:
- {
- StateEnumCN tempState=nextStateToUnicodeCN[myData2022->version][offset];
- _this->mode = UCNV_SI;
- myData2022->currentConverter = myUConverter =
- (tempState!=INVALID_STATE)? myData2022->myConverterArray[tempState]:NULL;
- myData2022->toUnicodeCurrentState =(StateEnum) tempState;
- *err= (tempState==INVALID_STATE)?U_ILLEGAL_ESCAPE_SEQUENCE :U_ZERO_ERROR;
- }
- break;
- case ISO_2022_KR:
- if(offset==0x30){
- _this->mode = UCNV_SI;
- myUConverter = myData2022->currentConverter=myData2022->fromUnicodeConverter;
- break;
- }
- default:
- myUConverter=NULL;
- *err = U_ILLEGAL_ESCAPE_SEQUENCE;
- }
- }
- if (U_SUCCESS(*err)){
- /*Customize the converter with the attributes set on the 2022 converter*/
- myUConverter->fromUCharErrorBehaviour = _this->fromUCharErrorBehaviour;
- myUConverter->fromUContext = _this->fromUContext;
- myUConverter->fromCharErrorBehaviour = _this->fromCharErrorBehaviour;
- myUConverter->toUContext = _this->toUContext;
-
- uprv_memcpy(myUConverter->subChar,
- _this->subChar,
- myUConverter->subCharLen = _this->subCharLen);
- myUConverter->subChar1 = 0;
-
- _this->mode = UCNV_SO;
- }
- }
+ /*save the state and return */
+ args->source = source;
+ args->target = (char*)target;
}
@@ -3077,24 +2677,28 @@ static void
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
UErrorCode* err){
char tempBuf[3];
- int plane=0;
- const char* pBuf;
- const char *mySource = ( char *) args->source;
+ const char *mySource = (char *) args->source;
UChar *myTarget = args->target;
- char *tempLimit = &tempBuf[3];
const char *mySourceLimit = args->sourceLimit;
uint32_t targetUniChar = 0x0000;
uint32_t mySourceChar = 0x0000;
UConverterDataISO2022* myData;
+ ISO2022State *pToU2022State;
- if ((args->converter == NULL) || (args->targetLimit < myTarget) || (args->sourceLimit < mySource)){
- *err = U_ILLEGAL_ARGUMENT_ERROR;
- return;
+ myData=(UConverterDataISO2022*)(args->converter->extraInfo);
+ pToU2022State = &myData->toU2022State;
+
+ if(myData->key != 0) {
+ /* continue with a partial escape sequence */
+ goto escape;
+ } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
+ /* continue with a partial double-byte character */
+ mySourceChar = args->converter->toUBytes[0];
+ args->converter->toULength = 0;
+ goto getTrailByte;
}
-
- myData=(UConverterDataISO2022*)(args->converter->extraInfo);
- plane=myData->plane;
- while(mySource< args->sourceLimit){
+
+ while(mySource < mySourceLimit){
targetUniChar =missingCharMarker;
@@ -3102,87 +2706,25 @@ UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
mySourceChar= (unsigned char) *mySource++;
-
switch(mySourceChar){
case UCNV_SI:
- if(args->converter->toUnicodeStatus != 0x00){
- break;
- }
- myData->currentType = ASCII1;
- myData->plane=plane = 0;
+ pToU2022State->g=0;
continue;
case UCNV_SO:
- if(args->converter->toUnicodeStatus != 0x00){
+ if(pToU2022State->cs[1] != 0) {
+ pToU2022State->g=1;
+ continue;
+ } else {
+ /* illegal to have SO before a matching designator */
break;
}
- myData->currentType = MBCS;
- continue;
-
- case CR:
- /*falls through*/
- case LF:
- if(args->converter->toUnicodeStatus != 0x00){
- break;
- }
- myData->currentType = ASCII1;
- myData->plane=plane = 0;
- /* falls through */
- default:
- /* if we are in the middle of consuming an escape sequence
- * we fall through else we process the input
- */
- if(myData->key==0){
- if(myData->currentType != ASCII1){
- if(args->converter->toUnicodeStatus == 0x00){
- args->converter->toUnicodeStatus = (UChar) mySourceChar;
- continue;
- }
- else{
- if(plane >0){
- tempBuf[0] = (char) (0x80+plane);
- tempBuf[1] = (char) (args->converter->toUnicodeStatus);
- tempBuf[2] = (char) (mySourceChar);
- tempLimit = &tempBuf[2]+1;
-
- }else{
- tempBuf[0] = (char) args->converter->toUnicodeStatus;
- tempBuf[1] = (char) mySourceChar;
- tempLimit = &tempBuf[2];
- }
- mySourceChar+= (uint32_t) args->converter->toUnicodeStatus<<8;
- args->converter->toUnicodeStatus = 0;
- pBuf = tempBuf;
- if(myData->currentConverter!=NULL){
- targetUniChar = _MBCSSimpleGetNextUChar(myData->currentConverter->sharedData, &pBuf, tempLimit, FALSE);
- }else{
- *err=U_INVALID_CHAR_FOUND;
- break;
- }
- }
- }
- else{
- if(args->converter->toUnicodeStatus == 0x00 && mySourceChar < 0x7f){
- targetUniChar = (UChar) mySourceChar;
- }
- }
- break;
- }
case ESC_2022:
- if(args->converter->toUnicodeStatus != 0x00){
- break;
- }
mySource--;
+escape:
changeState_2022(args->converter,&(mySource),
- args->sourceLimit, args->flush,ISO_2022_CN,&plane,err);
-
- myData->plane=plane;
- if(plane>0){
- myData->currentType = MBCS;
- }else{
- myData->currentType=DBCS;
- }
+ mySourceLimit, ISO_2022_CN,err);
/* invalid or illegal escape sequence */
if(U_FAILURE(*err)){
@@ -3192,28 +2734,76 @@ UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
}
continue;
+ /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
+
+ case CR:
+ /*falls through*/
+ case LF:
+ uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
+ /* falls through */
+ default:
+ /* convert one or two bytes */
+ if(pToU2022State->g != 0) {
+ if(mySource < mySourceLimit) {
+ UConverterSharedData *cnv;
+ StateEnum tempState;
+ int32_t tempBufLen;
+ char trailByte;
+getTrailByte:
+ trailByte = *mySource++;
+ tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
+ if(tempState > CNS_11643_0) {
+ cnv = myData->myConverterArray[CNS_11643];
+ tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
+ tempBuf[1] = (char) (mySourceChar);
+ tempBuf[2] = trailByte;
+ tempBufLen = 3;
+
+ }else{
+ cnv = myData->myConverterArray[tempState];
+ tempBuf[0] = (char) (mySourceChar);
+ tempBuf[1] = trailByte;
+ tempBufLen = 2;
+ }
+ mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
+ if(pToU2022State->g>=2) {
+ /* return from a single-shift state to the previous one */
+ pToU2022State->g=pToU2022State->prevG;
+ }
+ targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
+ } else {
+ args->converter->toUBytes[0] = (uint8_t)mySourceChar;
+ args->converter->toULength = 1;
+ goto endloop;
+ }
+ }
+ else{
+ if(mySourceChar <= 0x7f) {
+ targetUniChar = (UChar) mySourceChar;
+ }
+ }
+ break;
}
if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
if(args->offsets){
- args->offsets[myTarget - args->target]= mySource - args->source - 2
- +(myData->currentType==ASCII);
+ args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
}
*(myTarget++)=(UChar)targetUniChar;
}
else if(targetUniChar > missingCharMarker){
/* disassemble the surrogate pair and write to output*/
targetUniChar-=0x0010000;
- *(myTarget++) = (UChar)(0xd800+(UChar)(targetUniChar>>10));
+ *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
if(args->offsets){
- args->offsets[myTarget - args->target]= mySource - args->source - 2
- +(myData->currentType==ASCII);
+ args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
}
+ ++myTarget;
if(myTarget< args->targetLimit){
- *(myTarget)++ = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
+ *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
if(args->offsets){
- args->offsets[myTarget - args->target]= mySource - args->source - 2
- +(myData->currentType==ASCII);
+ args->offsets[myTarget - args->target]= mySource - args->source - (mySourceChar <= 0xff ? 1 : 2);
}
+ ++myTarget;
}else{
args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
(UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
@@ -3222,11 +2812,8 @@ UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
}
else{
/* Call the callback function*/
- toUnicodeCallback(args,mySourceChar,&mySource,targetUniChar,&myTarget,err);
- /*args->offsets = saveOffsets;*/
- if(U_FAILURE(*err))
- break;
-
+ toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
+ break;
}
}
else{
@@ -3234,19 +2821,7 @@ UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
break;
}
}
- if((args->flush==TRUE)
- && (mySource == mySourceLimit)
- && ( args->converter->toUnicodeStatus !=0x00)){
-
- *err = U_TRUNCATED_CHAR_FOUND;
- args->converter->toUnicodeStatus = 0x00;
- }
- /* Reset the state of converter if we consumed
- * the source and flush is true
- */
- if( (mySource == mySourceLimit) && args->flush){
- setInitialStateToUnicodeJPCN(args->converter,myData);
- }
+endloop:
args->target = myTarget;
args->source = mySource;
}
@@ -3255,38 +2830,88 @@ static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
UConverter *cnv = args->converter;
UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
- char *p;
- char buffer[4];
+ ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
+ char *p, *subchar;
+ char buffer[8];
+ int32_t length;
+
+ subchar=(char *)cnv->subChar;
+ length=cnv->subCharLen; /* assume length==1 for most variants */
p = buffer;
switch(myConverterData->locale[0]){
case 'j':
- if(myConverterData->fromUnicodeCurrentState!= ASCII){
- myConverterData->fromUnicodeCurrentState= ASCII;
- myConverterData->currentType = (Cnv2022Type) myConverterType[myConverterData->fromUnicodeCurrentState];
+ {
+ int8_t cs;
+
+ if(pFromU2022State->g == 1) {
+ /* JIS7: switch from G1 to G0 */
+ pFromU2022State->g = 0;
+ *p++ = UCNV_SI;
+ }
+
+ cs = pFromU2022State->cs[0];
+ if(cs != ASCII && cs != JISX201) {
+ /* not in ASCII or JIS X 0201: switch to ASCII */
+ pFromU2022State->cs[0] = (int8_t)ASCII;
*p++ = '\x1b';
*p++ = '\x28';
*p++ = '\x42';
-
}
- *p++ = cnv->subChar[0];
+
+ *p++ = subchar[0];
break;
+ }
case 'c':
- if(args->converter->fromUnicodeStatus) {
- /* DBCS mode and SBCS sub char: change to SBCS */
- myConverterData->fromUnicodeCurrentState=ASCII;
- *p++ = UCNV_SI;
- }
- *p++ = cnv->subChar[0];
+ if(pFromU2022State->g != 0) {
+ /* not in ASCII mode: switch to ASCII */
+ pFromU2022State->g = 0;
+ *p++ = UCNV_SI;
+ }
+ *p++ = subchar[0];
break;
case 'k':
- if(args->converter->fromUnicodeStatus){
- args->converter->fromUnicodeStatus=0x00;
- *p++= UCNV_SI;
+ if(myConverterData->version == 0) {
+ if(length == 1) {
+ if((UBool)args->converter->fromUnicodeStatus) {
+ /* in DBCS mode: switch to SBCS */
+ args->converter->fromUnicodeStatus = 0;
+ *p++ = UCNV_SI;
+ }
+ *p++ = subchar[0];
+ } else /* length == 2*/ {
+ if(!(UBool)args->converter->fromUnicodeStatus) {
+ /* in SBCS mode: switch to DBCS */
+ args->converter->fromUnicodeStatus = 1;
+ *p++ = UCNV_SO;
+ }
+ *p++ = subchar[0];
+ *p++ = subchar[1];
+ }
+ break;
+ } else {
+ /* let the subconverter write the subchar */
+ args->converter = myConverterData->currentConverter;
+ uprv_memcpy(myConverterData->currentConverter->subChar, subchar, 4);
+ myConverterData->currentConverter->subCharLen = (int8_t)length;
+
+ myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
+ ucnv_cbFromUWriteSub(args, 0, err);
+ cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
+
+ if(*err == U_BUFFER_OVERFLOW_ERROR) {
+ if(myConverterData->currentConverter->charErrorBufferLength > 0) {
+ uprv_memcpy(
+ cnv->charErrorBuffer,
+ myConverterData->currentConverter->charErrorBuffer,
+ myConverterData->currentConverter->charErrorBufferLength);
+ }
+ cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
+ myConverterData->currentConverter->charErrorBufferLength = 0;
+ }
+ args->converter = cnv;
+ return;
}
-
- *p++ = cnv->subChar[0];
-
default:
/* not expected */
break;
@@ -3301,9 +2926,7 @@ struct cloneStruct
{
UConverter cnv;
UConverterDataISO2022 mydata;
- UConverter currentCnv; /**< for ISO_2022 converter if the current converter is open */
-
- UConverter clonedConverters[1]; /* Actually a variable sized array for all of the sub converters to be cloned. */
+ UConverter currentConverter;
};
@@ -3315,145 +2938,329 @@ _ISO_2022_SafeClone(
UErrorCode *status)
{
struct cloneStruct * localClone;
- int32_t bufferSizeNeeded = sizeof(struct cloneStruct);
- UConverterDataISO2022* cnvData = (UConverterDataISO2022*)cnv->extraInfo;
- int32_t i;
- int32_t sizes[UCNV_2022_MAX_CONVERTERS];
- int32_t numConverters = 0;
- int32_t currentConverterIndex = -1;
- int32_t fromUnicodeConverterIndex = -1;
- int32_t currentConverterSize = 0;
- char *ptr; /* buffer pointer */
-
- if (U_FAILURE(*status)) {
- return 0;
- }
-
- for(i=0;(i<UCNV_2022_MAX_CONVERTERS)&&cnvData->myConverterArray[i];i++) {
- int32_t size;
-
- size = 0;
- ucnv_safeClone(cnvData->myConverterArray[i], NULL, &size, status);
- bufferSizeNeeded += size;
- sizes[i] = size;
- numConverters++;
-
- if(cnvData->currentConverter == cnvData->myConverterArray[i]) {
- currentConverterIndex = i;
- }
-
- if(cnvData->fromUnicodeConverter == cnvData->myConverterArray[i]) {
- fromUnicodeConverterIndex = i;
- }
- }
-
- if(currentConverterIndex == -1) { /* -1 means - not found in array. Clone separately */
- currentConverterSize = 0;
- if(cnvData->currentConverter) {
- ucnv_safeClone(cnvData->currentConverter, NULL, &currentConverterSize, status);
- bufferSizeNeeded += currentConverterSize;
- }
- }
-
- for(;iextraInfo;
localClone = (struct cloneStruct *)stackBuffer;
- uprv_memcpy(&localClone->cnv, cnv, sizeof(UConverter));
- uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataISO2022));
+ /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
- /* clone back sub cnvs */
+ uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
- ptr = (char*)&localClone->clonedConverters;
- for(i=0;imydata.myConverterArray[i] = ucnv_safeClone(cnvData->myConverterArray[i], (UConverter*)ptr, &size, status);
- ptr += size;
- }
- for(;imydata.myConverterArray[i] = NULL;
- }
+ /* share the subconverters */
- if(currentConverterIndex == -1) { /* -1 = not found in list */
- /* KR version 1 also uses the state in currentConverter for preserving state
- * so we need to clone it too!
- */
- if(cnvData->currentConverter) {
- localClone->mydata.currentConverter = ucnv_safeClone(cnvData->currentConverter, ptr, &currentConverterSize, status);
- ptr += currentConverterSize;
- } else {
- localClone->mydata.currentConverter = NULL;
+ if(cnvData->currentConverter != NULL) {
+ size = (int32_t)sizeof(UConverter);
+ localClone->mydata.currentConverter =
+ ucnv_safeClone(cnvData->currentConverter,
+ &localClone->currentConverter,
+ &size, status);
+ if(U_FAILURE(*status)) {
+ return NULL;
}
- } else {
- localClone->mydata.currentConverter = localClone->mydata.myConverterArray[currentConverterIndex];
}
- if(fromUnicodeConverterIndex != -1) {
- /* fromUnicodeConverter is in the list */
- localClone->mydata.fromUnicodeConverter = localClone->mydata.myConverterArray[fromUnicodeConverterIndex];
- } else if(cnvData->currentConverter == cnvData->fromUnicodeConverter) {
- /* fromUnicodeConverter is the same as currentConverter */
- localClone->mydata.fromUnicodeConverter = localClone->mydata.currentConverter;
- } else {
- /* fromUnicodeConverter is NULL */
- localClone->mydata.fromUnicodeConverter = NULL;
+ for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
+ if(cnvData->myConverterArray[i] != NULL) {
+ ucnv_incrementRefCount(cnvData->myConverterArray[i]);
+ }
}
localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
-
+ localClone->cnv.isExtraLocal = TRUE;
return &localClone->cnv;
}
static void
_ISO_2022_GetUnicodeSet(const UConverter *cnv,
- USet *set,
+ USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode)
{
int32_t i;
- USet *cnvSet;
UConverterDataISO2022* cnvData;
if (U_FAILURE(*pErrorCode)) {
return;
}
+#ifdef U_ENABLE_GENERIC_ISO_2022
if (cnv->sharedData == &_ISO2022Data) {
/* We use UTF-8 in this case */
- uset_addRange(set, 0, 0xd7FF);
- uset_addRange(set, 0xE000, 0x10FFFF);
+ sa->addRange(sa->set, 0, 0xd7FF);
+ sa->addRange(sa->set, 0xE000, 0x10FFFF);
return;
}
+#endif
cnvData = (UConverterDataISO2022*)cnv->extraInfo;
- if (cnv->sharedData == &_ISO2022KRData && cnvData->currentConverter != NULL) {
- ucnv_getUnicodeSet(cnvData->currentConverter, set, which, pErrorCode);
- return;
- }
- cnvSet = uset_open(0, 0);
- if (!cnvSet) {
- *pErrorCode =U_MEMORY_ALLOCATION_ERROR;
+ /* open a set and initialize it with code points that are algorithmically round-tripped */
+ switch(cnvData->locale[0]){
+ case 'j':
+ if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
+ /* include Latin-1 for some variants of JP */
+ sa->addRange(sa->set, 0, 0xff);
+ } else {
+ /* include ASCII for JP */
+ sa->addRange(sa->set, 0, 0x7f);
+ }
+ if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
+ /* include half-width Katakana for JP */
+ sa->addRange(sa->set, 0xff61, 0xff9f);
+ }
+ break;
+ case 'c':
+ case 'z':
+ /* include ASCII for CN */
+ sa->addRange(sa->set, 0, 0x7f);
+ break;
+ case 'k':
+ /* there is only one converter for KR, and it is not in the myConverterArray[] */
+ cnvData->currentConverter->sharedData->impl->getUnicodeSet(
+ cnvData->currentConverter, sa, which, pErrorCode);
return;
+ default:
+ break;
}
- for (i=0; (i<UCNV_2022_MAX_CONVERTERS) && cnvData->myConverterArray[i]; i++) {
- ucnv_getUnicodeSet(cnvData->myConverterArray[i], cnvSet, which, pErrorCode);
- uset_addAll(set, cnvSet /* pErrorCode */);
+ /*
+ * TODO: need to make this version-specific for CN.
+ * CN version 0 does not map CNS planes 3..7 although
+ * they are all available in the CNS conversion table;
+ * CN version 1 does map them all.
+ * The two versions need to create different Unicode sets.
+ */
+ for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
+ if(cnvData->myConverterArray[i]!=NULL) {
+ if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
+ cnvData->version==0 && i==CNS_11643
+ ) {
+ /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
+ ucnv_MBCSGetUnicodeSetForBytes(
+ cnvData->myConverterArray[i],
+ sa, UCNV_ROUNDTRIP_SET,
+ 0, 0x81, 0x82,
+ pErrorCode);
+ } else {
+ ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode);
+ }
+ }
}
- uset_close(cnvSet);
}
+static const UConverterImpl _ISO2022Impl={
+ UCNV_ISO_2022,
+
+ NULL,
+ NULL,
+
+ _ISO2022Open,
+ _ISO2022Close,
+ _ISO2022Reset,
+
+#ifdef U_ENABLE_GENERIC_ISO_2022
+ T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
+ T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
+ ucnv_fromUnicode_UTF8,
+ ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
+#else
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+#endif
+ NULL,
+
+ NULL,
+ _ISO2022getName,
+ _ISO_2022_WriteSub,
+ _ISO_2022_SafeClone,
+ _ISO_2022_GetUnicodeSet
+};
+static const UConverterStaticData _ISO2022StaticData={
+ sizeof(UConverterStaticData),
+ "ISO_2022",
+ 2022,
+ UCNV_IBM,
+ UCNV_ISO_2022,
+ 1,
+ 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
+ { 0x1a, 0, 0, 0 },
+ 1,
+ FALSE,
+ FALSE,
+ 0,
+ 0,
+ { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+const UConverterSharedData _ISO2022Data={
+ sizeof(UConverterSharedData),
+ ~((uint32_t) 0),
+ NULL,
+ NULL,
+ &_ISO2022StaticData,
+ FALSE,
+ &_ISO2022Impl,
+ 0
+};
+
+/*************JP****************/
+static const UConverterImpl _ISO2022JPImpl={
+ UCNV_ISO_2022,
+
+ NULL,
+ NULL,
+
+ _ISO2022Open,
+ _ISO2022Close,
+ _ISO2022Reset,
+
+ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
+ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
+ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
+ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
+ NULL,
+
+ NULL,
+ _ISO2022getName,
+ _ISO_2022_WriteSub,
+ _ISO_2022_SafeClone,
+ _ISO_2022_GetUnicodeSet
+};
+static const UConverterStaticData _ISO2022JPStaticData={
+ sizeof(UConverterStaticData),
+ "ISO_2022_JP",
+ 0,
+ UCNV_IBM,
+ UCNV_ISO_2022,
+ 1,
+ 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
+ { 0x1a, 0, 0, 0 },
+ 1,
+ FALSE,
+ FALSE,
+ 0,
+ 0,
+ { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+static const UConverterSharedData _ISO2022JPData={
+ sizeof(UConverterSharedData),
+ ~((uint32_t) 0),
+ NULL,
+ NULL,
+ &_ISO2022JPStaticData,
+ FALSE,
+ &_ISO2022JPImpl,
+ 0
+};
+
+/************* KR ***************/
+static const UConverterImpl _ISO2022KRImpl={
+ UCNV_ISO_2022,
+
+ NULL,
+ NULL,
+
+ _ISO2022Open,
+ _ISO2022Close,
+ _ISO2022Reset,
+
+ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
+ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
+ UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
+ UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
+ NULL,
+
+ NULL,
+ _ISO2022getName,
+ _ISO_2022_WriteSub,
+ _ISO_2022_SafeClone,
+ _ISO_2022_GetUnicodeSet
+};
+static const UConverterStaticData _ISO2022KRStaticData={
+ sizeof(UConverterStaticData),
+ "ISO_2022_KR",
+ 0,
+ UCNV_IBM,
+ UCNV_ISO_2022,
+ 1,
+ 3, /* max 3 bytes per UChar: SO+DBCS */
+ { 0x1a, 0, 0, 0 },
+ 1,
+ FALSE,
+ FALSE,
+ 0,
+ 0,
+ { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+static const UConverterSharedData _ISO2022KRData={
+ sizeof(UConverterSharedData),
+ ~((uint32_t) 0),
+ NULL,
+ NULL,
+ &_ISO2022KRStaticData,
+ FALSE,
+ &_ISO2022KRImpl,
+ 0
+};
+
+/*************** CN ***************/
+static const UConverterImpl _ISO2022CNImpl={
+
+ UCNV_ISO_2022,
+
+ NULL,
+ NULL,
+
+ _ISO2022Open,
+ _ISO2022Close,
+ _ISO2022Reset,
+
+ UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
+ UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
+ UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
+ UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
+ NULL,
+
+ NULL,
+ _ISO2022getName,
+ _ISO_2022_WriteSub,
+ _ISO_2022_SafeClone,
+ _ISO_2022_GetUnicodeSet
+};
+static const UConverterStaticData _ISO2022CNStaticData={
+ sizeof(UConverterStaticData),
+ "ISO_2022_CN",
+ 0,
+ UCNV_IBM,
+ UCNV_ISO_2022,
+ 2,
+ 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
+ { 0x1a, 0, 0, 0 },
+ 1,
+ FALSE,
+ FALSE,
+ 0,
+ 0,
+ { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+static const UConverterSharedData _ISO2022CNData={
+ sizeof(UConverterSharedData),
+ ~((uint32_t) 0),
+ NULL,
+ NULL,
+ &_ISO2022CNStaticData,
+ FALSE,
+ &_ISO2022CNImpl,
+ 0
+};
+
+
+
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
diff --git a/icuSources/common/ucnv_bld.c b/icuSources/common/ucnv_bld.c
index cd59fa36..f9f99c9b 100644
--- a/icuSources/common/ucnv_bld.c
+++ b/icuSources/common/ucnv_bld.c
@@ -1,7 +1,7 @@
/*
********************************************************************
- * COPYRIGHT:
- * Copyright (c) 1996-2003, International Business Machines Corporation and
+ * COPYRIGHT:
+ * Copyright (c) 1996-2004, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************
*
@@ -12,20 +12,25 @@
* uses uconv_io.h routines to access disk information
* is used by ucnv.h to implement public API create/delete/flushCache routines
* Modification History:
- *
+ *
* Date Name Description
- *
+ *
* 06/20/2000 helena OS/400 port changes; mostly typecast.
* 06/29/2000 helena Major rewrite of the callback interface.
*/
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
#include "unicode/udata.h"
#include "unicode/ucnv.h"
-#include "unicode/ucnv_err.h"
#include "unicode/uloc.h"
+#include "utracimp.h"
#include "ucnv_io.h"
#include "ucnv_bld.h"
+#include "ucnvmbcs.h"
+#include "ucnv_ext.h"
#include "ucnv_cnv.h"
#include "ucnv_imp.h"
#include "uhash.h"
@@ -33,7 +38,7 @@
#include "cstring.h"
#include "cmemory.h"
#include "ucln_cmn.h"
-#include "ustr_imp.h"
+#include "ustr_cnv.h"
@@ -95,7 +100,9 @@ static struct {
{ "cesu8", UCNV_CESU8 },
#if !UCONFIG_NO_LEGACY_CONVERSION
{ "hz",UCNV_HZ },
+#endif
{ "imapmailboxname", UCNV_IMAP_MAILBOX },
+#if !UCONFIG_NO_LEGACY_CONVERSION
{ "iscii", UCNV_ISCII },
{ "iso2022", UCNV_ISO_2022 },
#endif
@@ -154,7 +161,7 @@ static const char DATA_TYPE[] = "cnv";
/* by open converters. */
/* Not thread safe. */
/* Not supported API. Marked U_CAPI only for use by test programs. */
-U_CFUNC UBool U_EXPORT2 ucnv_cleanup(void) {
+static UBool U_CALLCONV ucnv_cleanup(void) {
if (SHARED_DATA_HASHTABLE != NULL) {
ucnv_flushCache();
if (SHARED_DATA_HASHTABLE != NULL && uhash_count(SHARED_DATA_HASHTABLE) == 0) {
@@ -169,10 +176,6 @@ U_CFUNC UBool U_EXPORT2 ucnv_cleanup(void) {
return (SHARED_DATA_HASHTABLE == NULL);
}
-U_CFUNC void ucnv_init(UErrorCode *status) {
- umtx_init(&cnvCacheMutex);
-}
-
static UBool U_CALLCONV
isCnvAcceptable(void *context,
const char *type, const char *name,
@@ -193,7 +196,7 @@ isCnvAcceptable(void *context,
* Un flatten shared data from a UDATA..
*/
static UConverterSharedData*
-ucnv_data_unFlattenClone(UDataMemory *pData, UErrorCode *status)
+ucnv_data_unFlattenClone(UConverterLoadArgs *pArgs, UDataMemory *pData, UErrorCode *status)
{
/* UDataInfo info; -- necessary only if some converters have different formatVersion */
const uint8_t *raw = (const uint8_t *)udata_getMemory(pData);
@@ -222,12 +225,16 @@ ucnv_data_unFlattenClone(UDataMemory *pData, UErrorCode *status)
/* copy initial values from the static structure for this type */
uprv_memcpy(data, converterData[type], sizeof(UConverterSharedData));
+#if 0 /* made UConverterMBCSTable part of UConverterSharedData -- markus 20031107 */
/*
* It would be much more efficient if the table were a direct member, not a pointer.
* However, that would add to the size of all UConverterSharedData objects
* even if they do not use this table (especially algorithmic ones).
* If this changes, then the static templates from converterData[type]
* need more entries.
+ *
+ * In principle, it would be cleaner if the load() function below
+ * allocated the table.
*/
data->table = (UConverterTable *)uprv_malloc(sizeof(UConverterTable));
if(data->table == NULL) {
@@ -236,16 +243,17 @@ ucnv_data_unFlattenClone(UDataMemory *pData, UErrorCode *status)
return NULL;
}
uprv_memset(data->table, 0, sizeof(UConverterTable));
-
+#endif
+
data->staticData = source;
-
+
data->sharedDataCached = FALSE;
/* fill in fields from the loaded data */
data->dataMemory = (void*)pData; /* for future use */
if(data->impl->load != NULL) {
- data->impl->load(data, raw + source->structSize, status);
+ data->impl->load(data, pArgs, raw + source->structSize, status);
if(U_FAILURE(*status)) {
uprv_free(data->table);
uprv_free(data);
@@ -259,32 +267,47 @@ ucnv_data_unFlattenClone(UDataMemory *pData, UErrorCode *status)
*goes to disk and opens it.
*allocates the memory and returns a new UConverter object
*/
-static UConverterSharedData *createConverterFromFile(const char* pkg, const char *fileName, UErrorCode * err)
+static UConverterSharedData *createConverterFromFile(UConverterLoadArgs *pArgs, UErrorCode * err)
{
UDataMemory *data;
UConverterSharedData *sharedData;
+ UTRACE_ENTRY_OC(UTRACE_UCNV_LOAD);
+
if (err == NULL || U_FAILURE (*err)) {
+ UTRACE_EXIT_STATUS(err != NULL ? *err : U_ILLEGAL_ARGUMENT_ERROR); /* err may be NULL in this branch; never dereference it unchecked */
return NULL;
}
- data = udata_openChoice(pkg, DATA_TYPE, fileName, isCnvAcceptable, NULL, err);
+ UTRACE_DATA2(UTRACE_OPEN_CLOSE, "load converter %s from package %s", pArgs->name, pArgs->pkg);
+
+ data = udata_openChoice(pArgs->pkg, DATA_TYPE, pArgs->name, isCnvAcceptable, NULL, err); /* open the .cnv data item named by pArgs */
if(U_FAILURE(*err))
{
+ UTRACE_EXIT_STATUS(*err);
return NULL;
}
- sharedData = ucnv_data_unFlattenClone(data, err);
+ sharedData = ucnv_data_unFlattenClone(pArgs, data, err); /* build the shared data from the opened memory */
if(U_FAILURE(*err))
{
udata_close(data);
+ UTRACE_EXIT_STATUS(*err);
return NULL;
}
+ /*
+ * TODO Store pkg in a field in the shared data so that delta-only converters
+ * can load base converters from the same package.
+ * If the pkg name is longer than the field, then either do not load the converter
+ * in the first place, or just set the pkg field to "".
+ */
+
+ UTRACE_EXIT_PTR_STATUS(sharedData, *err);
return sharedData;
}
-int32_t
+int32_t
ucnv_copyPlatformString(char *platformString, UConverterPlatform pltfrm)
{
switch (pltfrm)
@@ -307,7 +330,7 @@ static const UConverterSharedData *
getAlgorithmicTypeFromName(const char *realName)
{
uint32_t mid, start, limit;
- uint32_t lastMid;
+ uint32_t lastMid;
int result;
char strippedName[UCNV_MAX_CONVERTER_NAME_LENGTH];
@@ -318,14 +341,14 @@ getAlgorithmicTypeFromName(const char *realName)
start = 0;
limit = sizeof(cnvNameType)/sizeof(cnvNameType[0]);
mid = limit;
- lastMid = UINT32_MAX;
+ lastMid = UINT32_MAX;
for (;;) {
mid = (uint32_t)((start + limit) / 2);
- if (lastMid == mid) { /* Have we moved? */
- break; /* We haven't moved, and it wasn't found. */
- }
- lastMid = mid;
+ if (lastMid == mid) { /* Have we moved? */
+ break; /* We haven't moved, and it wasn't found. */
+ }
+ lastMid = mid;
result = uprv_strcmp(strippedName, cnvNameType[mid].name);
if (result < 0) {
@@ -358,7 +381,9 @@ ucnv_shareConverterData(UConverterSharedData * data)
SHARED_DATA_HASHTABLE = uhash_openSize(uhash_hashChars, uhash_compareChars,
ucnv_io_countAvailableAliases(&err),
&err);
- if (U_FAILURE(err))
+ ucln_common_registerCleanup(UCLN_COMMON_UCNV, ucnv_cleanup);
+
+ if (U_FAILURE(err))
return;
}
@@ -372,7 +397,7 @@ ucnv_shareConverterData(UConverterSharedData * data)
}
UCNV_DEBUG_LOG("put:chk",data->staticData->name,sanity);
*/
-
+
/* Mark it shared */
data->sharedDataCached = TRUE;
@@ -423,13 +448,18 @@ ucnv_getSharedConverterData(const char *name)
static UBool
ucnv_deleteSharedConverterData(UConverterSharedData * deadSharedData)
{
- if (deadSharedData->referenceCounter > 0)
+ UTRACE_ENTRY_OC(UTRACE_UCNV_UNLOAD);
+ UTRACE_DATA2(UTRACE_OPEN_CLOSE, "unload converter %s shared data %p", deadSharedData->staticData->name, deadSharedData);
+
+ if (deadSharedData->referenceCounter > 0) {
+ UTRACE_EXIT_VALUE((int32_t)FALSE);
return FALSE;
+ }
if (deadSharedData->impl->unload != NULL) {
deadSharedData->impl->unload(deadSharedData);
}
-
+
if(deadSharedData->dataMemory != NULL)
{
UDataMemory *data = (UDataMemory*)deadSharedData->dataMemory;
@@ -456,42 +486,94 @@ ucnv_deleteSharedConverterData(UConverterSharedData * deadSharedData)
#endif
uprv_free(deadSharedData);
-
+
+ UTRACE_EXIT_VALUE((int32_t)TRUE);
return TRUE;
}
+/**
+ * Load the shared data of a non-algorithmic converter named by pArgs->name; returns NULL on failure.
+ * If pArgs->pkg is NULL or empty, the converter cache is used and this function must be called inside umtx_lock(&cnvCacheMutex).
+ */
+UConverterSharedData *
+ucnv_load(UConverterLoadArgs *pArgs, UErrorCode *err) {
+ UConverterSharedData *mySharedConverterData;
+
+ if(err == NULL || U_FAILURE(*err)) {
+ return NULL;
+ }
+
+ if(pArgs->pkg != NULL && *pArgs->pkg != 0) {
+ /* application-provided converters (non-empty package name) are not currently cached */
+ return createConverterFromFile(pArgs, err);
+ }
+
+ mySharedConverterData = ucnv_getSharedConverterData(pArgs->name);
+ if (mySharedConverterData == NULL)
+ {
+ /* Not cached, we need to stream it in from file */
+ mySharedConverterData = createConverterFromFile(pArgs, err);
+ if (U_FAILURE (*err) || (mySharedConverterData == NULL))
+ {
+ return NULL;
+ }
+ else
+ {
+ /* share it with other library clients */
+ ucnv_shareConverterData(mySharedConverterData);
+ }
+ }
+ else
+ {
+ /* The data for this converter was already in the cache. */
+ /* Update the reference counter on the shared data: one more client */
+ mySharedConverterData->referenceCounter++;
+ }
+
+ return mySharedConverterData;
+}
+
+/**
+ * Unload a non-algorithmic converter: drop one reference and, if unreferenced and not cached, delete the shared data.
+ * Precondition: sharedData->referenceCounter != ~0,
+ * and this function must be called inside umtx_lock(&cnvCacheMutex).
+ */
void
-ucnv_unloadSharedDataIfReady(UConverterSharedData *sharedData)
-{
- umtx_lock(&cnvCacheMutex);
- /*
- Double checking doesn't work on some platforms.
- Don't check referenceCounter outside of a mutex block.
- */
- if (sharedData->referenceCounter != ~0) {
+ucnv_unload(UConverterSharedData *sharedData) {
+ if(sharedData != NULL) { /* a NULL sharedData is silently ignored */
if (sharedData->referenceCounter > 0) {
sharedData->referenceCounter--;
}
-
+
if((sharedData->referenceCounter <= 0)&&(sharedData->sharedDataCached == FALSE)) {
ucnv_deleteSharedConverterData(sharedData);
}
}
- umtx_unlock(&cnvCacheMutex);
}
void
-ucnv_incrementRefCount(UConverterSharedData *sharedData)
+ucnv_unloadSharedDataIfReady(UConverterSharedData *sharedData)
{
- umtx_lock(&cnvCacheMutex);
/*
- Double checking doesn't work on some platforms.
- Don't check referenceCounter outside of a mutex block.
+ Checking whether it's an algorithmic converter (referenceCounter == ~0) is okay
+ in multithreaded applications because the value never changes.
+ Don't check referenceCounter for any other value outside of the mutex.
*/
- if (sharedData->referenceCounter != ~0) {
+ if(sharedData != NULL && sharedData->referenceCounter != ~0) {
+ umtx_lock(&cnvCacheMutex);
+ ucnv_unload(sharedData); /* does the actual decrement/delete; requires the mutex held here */
+ umtx_unlock(&cnvCacheMutex);
+ }
+}
+
+void
+ucnv_incrementRefCount(UConverterSharedData *sharedData)
+{
+ if(sharedData != NULL && sharedData->referenceCounter != ~0) { /* NULL and ~0-counted shared data are left untouched */
+ umtx_lock(&cnvCacheMutex);
sharedData->referenceCounter++;
+ umtx_unlock(&cnvCacheMutex); /* the increment itself happens under cnvCacheMutex */
}
- umtx_unlock(&cnvCacheMutex);
}
static void
@@ -575,54 +657,58 @@ parseConverterOptions(const char *inName,
* -Call dataConverter initializer (Data=TRUE, Cached=TRUE)
* -Call AlgorithmicConverter initializer (Data=FALSE, Cached=TRUE)
*/
-UConverter *
-ucnv_createConverter(UConverter *myUConverter, const char *converterName, UErrorCode * err)
-{
- char cnvName[UCNV_MAX_CONVERTER_NAME_LENGTH], locale[ULOC_FULLNAME_CAPACITY];
- const char *realName;
+UConverterSharedData *
+ucnv_loadSharedData(const char *converterName, UConverterLookupData *lookup, UErrorCode * err) {
+ UConverterLookupData stackLookup;
UConverterSharedData *mySharedConverterData = NULL;
UErrorCode internalErrorCode = U_ZERO_ERROR;
- uint32_t options = 0;
- if (U_FAILURE (*err))
+
+ if (U_FAILURE (*err)) {
return NULL;
+ }
+
+ if(lookup == NULL) {
+ lookup = &stackLookup;
+ }
- locale[0] = 0;
+ lookup->locale[0] = 0;
+ lookup->options = 0;
/* In case "name" is NULL we want to open the default converter. */
if (converterName == NULL) {
- realName = ucnv_io_getDefaultConverterName();
- if (realName == NULL) {
+ lookup->realName = ucnv_io_getDefaultConverterName();
+ if (lookup->realName == NULL) {
*err = U_MISSING_RESOURCE_ERROR;
return NULL;
}
/* the default converter name is already canonical */
} else {
/* separate the converter name from the options */
- parseConverterOptions(converterName, cnvName, locale, &options, err);
+ parseConverterOptions(converterName, lookup->cnvName, lookup->locale, &lookup->options, err);
if (U_FAILURE(*err)) {
/* Very bad name used. */
return NULL;
}
/* get the canonical converter name */
- realName = ucnv_io_getConverterName(cnvName, &internalErrorCode);
- if (U_FAILURE(internalErrorCode) || realName == NULL) {
+ lookup->realName = ucnv_io_getConverterName(lookup->cnvName, &internalErrorCode);
+ if (U_FAILURE(internalErrorCode) || lookup->realName == NULL) {
/*
* set the input name in case the converter was added
* without updating the alias table, or when there is no alias table
*/
- realName = cnvName;
+ lookup->realName = lookup->cnvName;
}
}
/* separate the converter name from the options */
- if(realName != cnvName) {
- parseConverterOptions(realName, cnvName, locale, &options, err);
- realName = cnvName;
+ if(lookup->realName != lookup->cnvName) {
+ parseConverterOptions(lookup->realName, lookup->cnvName, lookup->locale, &lookup->options, err);
+ lookup->realName = lookup->cnvName;
}
-
+
/* get the shared data for an algorithmic converter, if it is one */
- mySharedConverterData = (UConverterSharedData *)getAlgorithmicTypeFromName(realName);
+ mySharedConverterData = (UConverterSharedData *)getAlgorithmicTypeFromName(lookup->realName);
if (mySharedConverterData == NULL)
{
/* it is a data-based converter, get its shared data. */
@@ -630,50 +716,57 @@ ucnv_createConverter(UConverter *myUConverter, const char *converterName, UError
/* converter data cache, and adding new entries to the cache */
/* to prevent other threads from modifying the cache during the */
/* process. */
+ UConverterLoadArgs args={ 0 };
+
+ args.size=sizeof(UConverterLoadArgs);
+ args.nestedLoads=1;
+ args.options=lookup->options;
+ args.pkg=NULL;
+ args.name=lookup->realName;
+
umtx_lock(&cnvCacheMutex);
- mySharedConverterData = ucnv_getSharedConverterData(realName);
- if (mySharedConverterData == NULL)
- {
- /*Not cached, we need to stream it in from file */
- mySharedConverterData = createConverterFromFile(NULL, realName, err);
- if (U_FAILURE (*err) || (mySharedConverterData == NULL))
- {
- umtx_unlock(&cnvCacheMutex);
- return NULL;
- }
- else
- {
- /* share it with other library clients */
- ucnv_shareConverterData(mySharedConverterData);
- }
- }
- else
+ mySharedConverterData = ucnv_load(&args, err);
+ umtx_unlock(&cnvCacheMutex);
+ if (U_FAILURE (*err) || (mySharedConverterData == NULL))
{
- /* The data for this converter was already in the cache. */
- /* Update the reference counter on the shared data: one more client */
- mySharedConverterData->referenceCounter++;
+ return NULL;
}
- umtx_unlock(&cnvCacheMutex);
}
- myUConverter = ucnv_createConverterFromSharedData(myUConverter, mySharedConverterData, realName, locale, options, err);
+ return mySharedConverterData;
+}
- if (U_FAILURE(*err))
- {
- /*
- Checking whether it's an algorithic converter is okay
- in multithreaded applications because the value never changes.
- Don't check referenceCounter for any other value.
- */
- if (mySharedConverterData->referenceCounter != ~0) {
- umtx_lock(&cnvCacheMutex);
- --mySharedConverterData->referenceCounter;
- umtx_unlock(&cnvCacheMutex);
+UConverter *
+ucnv_createConverter(UConverter *myUConverter, const char *converterName, UErrorCode * err)
+{
+ UConverterLookupData stackLookup; /* receives realName, locale and options from ucnv_loadSharedData() */
+ UConverterSharedData *mySharedConverterData;
+
+ UTRACE_ENTRY_OC(UTRACE_UCNV_OPEN);
+
+ if(U_SUCCESS(*err)) {
+ UTRACE_DATA1(UTRACE_OPEN_CLOSE, "open converter %s", converterName);
+
+ mySharedConverterData = ucnv_loadSharedData(converterName, &stackLookup, err);
+
+ if(U_SUCCESS(*err)) {
+ myUConverter = ucnv_createConverterFromSharedData(
+ myUConverter, mySharedConverterData,
+ stackLookup.realName, stackLookup.locale, stackLookup.options,
+ err);
+
+ if(U_SUCCESS(*err)) {
+ UTRACE_EXIT_PTR_STATUS(myUConverter, *err);
+ return myUConverter;
+ } else {
+ ucnv_unloadSharedDataIfReady(mySharedConverterData); /* creation failed: give back the shared data obtained above */
+ }
}
- return NULL;
}
- return myUConverter;
+ /* exit with error: *err was already a failure on entry, or one of the steps above failed */
+ UTRACE_EXIT_STATUS(*err);
+ return NULL;
}
UConverter *
@@ -681,11 +774,16 @@ ucnv_createAlgorithmicConverter(UConverter *myUConverter,
UConverterType type,
const char *locale, uint32_t options,
UErrorCode *err) {
+ UConverter *cnv;
const UConverterSharedData *sharedData;
UBool isAlgorithmicConverter;
+ UTRACE_ENTRY_OC(UTRACE_UCNV_OPEN_ALGORITHMIC);
+ UTRACE_DATA1(UTRACE_OPEN_CLOSE, "open algorithmic converter type %d", (int32_t)type);
+
if(type<0 || UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES<=type) {
*err = U_ILLEGAL_ARGUMENT_ERROR;
+ UTRACE_EXIT_STATUS(U_ILLEGAL_ARGUMENT_ERROR);
return NULL;
}
@@ -696,47 +794,66 @@ ucnv_createAlgorithmicConverter(UConverter *myUConverter,
if (isAlgorithmicConverter) {
/* not a valid type, or not an algorithmic converter */
*err = U_ILLEGAL_ARGUMENT_ERROR;
+ UTRACE_EXIT_STATUS(U_ILLEGAL_ARGUMENT_ERROR);
return NULL;
}
- return ucnv_createConverterFromSharedData(myUConverter, (UConverterSharedData *)sharedData, "",
+ cnv = ucnv_createConverterFromSharedData(myUConverter, (UConverterSharedData *)sharedData, "",
locale != NULL ? locale : "", options, err);
+
+ UTRACE_EXIT_PTR_STATUS(cnv, *err);
+ return cnv;
}
UConverter*
ucnv_createConverterFromPackage(const char *packageName, const char *converterName, UErrorCode * err)
{
char cnvName[UCNV_MAX_CONVERTER_NAME_LENGTH], locale[ULOC_FULLNAME_CAPACITY];
- uint32_t options=0;
UConverter *myUConverter;
- UConverterSharedData *mySharedConverterData = NULL;
+ UConverterSharedData *mySharedConverterData;
+
+ UConverterLoadArgs args={ 0 }; /* zero-initialized; size, nestedLoads, pkg, options and name are filled in below */
+
+ UTRACE_ENTRY_OC(UTRACE_UCNV_OPEN_PACKAGE);
if(U_FAILURE(*err)) {
- return NULL;
+ UTRACE_EXIT_STATUS(*err);
+ return NULL;
}
- /* first, get the options out of the convertername string */
- parseConverterOptions(converterName, cnvName, locale, &options, err);
+ UTRACE_DATA2(UTRACE_OPEN_CLOSE, "open converter %s from package %s", converterName, packageName);
+
+ args.size=sizeof(UConverterLoadArgs);
+ args.nestedLoads=1;
+ args.pkg=packageName;
+
+ /* first, get the options out of the converterName string */
+ parseConverterOptions(converterName, cnvName, locale, &args.options, err); /* NOTE(review): locale[] is not pre-initialized here, unlike lookup->locale[0]=0 in ucnv_loadSharedData() - confirm parseConverterOptions() always sets it */
if (U_FAILURE(*err)) {
/* Very bad name used. */
+ UTRACE_EXIT_STATUS(*err);
return NULL;
}
-
+ args.name=cnvName; /* the name with the options stripped off */
+
/* open the data, unflatten the shared structure */
- mySharedConverterData = createConverterFromFile(packageName, cnvName, err);
-
+ mySharedConverterData = createConverterFromFile(&args, err); /* called directly: this path does not go through the converter cache */
+
if (U_FAILURE(*err)) {
- return NULL;
+ UTRACE_EXIT_STATUS(*err);
+ return NULL;
}
/* create the actual converter */
- myUConverter = ucnv_createConverterFromSharedData(NULL, mySharedConverterData, cnvName, locale, options, err);
-
+ myUConverter = ucnv_createConverterFromSharedData(NULL, mySharedConverterData, cnvName, locale, args.options, err);
+
if (U_FAILURE(*err)) {
ucnv_close(myUConverter);
- return NULL;
+ UTRACE_EXIT_STATUS(*err);
+ return NULL;
}
-
+
+ UTRACE_EXIT_PTR_STATUS(myUConverter, *err);
return myUConverter;
}
@@ -768,13 +885,14 @@ ucnv_createConverterFromSharedData(UConverter *myUConverter,
myUConverter->isExtraLocal = FALSE;
myUConverter->sharedData = mySharedConverterData;
myUConverter->options = options;
- myUConverter->mode = UCNV_SI;
myUConverter->fromCharErrorBehaviour = (UConverterToUCallback) UCNV_TO_U_CALLBACK_SUBSTITUTE;
myUConverter->fromUCharErrorBehaviour = (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE;
myUConverter->toUnicodeStatus = myUConverter->sharedData->toUnicodeStatus;
+ myUConverter->maxBytesPerUChar = myUConverter->sharedData->staticData->maxBytesPerChar;
myUConverter->subChar1 = myUConverter->sharedData->staticData->subChar1;
myUConverter->subCharLen = myUConverter->sharedData->staticData->subCharLen;
uprv_memcpy (myUConverter->subChar, myUConverter->sharedData->staticData->subChar, myUConverter->subCharLen);
+ myUConverter->preFromUFirstCP = U_SENTINEL;
if(myUConverter != NULL && myUConverter->sharedData->impl->open != NULL) {
myUConverter->sharedData->impl->open(myUConverter, realName, locale,options, err);
@@ -793,10 +911,13 @@ U_CAPI int32_t U_EXPORT2
ucnv_flushCache ()
{
UConverterSharedData *mySharedData = NULL;
- int32_t pos = -1;
+ int32_t pos;
int32_t tableDeletedNum = 0;
const UHashElement *e;
UErrorCode status = U_ILLEGAL_ARGUMENT_ERROR;
+ int32_t i, remaining;
+
+ UTRACE_ENTRY_OC(UTRACE_UCNV_FLUSH_CACHE);
/* Close the default converter without creating a new one so that everything will be flushed. */
ucnv_close(u_getDefaultConverter(&status));
@@ -804,8 +925,10 @@ ucnv_flushCache ()
/*if shared data hasn't even been lazy evaluated yet
* return 0
*/
- if (SHARED_DATA_HASHTABLE == NULL)
+ if (SHARED_DATA_HASHTABLE == NULL) {
+ UTRACE_EXIT_VALUE((int32_t)0);
return 0;
+ }
/*creates an enumeration to iterate through every element in the
* table
@@ -819,25 +942,356 @@ ucnv_flushCache ()
* is protected by cnvCacheMutex.
*/
umtx_lock(&cnvCacheMutex);
- while ((e = uhash_nextElement (SHARED_DATA_HASHTABLE, &pos)) != NULL)
- {
- mySharedData = (UConverterSharedData *) e->value.pointer;
- /*deletes only if reference counter == 0 */
- if (mySharedData->referenceCounter == 0)
+ /*
+ * double loop: A delta/extension-only converter has a pointer to its base table's
+ * shared data; the first iteration of the outer loop may see the delta converter
+ * before the base converter, and unloading the delta converter may get the base
+ * converter's reference counter down to 0.
+ */
+ i = 0;
+ do {
+ remaining = 0;
+ pos = -1;
+ while ((e = uhash_nextElement (SHARED_DATA_HASHTABLE, &pos)) != NULL)
{
- tableDeletedNum++;
-
- UCNV_DEBUG_LOG("del",mySharedData->staticData->name,mySharedData);
-
- uhash_removeElement(SHARED_DATA_HASHTABLE, e);
- mySharedData->sharedDataCached = FALSE;
- ucnv_deleteSharedConverterData (mySharedData);
+ mySharedData = (UConverterSharedData *) e->value.pointer;
+ /*deletes only if reference counter == 0 */
+ if (mySharedData->referenceCounter == 0)
+ {
+ tableDeletedNum++;
+
+ UCNV_DEBUG_LOG("del",mySharedData->staticData->name,mySharedData);
+
+ uhash_removeElement(SHARED_DATA_HASHTABLE, e);
+ mySharedData->sharedDataCached = FALSE;
+ ucnv_deleteSharedConverterData (mySharedData);
+ } else {
+ ++remaining;
+ }
}
- }
+ } while(++i == 1 && remaining > 0);
umtx_unlock(&cnvCacheMutex);
+ UTRACE_DATA1(UTRACE_INFO, "ucnv_flushCache() exits with %d converters remaining", remaining);
+
ucnv_io_flushAvailableConverterCache();
+ UTRACE_EXIT_VALUE(tableDeletedNum);
return tableDeletedNum;
}
+/* data swapping ------------------------------------------------------------ */
+
+/* most of this might belong more properly into ucnvmbcs.c, but that is so large */
+
+#if !UCONFIG_NO_LEGACY_CONVERSION
+
+U_CAPI int32_t U_EXPORT2
+ucnv_swap(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode) {
+ const UDataInfo *pInfo;
+ int32_t headerSize;
+
+ const uint8_t *inBytes;
+ uint8_t *outBytes;
+
+ uint32_t offset, count, staticDataSize;
+ int32_t size;
+
+ const UConverterStaticData *inStaticData;
+ UConverterStaticData *outStaticData;
+
+ const _MBCSHeader *inMBCSHeader;
+ _MBCSHeader *outMBCSHeader;
+ _MBCSHeader mbcsHeader;
+ uint8_t outputType;
+
+ const int32_t *inExtIndexes;
+ int32_t extOffset;
+
+ /* udata_swapDataHeader checks the arguments */
+ headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+
+ /* check data format and format version */
+ pInfo=(const UDataInfo *)((const char *)inData+4);
+ if(!(
+ pInfo->dataFormat[0]==0x63 && /* dataFormat="cnvt" */
+ pInfo->dataFormat[1]==0x6e &&
+ pInfo->dataFormat[2]==0x76 &&
+ pInfo->dataFormat[3]==0x74 &&
+ pInfo->formatVersion[0]==6 &&
+ pInfo->formatVersion[1]>=2
+ )) {
+ udata_printError(ds, "ucnv_swap(): data format %02x.%02x.%02x.%02x (format version %02x.%02x) is not recognized as an ICU .cnv conversion table\n",
+ pInfo->dataFormat[0], pInfo->dataFormat[1],
+ pInfo->dataFormat[2], pInfo->dataFormat[3],
+ pInfo->formatVersion[0], pInfo->formatVersion[1]);
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+
+ inBytes=(const uint8_t *)inData+headerSize;
+ outBytes=(uint8_t *)outData+headerSize;
+
+ /* read the initial UConverterStaticData structure after the UDataInfo header */
+ inStaticData=(const UConverterStaticData *)inBytes;
+ outStaticData=(UConverterStaticData *)outBytes;
+
+ if(length<0) { /* negative length: nothing is swapped below, only the total size is computed and returned */
+ staticDataSize=ds->readUInt32(inStaticData->structSize);
+ } else {
+ length-=headerSize;
+ if( length<(int32_t)sizeof(UConverterStaticData) ||
+ (uint32_t)length<(staticDataSize=ds->readUInt32(inStaticData->structSize))
+ ) {
+ udata_printError(ds, "ucnv_swap(): too few bytes (%d after header) for an ICU .cnv conversion table\n",
+ length);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+ }
+
+ if(length>=0) {
+ /* swap the static data */
+ if(inStaticData!=outStaticData) {
+ uprv_memcpy(outStaticData, inStaticData, staticDataSize);
+ }
+
+ ds->swapArray32(ds, &inStaticData->structSize, 4,
+ &outStaticData->structSize, pErrorCode);
+ ds->swapArray32(ds, &inStaticData->codepage, 4,
+ &outStaticData->codepage, pErrorCode);
+
+ ds->swapInvChars(ds, inStaticData->name, uprv_strlen(inStaticData->name),
+ outStaticData->name, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ udata_printError(ds, "ucnv_swap(): error swapping converter name - %s\n",
+ u_errorName(*pErrorCode));
+ return 0;
+ }
+ }
+
+ inBytes+=staticDataSize;
+ outBytes+=staticDataSize;
+ if(length>=0) {
+ length-=(int32_t)staticDataSize;
+ }
+
+ /* check for supported conversionType values */
+ if(inStaticData->conversionType==UCNV_MBCS) {
+ /* swap MBCS data */
+ inMBCSHeader=(const _MBCSHeader *)inBytes;
+ outMBCSHeader=(_MBCSHeader *)outBytes;
+
+ if(!(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1)) { /* require version 4.x with x>=1; both conditions must hold */
+ udata_printError(ds, "ucnv_swap(): unsupported _MBCSHeader.version %d.%d\n",
+ inMBCSHeader->version[0], inMBCSHeader->version[1]);
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+
+ uprv_memcpy(mbcsHeader.version, inMBCSHeader->version, 4);
+ mbcsHeader.countStates= ds->readUInt32(inMBCSHeader->countStates);
+ mbcsHeader.countToUFallbacks= ds->readUInt32(inMBCSHeader->countToUFallbacks);
+ mbcsHeader.offsetToUCodeUnits= ds->readUInt32(inMBCSHeader->offsetToUCodeUnits);
+ mbcsHeader.offsetFromUTable= ds->readUInt32(inMBCSHeader->offsetFromUTable);
+ mbcsHeader.offsetFromUBytes= ds->readUInt32(inMBCSHeader->offsetFromUBytes);
+ mbcsHeader.flags= ds->readUInt32(inMBCSHeader->flags);
+ mbcsHeader.fromUBytesLength= ds->readUInt32(inMBCSHeader->fromUBytesLength);
+
+ extOffset=(int32_t)mbcsHeader.flags>>8;
+ outputType=(uint8_t)mbcsHeader.flags;
+
+ /* make sure that the output type is known */
+ switch(outputType) {
+ case MBCS_OUTPUT_1:
+ case MBCS_OUTPUT_2:
+ case MBCS_OUTPUT_3:
+ case MBCS_OUTPUT_4:
+ case MBCS_OUTPUT_3_EUC:
+ case MBCS_OUTPUT_4_EUC:
+ case MBCS_OUTPUT_2_SISO:
+ case MBCS_OUTPUT_EXT_ONLY:
+ /* OK */
+ break;
+ default:
+ udata_printError(ds, "ucnv_swap(): unsupported MBCS output type 0x%x\n",
+ outputType);
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+
+ /* calculate the length of the MBCS data */
+ if(extOffset==0) {
+ size=(int32_t)(mbcsHeader.offsetFromUBytes+mbcsHeader.fromUBytesLength);
+
+ /* avoid compiler warnings - not otherwise necessary, and the value does not matter */
+ inExtIndexes=NULL;
+ } else {
+ /* there is extension data after the base data, see ucnv_ext.h */
+ if(length>=0 && length<(extOffset+UCNV_EXT_INDEXES_MIN_LENGTH*4)) {
+ udata_printError(ds, "ucnv_swap(): too few bytes (%d after headers) for an ICU MBCS .cnv conversion table with extension data\n",
+ length);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ inExtIndexes=(const int32_t *)(inBytes+extOffset);
+ size=extOffset+udata_readInt32(ds, inExtIndexes[UCNV_EXT_SIZE]);
+ }
+
+ if(length>=0) {
+ if(length<size) {
+ udata_printError(ds, "ucnv_swap(): too few bytes (%d after headers) for an ICU MBCS .cnv conversion table\n",
+ length);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ /* copy the data for inaccessible bytes */
+ if(inBytes!=outBytes) {
+ uprv_memcpy(outBytes, inBytes, size);
+ }
+
+ /* swap the _MBCSHeader */
+ ds->swapArray32(ds, &inMBCSHeader->countStates, 7*4,
+ &outMBCSHeader->countStates, pErrorCode);
+
+ if(outputType==MBCS_OUTPUT_EXT_ONLY) {
+ /*
+ * extension-only file,
+ * contains a base name instead of normal base table data
+ */
+
+ /* swap the base name, between the header and the extension data */
+ ds->swapInvChars(ds, inMBCSHeader+1, uprv_strlen((const char *)(inMBCSHeader+1)),
+ outMBCSHeader+1, pErrorCode);
+ } else {
+ /* normal file with base table data */
+
+ /* swap the state table, 1kB per state */
+ ds->swapArray32(ds, inMBCSHeader+1, (int32_t)(mbcsHeader.countStates*1024),
+ outMBCSHeader+1, pErrorCode);
+
+ /* swap the toUFallbacks[] */
+ offset=sizeof(_MBCSHeader)+mbcsHeader.countStates*1024;
+ ds->swapArray32(ds, inBytes+offset, (int32_t)(mbcsHeader.countToUFallbacks*8),
+ outBytes+offset, pErrorCode);
+
+ /* swap the unicodeCodeUnits[] */
+ offset=mbcsHeader.offsetToUCodeUnits;
+ count=mbcsHeader.offsetFromUTable-offset;
+ ds->swapArray16(ds, inBytes+offset, (int32_t)count,
+ outBytes+offset, pErrorCode);
+
+ /* offset to the stage 1 table, independent of the outputType */
+ offset=mbcsHeader.offsetFromUTable;
+
+ if(outputType==MBCS_OUTPUT_1) {
+ /* SBCS: swap the fromU tables, all 16 bits wide */
+ count=(mbcsHeader.offsetFromUBytes-offset)+mbcsHeader.fromUBytesLength;
+ ds->swapArray16(ds, inBytes+offset, (int32_t)count,
+ outBytes+offset, pErrorCode);
+ } else {
+ /* otherwise: swap the stage tables separately */
+
+ /* stage 1 table: uint16_t[0x440 or 0x40] */
+ if(inStaticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
+ count=0x440*2; /* for all of Unicode */
+ } else {
+ count=0x40*2; /* only BMP */
+ }
+ ds->swapArray16(ds, inBytes+offset, (int32_t)count,
+ outBytes+offset, pErrorCode);
+
+ /* stage 2 table: uint32_t[] */
+ offset+=count;
+ count=mbcsHeader.offsetFromUBytes-offset;
+ ds->swapArray32(ds, inBytes+offset, (int32_t)count,
+ outBytes+offset, pErrorCode);
+
+ /* stage 3/result bytes: sometimes uint16_t[] or uint32_t[] */
+ offset=mbcsHeader.offsetFromUBytes;
+ count=mbcsHeader.fromUBytesLength;
+ switch(outputType) {
+ case MBCS_OUTPUT_2:
+ case MBCS_OUTPUT_3_EUC:
+ case MBCS_OUTPUT_2_SISO:
+ ds->swapArray16(ds, inBytes+offset, (int32_t)count,
+ outBytes+offset, pErrorCode);
+ break;
+ case MBCS_OUTPUT_4:
+ ds->swapArray32(ds, inBytes+offset, (int32_t)count,
+ outBytes+offset, pErrorCode);
+ break;
+ default:
+ /* just uint8_t[], nothing to swap */
+ break;
+ }
+ }
+ }
+
+ if(extOffset!=0) {
+ /* swap the extension data */
+ inBytes+=extOffset;
+ outBytes+=extOffset;
+
+ /* swap toUTable[] */
+ offset=udata_readInt32(ds, inExtIndexes[UCNV_EXT_TO_U_INDEX]);
+ length=udata_readInt32(ds, inExtIndexes[UCNV_EXT_TO_U_LENGTH]);
+ ds->swapArray32(ds, inBytes+offset, length*4, outBytes+offset, pErrorCode);
+
+ /* swap toUUChars[] */
+ offset=udata_readInt32(ds, inExtIndexes[UCNV_EXT_TO_U_UCHARS_INDEX]);
+ length=udata_readInt32(ds, inExtIndexes[UCNV_EXT_TO_U_UCHARS_LENGTH]);
+ ds->swapArray16(ds, inBytes+offset, length*2, outBytes+offset, pErrorCode);
+
+ /* swap fromUTableUChars[] */
+ offset=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_UCHARS_INDEX]);
+ length=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_LENGTH]);
+ ds->swapArray16(ds, inBytes+offset, length*2, outBytes+offset, pErrorCode);
+
+ /* swap fromUTableValues[] */
+ offset=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_VALUES_INDEX]);
+ /* same length as for fromUTableUChars[] */
+ ds->swapArray32(ds, inBytes+offset, length*4, outBytes+offset, pErrorCode);
+
+ /* no need to swap fromUBytes[] */
+
+ /* swap fromUStage12[] */
+ offset=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_STAGE_12_INDEX]);
+ length=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_STAGE_12_LENGTH]);
+ ds->swapArray16(ds, inBytes+offset, length*2, outBytes+offset, pErrorCode);
+
+ /* swap fromUStage3[] */
+ offset=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_STAGE_3_INDEX]);
+ length=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_STAGE_3_LENGTH]);
+ ds->swapArray16(ds, inBytes+offset, length*2, outBytes+offset, pErrorCode);
+
+ /* swap fromUStage3b[] */
+ offset=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_STAGE_3B_INDEX]);
+ length=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_STAGE_3B_LENGTH]);
+ ds->swapArray32(ds, inBytes+offset, length*4, outBytes+offset, pErrorCode);
+
+ /* swap indexes[] */
+ length=udata_readInt32(ds, inExtIndexes[UCNV_EXT_INDEXES_LENGTH]);
+ ds->swapArray32(ds, inBytes, length*4, outBytes, pErrorCode);
+ }
+ }
+ } else {
+ udata_printError(ds, "ucnv_swap(): unknown conversionType=%d!=UCNV_MBCS\n",
+ inStaticData->conversionType);
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+
+ return headerSize+(int32_t)staticDataSize+size; /* total size: data header + static data + MBCS data */
+}
+
+#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
+
+#endif
diff --git a/icuSources/common/ucnv_bld.h b/icuSources/common/ucnv_bld.h
index 1a467423..e3983ec4 100644
--- a/icuSources/common/ucnv_bld.h
+++ b/icuSources/common/ucnv_bld.h
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 1999-2003, International Business Machines
+* Copyright (C) 1999-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@@ -18,15 +18,25 @@
#define UCNV_BLD_H
#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
-
+#include "ucnv_cnv.h"
+#include "ucnvmbcs.h"
+#include "ucnv_ext.h"
+#include "udataswp.h"
/* size of the overflow buffers in UConverter, enough for escaping callbacks */
#define UCNV_ERROR_BUFFER_LENGTH 32
+/* at most 4 bytes per substitution character (part of .cnv file format! see UConverterStaticData) */
#define UCNV_MAX_SUBCHAR_LEN 4
+/* at most 8 bytes per character in toUBytes[] (UTF-8 uses up to 6) */
+#define UCNV_MAX_CHAR_LEN 8
+
/* converter options bits */
#define UCNV_OPTION_VERSION 0xf
#define UCNV_OPTION_SWAP_LFNL 0x10
@@ -37,7 +47,10 @@ U_CDECL_BEGIN /* We must declare the following as 'extern "C"' so that if ucnv
work.
*/
-union UConverterTable;
+union UConverterTable {
+ UConverterMBCSTable mbcs; /* the only member of the union as defined here */
+};
+
typedef union UConverterTable UConverterTable;
struct UConverterImpl;
@@ -59,7 +72,7 @@ typedef struct UConverterStaticData { /* +offset: size */
int8_t conversionType; /* +69: 1 conversion type */
int8_t minBytesPerChar; /* +70: 1 Minimum # bytes per char in this codepage */
- int8_t maxBytesPerChar; /* +71: 1 Maximum # bytes per char in this codepage */
+ int8_t maxBytesPerChar; /* +71: 1 Maximum # bytes output per UChar in this codepage */
uint8_t subChar[UCNV_MAX_SUBCHAR_LEN]; /* +72: 4 [note: 4 and 8 byte boundary] */
int8_t subCharLen; /* +76: 1 */
@@ -81,7 +94,7 @@ struct UConverterSharedData {
uint32_t referenceCounter; /* used to count number of clients, 0xffffffff for static SharedData */
const void *dataMemory; /* from udata_openChoice() - for cleanup */
- UConverterTable *table; /* Pointer to conversion data */
+ void *table; /* Unused. This used to be a UConverterTable - Pointer to conversion data - see mbcs below */
const UConverterStaticData *staticData; /* pointer to the static (non changing) data. */
@@ -92,9 +105,23 @@ struct UConverterSharedData {
/*initial values of some members of the mutable part of object */
uint32_t toUnicodeStatus;
-};
-typedef struct UConverterSharedData UConverterSharedData;
+ /*
+ * Shared data structures currently come in two flavors:
+ * - readonly for built-in algorithmic converters
+ * - allocated for MBCS, with a pointer to an allocated UConverterTable
+ * which always has a UConverterMBCSTable
+ *
+ * To eliminate one allocation, I am making the UConverterMBCSTable
+ * a member of the shared data. It is the last member so that static
+ * definitions of UConverterSharedData work as before.
+ * The table field above also remains to avoid updating all static
+ * definitions, but is now unused.
+ *
+ * markus 2003-nov-07
+ */
+ UConverterMBCSTable mbcs;
+};
/* Defines a UConverter, the lightweight mutable part the user sees */
@@ -112,7 +139,7 @@ struct UConverter {
UErrorCode *);
/*
* Error function pointer called when conversion issues
- * occur during a T_UConverter_toUnicode call
+ * occur during a ucnv_toUnicode call
*/
void (U_EXPORT2 *fromCharErrorBehaviour) (const void *context,
UConverterToUnicodeArgs *args,
@@ -140,11 +167,29 @@ struct UConverter {
UBool useFallback;
int8_t toULength; /* number of bytes in toUBytes */
- uint8_t toUBytes[7]; /* more "toU status"; keeps the bytes of the current character */
+ uint8_t toUBytes[UCNV_MAX_CHAR_LEN-1];/* more "toU status"; keeps the bytes of the current character */
uint32_t toUnicodeStatus; /* Used to internalize stream status information */
int32_t mode;
uint32_t fromUnicodeStatus;
- UChar fromUSurrogateLead; /* similar to toUBytes; keeps the lead surrogate of the current character */
+
+ /*
+ * More fromUnicode() status. Serves 3 purposes:
+ * - keeps a lead surrogate between buffers (similar to toUBytes[])
+ * - keeps a lead surrogate at the end of the stream,
+ * which the framework handles as truncated input
+ * - if the fromUnicode() implementation returns to the framework
+ * (ucnv.c ucnv_fromUnicode()), then the framework calls the callback
+ * for this code point
+ */
+ UChar32 fromUChar32;
+
+ /*
+ * value for ucnv_getMaxCharSize()
+ *
+ * usually simply copied from the static data, but ucnvmbcs.c modifies
+ * the value depending on the converter type and options
+ */
+ int8_t maxBytesPerUChar;
int8_t subCharLen; /* length of the codepage specific character sequence */
int8_t invalidCharLength;
@@ -154,26 +199,52 @@ struct UConverter {
int8_t UCharErrorBufferLength; /* number of valid UChars in charErrorBuffer */
uint8_t subChar1; /* single-byte substitution character if different from subChar */
+ UBool useSubChar1;
uint8_t subChar[UCNV_MAX_SUBCHAR_LEN]; /* codepage specific character sequence */
- char invalidCharBuffer[UCNV_MAX_SUBCHAR_LEN]; /* bytes from last error/callback situation */
+ char invalidCharBuffer[UCNV_MAX_CHAR_LEN]; /* bytes from last error/callback situation */
uint8_t charErrorBuffer[UCNV_ERROR_BUFFER_LENGTH]; /* codepage output from Error functions */
- UChar invalidUCharBuffer[3]; /* UChars from last error/callback situation */
+ UChar invalidUCharBuffer[U16_MAX_LENGTH]; /* UChars from last error/callback situation */
UChar UCharErrorBuffer[UCNV_ERROR_BUFFER_LENGTH]; /* unicode output from Error functions */
+ /* fields for conversion extension */
+
+ /* store previous UChars/chars to continue partial matches */
+ UChar32 preFromUFirstCP; /* >=0: partial match */
+ UChar preFromU[UCNV_EXT_MAX_UCHARS];
+ char preToU[UCNV_EXT_MAX_BYTES];
+ int8_t preFromULength, preToULength; /* negative: replay */
+ int8_t preToUFirstLength; /* length of first character */
};
U_CDECL_END /* end of UConverter */
-typedef struct
- {
- UConverter *OptGrpConverter[0x20]; /* Converter per Opt. grp. */
- uint8_t OptGroup; /* default Opt. grp. for this LMBCS session */
- uint8_t localeConverterIndex; /* reasonable locale match for index */
+#define CONVERTER_FILE_EXTENSION ".cnv"
+
+/**
+ * Load a non-algorithmic converter.
+ * If pkg==NULL, then this function must be called inside umtx_lock(&cnvCacheMutex).
+ */
+UConverterSharedData *
+ucnv_load(UConverterLoadArgs *pArgs, UErrorCode *err);
+
+/**
+ * Unload a non-algorithmic converter.
+ * Precondition: sharedData->referenceCounter != ~0 (~0 marks static shared data),
+ * and this function must be called inside umtx_lock(&cnvCacheMutex).
+ */
+void
+ucnv_unload(UConverterSharedData *sharedData);
- }
-UConverterDataLMBCS;
+/**
+ * Swap ICU .cnv conversion tables. See udataswp.h.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+ucnv_swap(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode);
-#define CONVERTER_FILE_EXTENSION ".cnv"
+#endif
#endif /* _UCNV_BLD */
diff --git a/icuSources/common/ucnv_cb.c b/icuSources/common/ucnv_cb.c
index 9cbf25e6..5038ab57 100644
--- a/icuSources/common/ucnv_cb.c
+++ b/icuSources/common/ucnv_cb.c
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 2000-2001, International Business Machines
+* Copyright (C) 2000-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* ucnv_cb.c:
@@ -19,6 +19,9 @@
*/
#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
#include "unicode/ucnv_cb.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
@@ -35,50 +38,16 @@ ucnv_cbFromUWriteBytes (UConverterFromUnicodeArgs *args,
int32_t offsetIndex,
UErrorCode * err)
{
- int32_t togo;
- int8_t toerr;
- int32_t i;
-
- if((args->targetLimit - args->target) >= length) /* If the buffer fits.. */
- {
- uprv_memcpy(args->target, source, length);
- args->target += length;
- if(args->offsets) /* set all the offsets to the same # */
- {
- for(i=0;ioffsets++) = offsetIndex;
- }
- }
+ if(U_FAILURE(*err)) {
+ return;
}
- else
- {
- togo = (int32_t)(args->targetLimit - args->target);
-
- uprv_memcpy(args->target, source, togo);
- args->target += togo;
- if(args->offsets)
- {
- for(i=0;ioffsets++) = offsetIndex;
- }
- }
-
- /* Now, copy the remainder into the errbuff */
- source += togo;
- toerr = (int8_t)(length - togo);
-
- uprv_memcpy(args->converter->charErrorBuffer +
- args->converter->charErrorBufferLength,
- source,
- toerr * sizeof(source[0]));
- args->converter->charErrorBufferLength += toerr;
-
- *err = U_BUFFER_OVERFLOW_ERROR;
-
- }
+ ucnv_fromUWriteBytes(
+ args->converter,
+ source, length,
+ &args->target, args->targetLimit,
+ &args->offsets, offsetIndex,
+ err);
}
U_CAPI void U_EXPORT2
@@ -232,55 +201,16 @@ ucnv_cbToUWriteUChars (UConverterToUnicodeArgs *args,
int32_t offsetIndex,
UErrorCode * err)
{
- int32_t togo;
- int8_t toerr;
- int32_t i;
-
- if(U_FAILURE(*err))
- {
+ if(U_FAILURE(*err)) {
return;
}
-
- if((args->targetLimit - args->target) >= length) /* If the buffer fits.. */
- {
- uprv_memcpy(args->target, source, length * sizeof(args->target[0]) );
- args->target += length;
- if(args->offsets) /* set all the offsets to the same # */
- {
- for(i=0;ioffsets++) = offsetIndex;
- }
- }
- }
- else
- {
- togo = (int32_t)(args->targetLimit - args->target);
-
- uprv_memcpy(args->target, source, togo * sizeof(args->target[0]) );
- args->target += togo;
-
- if(args->offsets)
- {
- for(i=0;ioffsets++) = offsetIndex;
- }
- }
-
- /* Now, copy the remainder into the errbuff */
- source += togo;
- toerr = (int8_t)(length - togo);
-
- uprv_memcpy(args->converter->UCharErrorBuffer +
- args->converter->UCharErrorBufferLength,
- source,
- toerr * sizeof(source[0]));
- args->converter->UCharErrorBufferLength += toerr;
-
- *err = U_BUFFER_OVERFLOW_ERROR;
- }
+ ucnv_toUWriteUChars(
+ args->converter,
+ source, length,
+ &args->target, args->targetLimit,
+ &args->offsets, offsetIndex,
+ err);
}
U_CAPI void U_EXPORT2
@@ -297,3 +227,5 @@ ucnv_cbToUWriteSub (UConverterToUnicodeArgs *args,
ucnv_cbToUWriteUChars(args, &kSubstituteChar, 1, offsetIndex, err);
}
}
+
+#endif
diff --git a/icuSources/common/ucnv_cnv.c b/icuSources/common/ucnv_cnv.c
index ae0d61ed..48c22010 100644
--- a/icuSources/common/ucnv_cnv.c
+++ b/icuSources/common/ucnv_cnv.c
@@ -1,7 +1,7 @@
/*
******************************************************************************
*
-* Copyright (C) 1999-2003, International Business Machines
+* Copyright (C) 1999-2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@@ -16,244 +16,159 @@
*/
#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
#include "unicode/ucnv_err.h"
#include "unicode/ucnv.h"
#include "unicode/uset.h"
#include "ucnv_cnv.h"
+#include "ucnv_bld.h"
#include "cmemory.h"
-/*Empties the internal unicode output buffer */
-void ucnv_flushInternalUnicodeBuffer (UConverter * _this,
- UChar * myTarget,
- int32_t * myTargetIndex,
- int32_t targetLength,
- int32_t** offsets,
- UErrorCode * err)
-{
- int32_t myUCharErrorBufferLength = _this->UCharErrorBufferLength;
-
- if (myUCharErrorBufferLength <= targetLength)
- {
- /*we have enough space
- *So we just copy the whole Error Buffer in to the output stream
- */
- uprv_memcpy (myTarget,
- _this->UCharErrorBuffer,
- sizeof (UChar) * myUCharErrorBufferLength);
- if (offsets)
- {
- int32_t i=0;
- for (i=0; iUCharErrorBufferLength = 0;
- }
- else
- {
- /* We don't have enough space so we copy as much as we can
- * on the output stream and update the object
- * by updating the internal buffer*/
- uprv_memcpy (myTarget, _this->UCharErrorBuffer, sizeof (UChar) * targetLength);
- if (offsets)
- {
- int32_t i=0;
- for (i=0; i< targetLength;i++) (*offsets)[i] = -1;
- *offsets += targetLength;
- }
- uprv_memmove (_this->UCharErrorBuffer,
- _this->UCharErrorBuffer + targetLength,
- sizeof (UChar) * (myUCharErrorBufferLength - targetLength));
- _this->UCharErrorBufferLength -= (int8_t) targetLength;
- *myTargetIndex = targetLength;
- *err = U_BUFFER_OVERFLOW_ERROR;
- }
+U_CFUNC void
+ucnv_getCompleteUnicodeSet(const UConverter *cnv,
+ USetAdder *sa,
+ UConverterUnicodeSet which,
+ UErrorCode *pErrorCode) {
+ sa->addRange(sa->set, 0, 0x10ffff);
+}
+
+U_CFUNC void
+ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv,
+ USetAdder *sa,
+ UConverterUnicodeSet which,
+ UErrorCode *pErrorCode) {
+ sa->addRange(sa->set, 0, 0xd7ff);
+ sa->addRange(sa->set, 0xe000, 0x10ffff);
}
-/*Empties the internal codepage output buffer */
-void ucnv_flushInternalCharBuffer (UConverter * _this,
- char *myTarget,
- int32_t * myTargetIndex,
- int32_t targetLength,
- int32_t** offsets,
- UErrorCode * err)
-{
- int32_t myCharErrorBufferLength = _this->charErrorBufferLength;
-
- /*we have enough space */
- if (myCharErrorBufferLength <= targetLength)
- {
- uprv_memcpy (myTarget, _this->charErrorBuffer, myCharErrorBufferLength);
- if (offsets)
- {
- int32_t i=0;
- for (i=0; i0 && tcharErrorBufferLength = 0;
+ } else {
+ /* output with offsets */
+ while(length>0 && tcharErrorBuffer, targetLength);
- if (offsets)
- {
- int32_t i=0;
- for (i=0; i< targetLength;i++) (*offsets)[i] = -1;
- *offsets += targetLength;
+ *target=t;
+
+ /* write overflow */
+ if(length>0) {
+ if(cnv!=NULL) {
+ t=(char *)cnv->charErrorBuffer;
+ cnv->charErrorBufferLength=(int8_t)length;
+ do {
+ *t++=(uint8_t)*bytes++;
+ } while(--length>0);
}
- uprv_memmove (_this->charErrorBuffer,
- _this->charErrorBuffer + targetLength,
- (myCharErrorBufferLength - targetLength));
- _this->charErrorBufferLength -= (int8_t) targetLength;
- *myTargetIndex = targetLength;
- *err = U_BUFFER_OVERFLOW_ERROR;
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
}
-/**
- * This function is useful for implementations of getNextUChar().
- * After a call to a callback function or to toUnicode(), an output buffer
- * begins with a Unicode code point that needs to be returned as UChar32,
- * and all following code units must be prepended to the - potentially
- * prefilled - overflow buffer in the UConverter.
- * The buffer should be at least of capacity UTF_MAX_CHAR_LENGTH so that a
- * complete UChar32's UChars fit into it.
- *
- * @param cnv The converter that will get remaining UChars copied to its overflow area.
- * @param buffer An array of UChars that was passed into a callback function
- * or a toUnicode() function.
- * @param length The number of code units (UChars) that are actually in the buffer.
- * This must be >0.
- * @return The code point from the first UChars in the buffer.
- */
-U_CFUNC UChar32
-ucnv_getUChar32KeepOverflow(UConverter *cnv, const UChar *buffer, int32_t length) {
- UChar32 c;
- int32_t i;
-
- if(length<=0) {
- return 0xffff;
+U_CFUNC void
+ucnv_toUWriteUChars(UConverter *cnv,
+ const UChar *uchars, int32_t length,
+ UChar **target, const UChar *targetLimit,
+ int32_t **offsets,
+ int32_t sourceIndex,
+ UErrorCode *pErrorCode) {
+ UChar *t=*target;
+ int32_t *o;
+
+ /* write UChars */
+ if(offsets==NULL || (o=*offsets)==NULL) {
+ while(length>0 && t0 && tUCharErrorBuffer;
- int32_t j=cnv->UCharErrorBufferLength;
-
- if(j>0) {
- /* move the overflow buffer contents to make room for the extra UChars */
- int32_t k;
-
- cnv->UCharErrorBufferLength=(int8_t)(k=(length-i)+j);
+ /* write overflow */
+ if(length>0) {
+ if(cnv!=NULL) {
+ t=cnv->UCharErrorBuffer;
+ cnv->UCharErrorBufferLength=(int8_t)length;
do {
- overflow[--k]=overflow[--j];
- } while(j>0);
- } else {
- cnv->UCharErrorBufferLength=(int8_t)(length-i);
+ *t++=*uchars++;
+ } while(--length>0);
}
-
- /* copy the remaining UChars to the beginning of the overflow buffer */
- do {
- overflow[j++]=buffer[i++];
- } while(i=0) {
- /* add the sourceIndex to the relative offsets that the callback wrote */
- while(length>0) {
- *offsets+=sourceIndex;
- ++offsets;
- --length;
- }
- } else {
- /* sourceIndex==-1, set -1 offsets */
- while(length>0) {
- *offsets=-1;
- ++offsets;
- --length;
+U_CFUNC void
+ucnv_toUWriteCodePoint(UConverter *cnv,
+ UChar32 c,
+ UChar **target, const UChar *targetLimit,
+ int32_t **offsets,
+ int32_t sourceIndex,
+ UErrorCode *pErrorCode) {
+ UChar *t;
+ int32_t *o;
+
+ t=*target;
+
+ if(tsourceLimit;
-
- pArgs->target=buffer;
- pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
-
- while(pArgs->sourcesourceLimit=pArgs->source+1;
- pArgs->flush= (UBool)(pArgs->sourceLimit==realLimit);
-
- /* convert this byte and check the result */
- toU(pArgs, pErrorCode);
- if(U_SUCCESS(*pErrorCode)) {
- int32_t length=(int32_t)(pArgs->target-buffer);
- /* this test is UTF-16 specific */
- if(/* some output and
- (source consumed or don't collect surrogate pairs or not a surrogate or a surrogate pair) */
- length>0 &&
- (pArgs->flush || !collectPairs || !UTF_IS_FIRST_SURROGATE(buffer[0]) || length==2)
- ) {
- return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, length);
+ /* write offsets */
+ if(offsets!=NULL && (o=*offsets)!=NULL) {
+ *o++=sourceIndex;
+ if((*target+1)converter, buffer, UTF_MAX_CHAR_LENGTH);
- } else {
- /* U_FAILURE() */
- return 0xffff;
+ *offsets=o;
}
}
- /* no output because of empty input or only state changes and skipping callbacks */
- *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return 0xffff;
-}
+ *target=t;
-U_CFUNC void
-ucnv_getCompleteUnicodeSet(const UConverter *cnv,
- USet *set,
- UConverterUnicodeSet which,
- UErrorCode *pErrorCode) {
- uset_addRange(set, 0, 0x10ffff);
+ /* write overflow from c */
+ if(c>=0) {
+ if(cnv!=NULL) {
+ int8_t i=0;
+ U16_APPEND_UNSAFE(cnv->UCharErrorBuffer, i, c);
+ cnv->UCharErrorBufferLength=i;
+ }
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ }
}
-U_CFUNC void
-ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv,
- USet *set,
- UConverterUnicodeSet which,
- UErrorCode *pErrorCode) {
- uset_addRange(set, 0, 0xd7ff);
- uset_addRange(set, 0xe000, 0x10ffff);
-}
+#endif
diff --git a/icuSources/common/ucnv_cnv.h b/icuSources/common/ucnv_cnv.h
index a4f468ea..e0692a50 100644
--- a/icuSources/common/ucnv_cnv.h
+++ b/icuSources/common/ucnv_cnv.h
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 1999-2003, International Business Machines
+* Copyright (C) 1999-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@@ -19,16 +19,13 @@
#define UCNV_CNV_H
#include "unicode/utypes.h"
-#include "unicode/ucnv.h"
-#include "unicode/ucnv_err.h"
-#include "ucnv_bld.h"
-#include "ucnvmbcs.h"
-union UConverterTable
- {
- UConverterMBCSTable mbcs;
- };
+#if !UCONFIG_NO_CONVERSION
+#include "unicode/ucnv.h"
+#include "unicode/ucnv_err.h"
+#include "unicode/uset.h"
+#include "uset_imp.h"
U_CDECL_BEGIN
@@ -38,14 +35,29 @@ U_CDECL_BEGIN
/*
* #define missingUCharMarker 0xfffe
*
- * there are actually two values used in toUnicode tables:
+ * commented out because there are actually two values used in toUnicode tables:
* U+fffe "unassigned"
* U+ffff "illegal"
*/
+/** Forward declaration, see ucnv_bld.h */
+struct UConverterSharedData;
+typedef struct UConverterSharedData UConverterSharedData;
+
+/* function types for UConverterImpl ---------------------------------------- */
+/* struct with arguments for UConverterLoad and ucnv_load() */
+typedef struct {
+ int32_t size; /* sizeof(UConverterLoadArgs) */
+ int32_t nestedLoads; /* count nested ucnv_load() calls */
+ int32_t reserved; /* reserved - for good alignment of the pointers */
+ uint32_t options;
+ const char *pkg, *name;
+} UConverterLoadArgs;
-typedef void (*UConverterLoad) (UConverterSharedData *sharedData, const uint8_t *raw, UErrorCode *pErrorCode);
+typedef void (*UConverterLoad) (UConverterSharedData *sharedData,
+ UConverterLoadArgs *pArgs,
+ const uint8_t *raw, UErrorCode *pErrorCode);
typedef void (*UConverterUnload) (UConverterSharedData *sharedData);
typedef void (*UConverterOpen) (UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *pErrorCode);
@@ -59,11 +71,60 @@ typedef enum UConverterResetChoice {
typedef void (*UConverterReset) (UConverter *cnv, UConverterResetChoice choice);
-typedef void (*T_ToUnicodeFunction) (UConverterToUnicodeArgs *, UErrorCode *);
+/*
+ * Converter implementation function(s) for ucnv_toUnicode().
+ * If the toUnicodeWithOffsets function pointer is NULL,
+ * then the toUnicode function will be used and the offsets will be set to -1.
+ *
+ * Must maintain state across buffers. Use toUBytes[toULength] for partial input
+ * sequences; it will be checked in ucnv.c at the end of the input stream
+ * to detect truncated input.
+ * Some converters may need additional detection and may then set U_TRUNCATED_CHAR_FOUND.
+ *
+ * The toUnicodeWithOffsets must write exactly as many offset values as target
+ * units. Write offset values of -1 for when the source index corresponding to
+ * the output unit is not known (e.g., the character started in an earlier buffer).
+ * The pArgs->offsets pointer need not be moved forward.
+ *
+ * At function return, either one of the following conditions must be true:
+ * - U_BUFFER_OVERFLOW_ERROR and the target is full: target==targetLimit
+ * - another error code with toUBytes[toULength] set to the offending input
+ * - no error, and the source is consumed: source==sourceLimit
+ *
+ * The ucnv.c code will handle the end of the input
+ * (reset, and truncation detection) and callbacks.
+ */
+typedef void (*UConverterToUnicode) (UConverterToUnicodeArgs *, UErrorCode *);
-typedef void (*T_FromUnicodeFunction) (UConverterFromUnicodeArgs *, UErrorCode *);
+/*
+ * Same rules as for UConverterToUnicode.
+ * A lead surrogate is kept in fromUChar32 across buffers, and if an error
+ * occurs, then the offending input code point must be put into fromUChar32
+ * as well.
+ */
+typedef void (*UConverterFromUnicode) (UConverterFromUnicodeArgs *, UErrorCode *);
-typedef UChar32 (*T_GetNextUCharFunction) (UConverterToUnicodeArgs *, UErrorCode *);
+/*
+ * Converter implementation function for ucnv_getNextUChar().
+ * If the function pointer is NULL, then the toUnicode function will be used.
+ *
+ * Will be called at a character boundary (toULength==0).
+ * May return with
+ * - U_INDEX_OUTOFBOUNDS_ERROR if there was no output for the input
+ * (the return value will be ignored)
+ * - U_TRUNCATED_CHAR_FOUND or another error code (never U_BUFFER_OVERFLOW_ERROR!)
+ * with toUBytes[toULength] set to the offending input
+ * (the return value will be ignored)
+ * - return UCNV_GET_NEXT_UCHAR_USE_TO_U, without moving the source pointer,
+ * to indicate that the ucnv.c code shall call the toUnicode function instead
+ * - return a real code point result
+ *
+ * Unless UCNV_GET_NEXT_UCHAR_USE_TO_U is returned, the source bytes must be consumed.
+ *
+ * The ucnv.c code will handle the end of the input (reset)
+ * (except for truncation detection!) and callbacks.
+ */
+typedef UChar32 (*UConverterGetNextUChar) (UConverterToUnicodeArgs *, UErrorCode *);
typedef void (*UConverterGetStarters)(const UConverter* converter,
UBool starters[256],
@@ -110,26 +171,12 @@ typedef UConverter * (*UConverterSafeClone) (const UConverter *cnv,
* For more documentation, see ucnv_getUnicodeSet() in ucnv.h.
*/
typedef void (*UConverterGetUnicodeSet) (const UConverter *cnv,
- USet *set,
+ USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode);
UBool CONVERSION_U_SUCCESS (UErrorCode err);
-void ucnv_flushInternalUnicodeBuffer (UConverter * _this,
- UChar * myTarget,
- int32_t * myTargetIndex,
- int32_t targetLength,
- int32_t** offsets,
- UErrorCode * err);
-
-void ucnv_flushInternalCharBuffer (UConverter * _this,
- char *myTarget,
- int32_t * myTargetIndex,
- int32_t targetLength,
- int32_t** offsets,
- UErrorCode * err);
-
/**
* UConverterImpl contains all the data and functions for a converter type.
* Its function pointers work much like a C++ vtable.
@@ -156,11 +203,11 @@ struct UConverterImpl {
UConverterClose close;
UConverterReset reset;
- T_ToUnicodeFunction toUnicode;
- T_ToUnicodeFunction toUnicodeWithOffsets;
- T_FromUnicodeFunction fromUnicode;
- T_FromUnicodeFunction fromUnicodeWithOffsets;
- T_GetNextUCharFunction getNextUChar;
+ UConverterToUnicode toUnicode;
+ UConverterToUnicode toUnicodeWithOffsets;
+ UConverterFromUnicode fromUnicode;
+ UConverterFromUnicode fromUnicodeWithOffsets;
+ UConverterGetNextUChar getNextUChar;
UConverterGetStarters getStarters;
UConverterGetName getName;
@@ -180,40 +227,6 @@ extern const UConverterSharedData
U_CDECL_END
-/**
- * This function is useful for implementations of getNextUChar().
- * After a call to a callback function or to toUnicode(), an output buffer
- * begins with a Unicode code point that needs to be returned as UChar32,
- * and all following code units must be prepended to the - potentially
- * prefilled - overflow buffer in the UConverter.
- * The buffer should be at least of capacity UTF_MAX_CHAR_LENGTH so that a
- * complete UChar32's UChars fit into it.
- *
- * @param cnv The converter that will get remaining UChars copied to its overflow area.
- * @param buffer An array of UChars that was passed into a callback function
- * or a toUnicode() function.
- * @param length The number of code units (UChars) that are actually in the buffer.
- * This must be >0.
- * @return The code point from the first UChars in the buffer.
- */
-U_CFUNC UChar32
-ucnv_getUChar32KeepOverflow(UConverter *cnv, const UChar *buffer, int32_t length);
-
-/**
- * This helper function updates the offsets array after a callback function call.
- * It adds the sourceIndex to each offsets item, or sets each of them to -1 if
- * sourceIndex==-1.
- *
- * @param offsets The pointer to offsets entry that corresponds to the first target
- * unit that the callback wrote.
- * @param length The number of output units that the callback wrote.
- * @param sourceIndex The sourceIndex of the input sequence that the callback
- * function was called for.
- * @return offsets+length if offsets!=NULL, otherwise NULL
- */
-U_CFUNC int32_t *
-ucnv_updateCallbackOffsets(int32_t *offsets, int32_t length, int32_t sourceIndex);
-
/** Always use fallbacks from codepage to Unicode */
#define TO_U_USE_FALLBACK(useFallback) TRUE
#define UCNV_TO_U_USE_FALLBACK(cnv) TRUE
@@ -224,41 +237,48 @@ ucnv_updateCallbackOffsets(int32_t *offsets, int32_t length, int32_t sourceIndex
#define UCNV_FROM_U_USE_FALLBACK(cnv, c) FROM_U_USE_FALLBACK((cnv)->useFallback, c)
/**
- * This is a simple implementation of ucnv_getNextUChar() that uses the
- * converter's toUnicode() function.
- *
- * \par
- * A surrogate pair from a single byte sequence is always
- * combined to a supplementary code point.
- * A surrogate pair from consecutive byte sequences is only combined
- * if collectPairs is set. This is necessary for SCSU
- * but not allowed for most legacy codepages.
- *
- * @param pArgs The argument structure supplied by ucnv_getNextUChar()
- * @param toU A function pointer to the converter's toUnicode() function
- * @param collectPairs indicates whether separate surrogate results from
- * consecutive byte sequences should be combined into
- * a single code point
- * @param pErrorCode An ICU error code parameter
- * @return The Unicode code point as a result of a conversion of a minimal
- * number of input bytes
+ * Magic number for ucnv_getNextUChar(), returned by a
+ * getNextUChar() implementation to indicate to use the converter's toUnicode()
+ * instead of the native function.
+ * @internal
*/
-U_CFUNC UChar32
-ucnv_getNextUCharFromToUImpl(UConverterToUnicodeArgs *pArgs,
- T_ToUnicodeFunction toU,
- UBool collectPairs,
- UErrorCode *pErrorCode);
+#define UCNV_GET_NEXT_UCHAR_USE_TO_U -9
U_CFUNC void
ucnv_getCompleteUnicodeSet(const UConverter *cnv,
- USet *set,
+ USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode);
U_CFUNC void
ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv,
- USet *set,
+ USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode);
+U_CFUNC void
+ucnv_fromUWriteBytes(UConverter *cnv,
+ const char *bytes, int32_t length,
+ char **target, const char *targetLimit,
+ int32_t **offsets,
+ int32_t sourceIndex,
+ UErrorCode *pErrorCode);
+U_CFUNC void
+ucnv_toUWriteUChars(UConverter *cnv,
+ const UChar *uchars, int32_t length,
+ UChar **target, const UChar *targetLimit,
+ int32_t **offsets,
+ int32_t sourceIndex,
+ UErrorCode *pErrorCode);
+
+U_CFUNC void
+ucnv_toUWriteCodePoint(UConverter *cnv,
+ UChar32 c,
+ UChar **target, const UChar *targetLimit,
+ int32_t **offsets,
+ int32_t sourceIndex,
+ UErrorCode *pErrorCode);
+
+#endif
+
#endif /* UCNV_CNV */
diff --git a/icuSources/common/ucnv_err.c b/icuSources/common/ucnv_err.c
index be7ef6e1..d28e7495 100644
--- a/icuSources/common/ucnv_err.c
+++ b/icuSources/common/ucnv_err.c
@@ -1,7 +1,7 @@
/*
*****************************************************************************
*
- * Copyright (C) 1998-2003, International Business Machines
+ * Copyright (C) 1998-2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
*****************************************************************************
@@ -15,6 +15,10 @@
* 06/29/2000 helena Major rewrite of the callback APIs.
*/
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
#include "unicode/ucnv_err.h"
#include "unicode/ucnv_cb.h"
#include "ucnv_cnv.h"
@@ -453,3 +457,5 @@ UCNV_TO_U_CALLBACK_ESCAPE (
ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
}
+
+#endif
diff --git a/icuSources/common/ucnv_ext.c b/icuSources/common/ucnv_ext.c
new file mode 100644
index 00000000..ed1bc65d
--- /dev/null
+++ b/icuSources/common/ucnv_ext.c
@@ -0,0 +1,1075 @@
+/*
+******************************************************************************
+*
+* Copyright (C) 2003-2004, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+******************************************************************************
+* file name: ucnv_ext.c
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2003jun13
+* created by: Markus W. Scherer
+*
+* Conversion extensions
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
+
+#include "unicode/uset.h"
+#include "ucnv_bld.h"
+#include "ucnv_cnv.h"
+#include "ucnv_ext.h"
+#include "cmemory.h"
+
+/* to Unicode --------------------------------------------------------------- */
+
+/*
+ * @return lookup value for the byte, if found; else 0
+ */
+static U_INLINE uint32_t
+ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) {
+ uint32_t word0, word;
+ int32_t i, start, limit;
+
+ /* check the input byte against the lowest and highest section bytes */
+ start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]);
+ limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]);
+ if(byte1) {
+ return 0; /* no match of a DBCS sequence in SBCS mode */
+ } else if(preLength==1) {
+ srcLength=0;
+ } else /* preLength==0 */ {
+ if(srcLength>1) {
+ srcLength=1;
+ }
+ }
+ flush=TRUE;
+ }
+
+ /* we must not remember fallback matches when not using fallbacks */
+
+ /* match input units until there is a full match or the input is consumed */
+ for(;;) {
+ /* go to the next section */
+ toUSection=toUTable+index;
+
+ /* read first pair of the section */
+ value=*toUSection++;
+ length=UCNV_EXT_TO_U_GET_BYTE(value);
+ value=UCNV_EXT_TO_U_GET_VALUE(value);
+ if( value!=0 &&
+ (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
+ TO_U_USE_FALLBACK(useFallback)) &&
+ UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
+ ) {
+ /* remember longest match so far */
+ matchValue=value;
+ matchLength=i+j;
+ }
+
+ /* match pre[] then src[] */
+ if(iUCNV_EXT_MAX_BYTES) {
+ /*
+ * end of the entire input stream, stop with the longest match so far
+ * or: partial match must not be longer than UCNV_EXT_MAX_BYTES
+ * because it must fit into state buffers
+ */
+ break;
+ } else {
+ /* continue with more input next time */
+ return -length;
+ }
+ }
+
+ /* search for the current byte */
+ value=ucnv_extFindToU(toUSection, length, b);
+ if(value==0) {
+ /* no match here, stop with the longest match so far */
+ break;
+ } else {
+ if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
+ /* partial match, continue */
+ index=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value);
+ } else {
+ if( (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
+ TO_U_USE_FALLBACK(useFallback)) &&
+ UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
+ ) {
+ /* full match, stop with result */
+ matchValue=value;
+ matchLength=i+j;
+ } else {
+ /* full match on fallback not taken, stop with the longest match so far */
+ }
+ break;
+ }
+ }
+ }
+
+ if(matchLength==0) {
+ /* no match at all */
+ return 0;
+ }
+
+ /* return result */
+ *pMatchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue);
+ return matchLength;
+}
+
+static U_INLINE void
+ucnv_extWriteToU(UConverter *cnv, const int32_t *cx,
+ uint32_t value,
+ UChar **target, const UChar *targetLimit,
+ int32_t **offsets, int32_t srcIndex,
+ UErrorCode *pErrorCode) {
+ /*
+ * output the result of a toUnicode extension match
+ *
+ * value is the match value; it either encodes a single code point
+ * or the index and length of a UChar string in the array that
+ * UCNV_EXT_ARRAY() locates via cx and UCNV_EXT_TO_U_UCHARS_INDEX.
+ * cnv, target, targetLimit, offsets, srcIndex and pErrorCode are passed
+ * through unchanged to whichever write function is called below.
+ */
+ if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
+ /* output a single code point */
+ ucnv_toUWriteCodePoint(
+ cnv, UCNV_EXT_TO_U_GET_CODE_POINT(value),
+ target, targetLimit,
+ offsets, srcIndex,
+ pErrorCode);
+ } else {
+ /*
+ * output a string - with correct data we have resultLength>0
+ * (the string starts at the index and has the length encoded in value)
+ */
+ ucnv_toUWriteUChars(
+ cnv,
+ UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, UChar)+
+ UCNV_EXT_TO_U_GET_INDEX(value),
+ UCNV_EXT_TO_U_GET_LENGTH(value),
+ target, targetLimit,
+ offsets, srcIndex,
+ pErrorCode);
+ }
+}
+
+/*
+ * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS),
+ * or 1 for DBCS-only,
+ * or -1 if the converter is not SI/SO stateful
+ *
+ * Note: For SI/SO stateful converters getting here,
+ * cnv->mode==0 is equivalent to firstLength==1.
+ *
+ * For MBCS_OUTPUT_2_SISO the state is read from cnv->mode and cast to int8_t;
+ * for MBCS_OUTPUT_DBCS_ONLY the result is the constant 1;
+ * every other outputType yields -1.
+ *
+ * The macro evaluates its argument more than once,
+ * so cnv must be an expression without side effects.
+ */
+#define UCNV_SISO_STATE(cnv) \
+ ((cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO ? (int8_t)(cnv)->mode : \
+ (cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 1 : -1)
+
+/*
+ * targettoUBytes, firstLength,
+ *src, (int32_t)(srcLimit-*src),
+ &value,
+ cnv->useFallback, flush);
+ if(match>0) {
+ /* advance src pointer for the consumed input */
+ *src+=match-firstLength;
+
+ /* write result to target */
+ ucnv_extWriteToU(cnv, cx,
+ value,
+ target, targetLimit,
+ offsets, srcIndex,
+ pErrorCode);
+ return TRUE;
+ } else if(match<0) {
+ /* save state for partial match */
+ const char *s;
+ int32_t j;
+
+ /* copy the first code point */
+ s=(const char *)cnv->toUBytes;
+ cnv->preToUFirstLength=(int8_t)firstLength;
+ for(j=0; jpreToU[j]=*s++;
+ }
+
+ /* now copy the newly consumed input */
+ s=*src;
+ match=-match;
+ for(; jpreToU[j]=*s++;
+ }
+ *src=s; /* same as *src=srcLimit; because we reached the end of input */
+ cnv->preToULength=(int8_t)match;
+ return TRUE;
+ } else /* match==0 no match */ {
+ return FALSE;
+ }
+}
+
+U_CFUNC UChar32
+ucnv_extSimpleMatchToU(const int32_t *cx,
+ const char *source, int32_t length,
+ UBool useFallback) {
+ uint32_t value;
+ int32_t match;
+
+ if(length<=0) {
+ return 0xffff; /* no input at all; the "no match" result at the end is 0xfffe instead */
+ }
+
+ /*
+ * try to match
+ *
+ * This call passes -1 as the SI/SO state, source[] with its length as the
+ * only input (the second input array is NULL with length 0),
+ * and TRUE for flush.
+ */
+ match=ucnv_extMatchToU(cx, -1,
+ source, length,
+ NULL, 0,
+ &value,
+ useFallback, TRUE);
+ if(match==length) {
+ /*
+ * write result for simple, single-character conversion:
+ * all of source[] matched; return the code point if the result is one
+ */
+ if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
+ return UCNV_EXT_TO_U_GET_CODE_POINT(value);
+ }
+ }
+
+ /*
+ * return no match because
+ * - match>0 && value points to string: simple conversion cannot handle multiple code points
+ * - match>0 && match!=length: not all input consumed, forbidden for this function
+ * - match==0: no match found in the first place
+ * - match<0: partial match, not supported for simple conversion (and flush==TRUE)
+ */
+ return 0xfffe;
+}
+
+/*
+ * continue partial match with new input
+ * never called for simple, single-character conversion
+ */
+U_CFUNC void
+ucnv_extContinueMatchToU(UConverter *cnv,
+ UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
+ UErrorCode *pErrorCode) {
+ uint32_t value;
+ int32_t match, length;
+
+ match=ucnv_extMatchToU(cnv->sharedData->mbcs.extIndexes, (int8_t)UCNV_SISO_STATE(cnv),
+ cnv->preToU, cnv->preToULength,
+ pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
+ &value,
+ cnv->useFallback, pArgs->flush);
+ if(match>0) {
+ if(match>=cnv->preToULength) {
+ /* advance src pointer for the consumed input */
+ pArgs->source+=match-cnv->preToULength;
+ cnv->preToULength=0;
+ } else {
+ /* the match did not use all of preToU[] - keep the rest for replay */
+ length=cnv->preToULength-match;
+ uprv_memmove(cnv->preToU, cnv->preToU+match, length);
+ cnv->preToULength=(int8_t)-length;
+ }
+
+ /* write result */
+ ucnv_extWriteToU(cnv, cnv->sharedData->mbcs.extIndexes,
+ value,
+ &pArgs->target, pArgs->targetLimit,
+ &pArgs->offsets, srcIndex,
+ pErrorCode);
+ } else if(match<0) {
+ /* save state for partial match */
+ const char *s;
+ int32_t j;
+
+ /* just _append_ the newly consumed input to preToU[] */
+ s=pArgs->source;
+ match=-match;
+ for(j=cnv->preToULength; jpreToU[j]=*s++;
+ }
+ pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
+ cnv->preToULength=(int8_t)match;
+ } else /* match==0 */ {
+ /*
+ * no match
+ *
+ * We need to split the previous input into two parts:
+ *
+ * 1. The first codepage character is unmappable - that's how we got into
+ * trying the extension data in the first place.
+ * We need to move it from the preToU buffer
+ * to the error buffer, set an error code,
+ * and prepare the rest of the previous input for 2.
+ *
+ * 2. The rest of the previous input must be converted once we
+ * come back from the callback for the first character.
+ * At that time, we have to try again from scratch to convert
+ * these input characters.
+ * The replay will be handled by the ucnv.c conversion code.
+ */
+
+ /* move the first codepage character to the error field */
+ uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength);
+ cnv->toULength=cnv->preToUFirstLength;
+
+ /* move the rest up inside the buffer */
+ length=cnv->preToULength-cnv->preToUFirstLength;
+ if(length>0) {
+ uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length);
+ }
+
+ /* mark preToU for replay */
+ cnv->preToULength=(int8_t)-length;
+
+ /* set the error code for unassigned */
+ *pErrorCode=U_INVALID_CHAR_FOUND;
+ }
+}
+
+/* from Unicode ------------------------------------------------------------- */
+
+/*
+ * @return index of the UChar, if found; else <0
+ */
+static U_INLINE int32_t
+ucnv_extFindFromU(const UChar *fromUSection, int32_t length, UChar u) {
+ int32_t i, start, limit;
+
+ /* binary search */
+ start=0;
+ limit=length;
+ for(;;) {
+ i=limit-start;
+ if(i<=1) {
+ break; /* done */
+ }
+ /* start=0
+ * @param src UChars that can be used to complete a match
+ * @param srcLength length of src, >=0
+ * @param pMatchValue [out] output result value for the match from the data structure
+ * @param useFallback "use fallback" flag, usually from cnv->useFallback
+ * @param flush TRUE if the end of the input stream is reached
+ * @return >1: matched, return value=total match length (number of input units matched)
+ * 1: matched, no mapping but request for <subchar1>
+ * (only for the first code point)
+ * 0: no match
+ * <0: partial match, return value=negative total match length
+ * (partial matches are never returned for flush==TRUE)
+ * (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS)
+ * the matchLength is 2 if only firstCP matched, and >2 if firstCP and
+ * further code units matched
+ */
+static int32_t
+ucnv_extMatchFromU(const int32_t *cx,
+ UChar32 firstCP,
+ const UChar *pre, int32_t preLength,
+ const UChar *src, int32_t srcLength,
+ uint32_t *pMatchValue,
+ UBool useFallback, UBool flush) {
+ const uint16_t *stage12, *stage3;
+ const uint32_t *stage3b;
+
+ const UChar *fromUTableUChars, *fromUSectionUChars;
+ const uint32_t *fromUTableValues, *fromUSectionValues;
+
+ uint32_t value, matchValue;
+ int32_t i, j, index, length, matchLength;
+ UChar c;
+
+ if(cx==NULL) {
+ return 0; /* no extension data, no match */
+ }
+
+ /* trie lookup of firstCP */
+ index=firstCP>>10; /* stage 1 index */
+ if(index>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) {
+ return 0; /* the first code point is outside the trie */
+ }
+
+ stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
+ stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
+ index=UCNV_EXT_FROM_U(stage12, stage3, index, firstCP);
+
+ stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
+ value=stage3b[index];
+ if(value==0) {
+ return 0;
+ }
+
+ if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
+ /* partial match, enter the loop below */
+ index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
+
+ /* initialize */
+ fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar);
+ fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t);
+
+ matchValue=0;
+ i=j=matchLength=0;
+
+ /* we must not remember fallback matches when not using fallbacks */
+
+ /* match input units until there is a full match or the input is consumed */
+ for(;;) {
+ /* go to the next section */
+ fromUSectionUChars=fromUTableUChars+index;
+ fromUSectionValues=fromUTableValues+index;
+
+ /* read first pair of the section */
+ length=*fromUSectionUChars++;
+ value=*fromUSectionValues++;
+ if( value!=0 &&
+ (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
+ FROM_U_USE_FALLBACK(useFallback, firstCP))
+ ) {
+ /* remember longest match so far */
+ matchValue=value;
+ matchLength=2+i+j;
+ }
+
+ /* match pre[] then src[] */
+ if(iUCNV_EXT_MAX_UCHARS) {
+ /*
+ * end of the entire input stream, stop with the longest match so far
+ * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS
+ * because it must fit into state buffers
+ */
+ break;
+ } else {
+ /* continue with more input next time */
+ return -(2+length);
+ }
+ }
+
+ /* search for the current UChar */
+ index=ucnv_extFindFromU(fromUSectionUChars, length, c);
+ if(index<0) {
+ /* no match here, stop with the longest match so far */
+ break;
+ } else {
+ value=fromUSectionValues[index];
+ if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
+ /* partial match, continue */
+ index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
+ } else {
+ if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
+ FROM_U_USE_FALLBACK(useFallback, firstCP)
+ ) {
+ /* full match, stop with result */
+ matchValue=value;
+ matchLength=2+i+j;
+ } else {
+ /* full match on fallback not taken, stop with the longest match so far */
+ }
+ break;
+ }
+ }
+ }
+
+ if(matchLength==0) {
+ /* no match at all */
+ return 0;
+ }
+ } else /* result from firstCP trie lookup */ {
+ if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
+ FROM_U_USE_FALLBACK(useFallback, firstCP)
+ ) {
+ /* full match, stop with result */
+ matchValue=value;
+ matchLength=2;
+ } else {
+ /* fallback not taken */
+ return 0;
+ }
+ }
+
+ if(matchValue&UCNV_EXT_FROM_U_RESERVED_MASK) {
+ /* do not interpret values with reserved bits used, for forward compatibility */
+ return 0;
+ }
+
+ /* return result */
+ if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) {
+ return 1; /* assert matchLength==2 */
+ }
+
+ *pMatchValue=UCNV_EXT_FROM_U_MASK_ROUNDTRIP(matchValue);
+ return matchLength;
+}
+
+static U_INLINE void
+ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx,
+ uint32_t value,
+ char **target, const char *targetLimit,
+ int32_t **offsets, int32_t srcIndex,
+ UErrorCode *pErrorCode) {
+ uint8_t buffer[1+UCNV_EXT_MAX_BYTES];
+ const uint8_t *result;
+ int32_t length, prevLength;
+
+ length=UCNV_EXT_FROM_U_GET_LENGTH(value);
+ value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value);
+
+ /*
+ * output the result
+ *
+ * length and value were split out of the match value above:
+ * results of up to UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH bytes are stored
+ * directly in value (most significant byte first), longer results are read
+ * from the byte array at UCNV_EXT_FROM_U_BYTES_INDEX, starting at offset value.
+ */
+ if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
+ /*
+ * Generate a byte array and then write it below.
+ * This is not the fastest possible way, but it should be ok for
+ * extension mappings, and it is much simpler.
+ * Offset and overflow handling are only done once this way.
+ */
+ uint8_t *p=buffer+1; /* reserve buffer[0] for shiftByte below */
+ switch(length) {
+ case 3:
+ *p++=(uint8_t)(value>>16); /* fall through: the lower bytes follow */
+ case 2:
+ *p++=(uint8_t)(value>>8); /* fall through: the lowest byte follows */
+ case 1:
+ *p++=(uint8_t)value;
+ default:
+ break; /* will never occur */
+ }
+ result=buffer+1;
+ } else {
+ result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
+ }
+
+ /* with correct data we have length>0 */
+
+ if((prevLength=cnv->fromUnicodeStatus)!=0) {
+ /*
+ * handle SI/SO stateful output
+ *
+ * This branch only runs when cnv->fromUnicodeStatus is not 0.
+ * The code below sets it to 1 when it emits UCNV_SI
+ * and to 2 when it emits UCNV_SO.
+ */
+ uint8_t shiftByte;
+
+ if(prevLength>1 && length==1) {
+ /* change from double-byte mode to single-byte */
+ shiftByte=(uint8_t)UCNV_SI;
+ cnv->fromUnicodeStatus=1;
+ } else if(prevLength==1 && length>1) {
+ /* change from single-byte mode to double-byte */
+ shiftByte=(uint8_t)UCNV_SO;
+ cnv->fromUnicodeStatus=2;
+ } else {
+ shiftByte=0;
+ }
+
+ if(shiftByte!=0) {
+ /* prepend the shift byte to the result bytes */
+ buffer[0]=shiftByte;
+ if(result!=buffer+1) { /* the result bytes are not in buffer[] yet */
+ uprv_memcpy(buffer+1, result, length);
+ }
+ result=buffer;
+ ++length;
+ }
+ }
+
+ ucnv_fromUWriteBytes(cnv, (const char *)result, length,
+ target, targetLimit,
+ offsets, srcIndex,
+ pErrorCode);
+}
+
+/*
+ * targetuseFallback, flush);
+
+ /* reject a match if the result is a single byte for DBCS-only */
+ if( match>=2 &&
+ !(UCNV_EXT_FROM_U_GET_LENGTH(value)==1 &&
+ cnv->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY)
+ ) {
+ /* advance src pointer for the consumed input */
+ *src+=match-2; /* remove 2 for the initial code point */
+
+ /* write result to target */
+ ucnv_extWriteFromU(cnv, cx,
+ value,
+ target, targetLimit,
+ offsets, srcIndex,
+ pErrorCode);
+ return TRUE;
+ } else if(match<0) {
+ /* save state for partial match */
+ const UChar *s;
+ int32_t j;
+
+ /* copy the first code point */
+ cnv->preFromUFirstCP=cp;
+
+ /* now copy the newly consumed input */
+ s=*src;
+ match=-match-2; /* remove 2 for the initial code point */
+ for(j=0; jpreFromU[j]=*s++;
+ }
+ *src=s; /* same as *src=srcLimit; because we reached the end of input */
+ cnv->preFromULength=(int8_t)match;
+ return TRUE;
+ } else if(match==1) {
+ /* matched, no mapping but request for <subchar1> */
+ cnv->useSubChar1=TRUE;
+ return FALSE;
+ } else /* match==0 no match */ {
+ return FALSE;
+ }
+}
+
+U_CFUNC int32_t
+ucnv_extSimpleMatchFromU(const int32_t *cx,
+ UChar32 cp, uint32_t *pValue,
+ UBool useFallback) {
+ uint32_t value;
+ int32_t match;
+
+ /*
+ * try to match cp alone:
+ * both additional input arrays are NULL with length 0, and flush is TRUE
+ */
+ match=ucnv_extMatchFromU(cx,
+ cp,
+ NULL, 0,
+ NULL, 0,
+ &value,
+ useFallback, TRUE);
+ if(match>=2) {
+ /*
+ * write result for simple, single-character conversion:
+ * store the result bytes in *pValue and return their number
+ */
+ int32_t length;
+
+ length=UCNV_EXT_FROM_U_GET_LENGTH(value);
+ value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value);
+
+ if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
+ *pValue=value;
+ return length;
+#if 0 /* not currently used */
+ } else if(length==4) {
+ /* de-serialize a 4-byte result */
+ const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
+ *pValue=
+ ((uint32_t)result[0]<<24)|
+ ((uint32_t)result[1]<<16)|
+ ((uint32_t)result[2]<<8)|
+ result[3];
+ return 4;
+#endif
+ }
+ }
+
+ /*
+ * return no match because
+ * - match>1 && resultLength>4: result too long for simple conversion
+ * - match==1: no match found, <subchar1> preferred
+ * - match==0: no match found in the first place
+ * - match<0: partial match, not supported for simple conversion (and flush==TRUE)
+ *
+ * While the 4-byte branch above is disabled with #if 0, every result longer
+ * than UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH also ends up here.
+ * *pValue is not written on this path.
+ */
+ return 0;
+}
+
+/*
+ * continue partial match with new input, requires cnv->preFromUFirstCP>=0
+ * never called for simple, single-character conversion
+ */
+U_CFUNC void
+ucnv_extContinueMatchFromU(UConverter *cnv,
+ UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
+ UErrorCode *pErrorCode) {
+ uint32_t value;
+ int32_t match;
+
+ match=ucnv_extMatchFromU(cnv->sharedData->mbcs.extIndexes,
+ cnv->preFromUFirstCP,
+ cnv->preFromU, cnv->preFromULength,
+ pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
+ &value,
+ cnv->useFallback, pArgs->flush);
+ if(match>=2) {
+ match-=2; /* remove 2 for the initial code point */
+
+ if(match>=cnv->preFromULength) {
+ /* advance src pointer for the consumed input */
+ pArgs->source+=match-cnv->preFromULength;
+ cnv->preFromULength=0;
+ } else {
+ /* the match did not use all of preFromU[] - keep the rest for replay */
+ int32_t length=cnv->preFromULength-match;
+ uprv_memmove(cnv->preFromU, cnv->preFromU+match, length*U_SIZEOF_UCHAR);
+ cnv->preFromULength=(int8_t)-length;
+ }
+
+ /* finish the partial match */
+ cnv->preFromUFirstCP=U_SENTINEL;
+
+ /* write result */
+ ucnv_extWriteFromU(cnv, cnv->sharedData->mbcs.extIndexes,
+ value,
+ &pArgs->target, pArgs->targetLimit,
+ &pArgs->offsets, srcIndex,
+ pErrorCode);
+ } else if(match<0) {
+ /* save state for partial match */
+ const UChar *s;
+ int32_t j;
+
+ /* just _append_ the newly consumed input to preFromU[] */
+ s=pArgs->source;
+ match=-match-2; /* remove 2 for the initial code point */
+ for(j=cnv->preFromULength; jpreFromU[j]=*s++;
+ }
+ pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
+ cnv->preFromULength=(int8_t)match;
+ } else /* match==0 or 1 */ {
+ /*
+ * no match
+ *
+ * We need to split the previous input into two parts:
+ *
+ * 1. The first code point is unmappable - that's how we got into
+ * trying the extension data in the first place.
+ * We need to move it from the preFromU buffer
+ * to the error buffer, set an error code,
+ * and prepare the rest of the previous input for 2.
+ *
+ * 2. The rest of the previous input must be converted once we
+ * come back from the callback for the first code point.
+ * At that time, we have to try again from scratch to convert
+ * these input characters.
+ * The replay will be handled by the ucnv.c conversion code.
+ */
+
+ if(match==1) {
+ /* matched, no mapping but request for <subchar1> */
+ cnv->useSubChar1=TRUE;
+ }
+
+ /* move the first code point to the error field */
+ cnv->fromUChar32=cnv->preFromUFirstCP;
+ cnv->preFromUFirstCP=U_SENTINEL;
+
+ /* mark preFromU for replay */
+ cnv->preFromULength=-cnv->preFromULength;
+
+ /* set the error code for unassigned */
+ *pErrorCode=U_INVALID_CHAR_FOUND;
+ }
+}
+
+static void
+ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
+ const int32_t *cx,
+ USetAdder *sa,
+ UConverterUnicodeSet which,
+ int32_t minLength,
+ UChar32 c,
+ UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
+ int32_t sectionIndex,
+ UErrorCode *pErrorCode) {
+ const UChar *fromUSectionUChars;
+ const uint32_t *fromUSectionValues;
+
+ uint32_t value;
+ int32_t i, count;
+
+ fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar)+sectionIndex;
+ fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex;
+
+ /* read first pair of the section */
+ count=*fromUSectionUChars++;
+ value=*fromUSectionValues++;
+
+ if( value!=0 &&
+ UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) &&
+ UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
+ ) {
+ if(c>=0) {
+ /* add the initial code point */
+ sa->add(sa->set, c);
+ } else {
+ /* add the string so far */
+ sa->addString(sa->set, s, length);
+ }
+ }
+
+ for(i=0; i=minLength
+ ) {
+ sa->addString(sa->set, s, length+1);
+ }
+ }
+}
+
+U_CFUNC void
+ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
+ USetAdder *sa,
+ UConverterUnicodeSet which,
+ UErrorCode *pErrorCode) {
+ const int32_t *cx;
+ const uint16_t *stage12, *stage3, *ps2, *ps3;
+ const uint32_t *stage3b;
+
+ uint32_t value;
+ int32_t st1, stage1Length, st2, st3, minLength;
+
+ UChar s[UCNV_EXT_MAX_UCHARS];
+ UChar32 c;
+ int32_t length;
+
+ cx=sharedData->mbcs.extIndexes;
+ if(cx==NULL) {
+ return;
+ }
+
+ stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
+ stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
+ stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
+
+ stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
+
+ /* enumerate the from-Unicode trie table */
+ c=0; /* keep track of the current code point while enumerating */
+
+ if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) {
+ /* DBCS-only, ignore single-byte results */
+ minLength=2;
+ } else {
+ minLength=1;
+ }
+
+ /*
+ * the trie enumeration is almost the same as
+ * in MBCSGetUnicodeSet() for MBCS_OUTPUT_1
+ */
+ for(st1=0; st1stage1Length) {
+ ps2=stage12+st2;
+ for(st2=0; st2<64; ++st2) {
+ if((st3=(int32_t)ps2[st2]< entries or other (future?) pseudo-entries
+ * with an output length of 0, or entries with reserved bits set.
+ * Recurse for partial results.
+ */
+ do {
+ value=stage3b[*ps3++];
+ if(value==0) {
+ /* no mapping, do nothing */
+ } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
+ length=0;
+ U16_APPEND_UNSAFE(s, length, c);
+ ucnv_extGetUnicodeSetString(
+ sharedData, cx, sa, which, minLength,
+ c, s, length,
+ (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
+ pErrorCode);
+ } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
+ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
+ UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
+ ) {
+ sa->add(sa->set, c);
+ }
+ } while((++c&0xf)!=0);
+ } else {
+ c+=16; /* empty stage 3 block */
+ }
+ }
+ } else {
+ c+=1024; /* empty stage 2 block */
+ }
+ }
+}
+
+#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
diff --git a/icuSources/common/ucnv_ext.h b/icuSources/common/ucnv_ext.h
new file mode 100644
index 00000000..6fb43d96
--- /dev/null
+++ b/icuSources/common/ucnv_ext.h
@@ -0,0 +1,463 @@
+/*
+******************************************************************************
+*
+* Copyright (C) 2003-2004, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+******************************************************************************
+* file name: ucnv_ext.h
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2003jun13
+* created by: Markus W. Scherer
+*
+* Conversion extensions
+*/
+
+#ifndef __UCNV_EXT_H__
+#define __UCNV_EXT_H__
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
+#include "unicode/ucnv.h"
+#include "ucnv_cnv.h"
+
+/*
+ * See icuhtml/design/conversion/conversion_extensions.html
+ *
+ * Conversion extensions serve two purposes:
+ * 1. They support m:n mappings.
+ * 2. They support extension-only conversion files that are used together
+ * with the regular conversion data in base files.
+ *
+ * A base file may contain an extension table (explicitly requested or
+ * implicitly generated for m:n mappings), but its extension table is not
+ * used when an extension-only file is used.
+ *
+ * It is an error if a base file contains any regular (not extension) mapping
+ * from the same sequence as a mapping in the extension file
+ * because the base mapping would hide the extension mapping.
+ *
+ *
+ * Data for conversion extensions:
+ *
+ * One set of data structures per conversion direction (to/from Unicode).
+ * The data structures are sorted by input units to allow for binary search.
+ * Input sequences of more than one unit are handled like contraction tables
+ * in collation:
+ * The lookup value of a unit points to another table that is to be searched
+ * for the next unit, recursively.
+ *
+ * For conversion from Unicode, the initial code point is looked up in
+ * a 3-stage trie for speed,
+ * with an additional table of unique results to save space.
+ *
+ * Long output strings are stored in separate arrays, with length and index
+ * in the lookup tables.
+ * Output results also include a flag distinguishing roundtrip from
+ * (reverse) fallback mappings.
+ *
+ * Input Unicode strings must not begin or end with unpaired surrogates
+ * to avoid problems with matches on parts of surrogate pairs.
+ *
+ * Mappings from multiple characters (code points or codepage state
+ * table sequences) must be searched preferring the longest match.
+ * For this to work and be efficient, the variable-width table must contain
+ * all mappings that contain prefixes of the multiple characters.
+ * If an extension table is built on top of a base table in another file
+ * and a base table entry is a prefix of a multi-character mapping, then
+ * this is an error.
+ *
+ *
+ * Implementation note:
+ *
+ * Currently, the parser and several checks in the code limit the number
+ * of UChars or bytes in a mapping to
+ * UCNV_EXT_MAX_UCHARS and UCNV_EXT_MAX_BYTES, respectively,
+ * which are output value limits in the data structure.
+ *
+ * For input, this is not strictly necessary - it is a hard limit only for the
+ * buffers in UConverter that are used to store partial matches.
+ *
+ * Input sequences could otherwise be arbitrarily long if partial matches
+ * need not be stored (i.e., if a sequence does not span several buffers with too
+ * many units before the last buffer), although then results would differ
+ * depending on whether partial matches exceed the limits or not,
+ * which depends on the pattern of buffer sizes.
+ *
+ *
+ * Data structure:
+ *
+ * int32_t indexes[>=32];
+ *
+ * Array of indexes and lengths etc. The length of the array is at least 32.
+ * The actual length is stored in indexes[0] to be forward compatible.
+ *
+ * Each index to another array is the number of bytes from indexes[].
+ * Each length of an array is the number of array base units in that array.
+ *
+ * Some of the structures may not be present, in which case their indexes
+ * and lengths are 0.
+ *
+ * Usage of indexes[i]:
+ * [0] length of indexes[]
+ *
+ * // to Unicode table
+ * [1] index of toUTable[] (array of uint32_t)
+ * [2] length of toUTable[]
+ * [3] index of toUUChars[] (array of UChar)
+ * [4] length of toUUChars[]
+ *
+ * // from Unicode table, not for the initial code point
+ * [5] index of fromUTableUChars[] (array of UChar)
+ * [6] index of fromUTableValues[] (array of uint32_t)
+ * [7] length of fromUTableUChars[] and fromUTableValues[]
+ * [8] index of fromUBytes[] (array of char)
+ * [9] length of fromUBytes[]
+ *
+ * // from Unicode trie for initial-code point lookup
+ * [10] index of fromUStage12[] (combined array of uint16_t for stages 1 & 2)
+ * [11] length of stage 1 portion of fromUStage12[]
+ * [12] length of fromUStage12[]
+ * [13] index of fromUStage3[] (array of uint16_t indexes into fromUStage3b[])
+ * [14] length of fromUStage3[]
+ * [15] index of fromUStage3b[] (array of uint32_t like fromUTableValues[])
+ * [16] length of fromUStage3b[]
+ *
+ * [17] Bit field containing numbers of bytes:
+ * 31..24 reserved, 0
+ * 23..16 maximum input bytes
+ * 15.. 8 maximum output bytes
+ * 7.. 0 maximum bytes per UChar
+ *
+ * [18] Bit field containing numbers of UChars:
+ * 31..24 reserved, 0
+ * 23..16 maximum input UChars
+ * 15.. 8 maximum output UChars
+ * 7.. 0 maximum UChars per byte
+ *
+ * [19] Bit field containing flags:
+ * (extension table unicodeMask)
+ * 1 UCNV_HAS_SURROGATES flag for the extension table
+ * 0 UCNV_HAS_SUPPLEMENTARY flag for the extension table
+ *
+ * [20]..[30] reserved, 0
+ * [31] number of bytes for the entire extension structure
+ * [>31] reserved; there are indexes[0] indexes
+ *
+ *
+ * uint32_t toUTable[];
+ *
+ * Array of byte/value pairs for lookups for toUnicode conversion.
+ * The array is partitioned into sections like collation contraction tables.
+ * Each section contains one word with the number of following words and
+ * a default value for when the lookup in this section yields no match.
+ *
+ * A section is sorted in ascending order of input bytes,
+ * allowing for fast linear or binary searches.
+ * The builder may store entries for a contiguous range of byte values
+ * (compare difference between the first and last one with count),
+ * which then allows for direct array access.
+ * The builder should always do this for the initial table section.
+ *
+ * Entries may have 0 values, see below.
+ * No two entries in a section have the same byte values.
+ *
+ * Each uint32_t contains an input byte value in bits 31..24 and the
+ * corresponding lookup value in bits 23..0.
+ * Interpret the value as follows:
+ * if(value==0) {
+ * no match, see below
+ * } else if(value<0x1f0000) {
+ * partial match - use value as index to the next toUTable section
+ * and match the next unit; (value indexes toUTable[value])
+ * } else {
+ * if(bit 23 set) {
+ * roundtrip;
+ * } else {
+ * fallback;
+ * }
+ * unset value bit 23;
+ * if(value<=0x2fffff) {
+ * (value-0x1f0000) is a code point; (BMP: value<=0x1fffff)
+ * } else {
+ * bits 17..0 (value&0x3ffff) is an index to
+ * the result UChars in toUUChars[]; (0 indexes toUUChars[0])
+ * length of the result=((value>>18)-12); (length=0..19)
+ * }
+ * }
+ *
+ * The first word in a section contains the number of following words in the
+ * input byte position (bits 31..24, number=1..0xff).
+ * The value of the initial word is used when the current byte is not found
+ * in this section.
+ * If the value is not 0, then it represents a result as above.
+ * If the value is 0, then the search has to return a shorter match with an
+ * earlier default value as the result, or result in "unmappable" even for the
+ * initial bytes.
+ * If the value is 0 for the initial toUTable entry, then the initial byte
+ * does not start any mapping input.
+ *
+ *
+ * UChar toUUChars[];
+ *
+ * Contains toUnicode mapping results, stored as sequences of UChars.
+ * Indexes and lengths stored in the toUTable[].
+ *
+ *
+ * UChar fromUTableUChars[];
+ * uint32_t fromUTableValues[];
+ *
+ * The fromUTable is split into two arrays, but works otherwise much like
+ * the toUTable. The array is partitioned into sections like collation
+ * contraction tables and toUTable.
+ * A row in the table consists of same-index entries in fromUTableUChars[]
+ * and fromUTableValues[].
+ *
+ * Interpret a value as follows:
+ * if(value==0) {
+ * no match, see below
+ * } else if(value<=0xffffff) { (bits 31..24 are 0)
+ * partial match - use value as index to the next fromUTable section
+ * and match the next unit; (value indexes fromUTable[value])
+ * } else {
+ * if(value==0x80000001) {
+ * return no mapping, but request for <subchar1>;
+ * }
+ * if(bit 31 set) {
+ * roundtrip;
+ * } else {
+ * fallback;
+ * }
+ * // bits 30..29 reserved, 0
+ * length=(value>>24)&0x1f; (bits 28..24)
+ * if(length==1..3) {
+ * bits 23..0 contain 1..3 bytes, padded with 00s on the left;
+ * } else {
+ * bits 23..0 (value&0xffffff) is an index to
+ * the result bytes in fromUBytes[]; (0 indexes fromUBytes[0])
+ * }
+ * }
+ *
+ * The first pair in a section contains the number of following pairs in the
+ * UChar position (16 bits, number=1..0xffff).
+ * The value of the initial pair is used when the current UChar is not found
+ * in this section.
+ * If the value is not 0, then it represents a result as above.
+ * If the value is 0, then the search has to return a shorter match with an
+ * earlier default value as the result, or result in "unmappable" even for the
+ * initial UChars.
+ *
+ * If the from Unicode trie is present, then the from Unicode search tables
+ * are not used for initial code points.
+ * In this case, the first entries (index 0) in the tables are not used
+ * (reserved, set to 0) because a value of 0 is used in trie results
+ * to indicate no mapping.
+ *
+ *
+ * uint16_t fromUStage12[];
+ *
+ * Stages 1 & 2 of a trie that maps an initial code point.
+ * Indexes in stage 1 are all offset by the length of stage 1 so that the
+ * same array pointer can be used for both stages.
+ * If (c>>10)>=(length of stage 1) then c does not start any mapping.
+ * Same bit distribution as for regular conversion tries.
+ *
+ *
+ * uint16_t fromUStage3[];
+ * uint32_t fromUStage3b[];
+ *
+ * Stage 3 of the trie. The first array simply contains indexes to the second,
+ * which contains words in the same format as fromUTableValues[].
+ * Use a stage 3 granularity of 4, which allows for 256k stage 3 entries,
+ * and 16-bit entries in stage 3 allow for 64k stage 3b entries.
+ * The stage 3 granularity means that the stage 2 entry needs to be left-shifted.
+ *
+ * Two arrays are used because it is expected that more than half of the stage 3
+ * entries will be zero. The 16-bit index stage 3 array saves space even
+ * considering storing a total of 6 bytes per non-zero entry in both arrays
+ * together.
+ * Using a stage 3 granularity of >1 diminishes the compactability in that stage
+ * but provides a larger effective addressing space in stage 2.
+ * All but the final result stage use 16-bit entries to save space.
+ *
+ * fromUStage3b[] contains a zero for "no mapping" at its index 0,
+ * and may contain UCNV_EXT_FROM_U_SUBCHAR1 at index 1 for "<subchar1> SUB mapping"
+ * (i.e., "no mapping" with preference for <subchar1> rather than <subchar>),
+ * and all other items are unique non-zero results.
+ *
+ * The default value of a fromUTableValues[] section that is referenced
+ * _directly_ from a fromUStage3b[] item may also be UCNV_EXT_FROM_U_SUBCHAR1,
+ * but this value must not occur anywhere else in fromUTableValues[]
+ * because "no mapping" is always a property of a single code point,
+ * never of multiple.
+ *
+ *
+ * char fromUBytes[];
+ *
+ * Contains fromUnicode mapping results, stored as sequences of chars.
+ * Indexes and lengths stored in the fromUTableValues[].
+ */
+enum {
+ UCNV_EXT_INDEXES_LENGTH, /* 0 */
+
+ UCNV_EXT_TO_U_INDEX, /* 1 */
+ UCNV_EXT_TO_U_LENGTH,
+ UCNV_EXT_TO_U_UCHARS_INDEX,
+ UCNV_EXT_TO_U_UCHARS_LENGTH,
+
+ UCNV_EXT_FROM_U_UCHARS_INDEX, /* 5 */
+ UCNV_EXT_FROM_U_VALUES_INDEX,
+ UCNV_EXT_FROM_U_LENGTH,
+ UCNV_EXT_FROM_U_BYTES_INDEX,
+ UCNV_EXT_FROM_U_BYTES_LENGTH,
+
+ UCNV_EXT_FROM_U_STAGE_12_INDEX, /* 10 */
+ UCNV_EXT_FROM_U_STAGE_1_LENGTH,
+ UCNV_EXT_FROM_U_STAGE_12_LENGTH,
+ UCNV_EXT_FROM_U_STAGE_3_INDEX,
+ UCNV_EXT_FROM_U_STAGE_3_LENGTH,
+ UCNV_EXT_FROM_U_STAGE_3B_INDEX,
+ UCNV_EXT_FROM_U_STAGE_3B_LENGTH,
+
+ UCNV_EXT_COUNT_BYTES, /* 17 */
+ UCNV_EXT_COUNT_UCHARS,
+ UCNV_EXT_FLAGS,
+
+ UCNV_EXT_RESERVED_INDEX, /* 20, moves with additional indexes */
+
+ UCNV_EXT_SIZE=31,
+ UCNV_EXT_INDEXES_MIN_LENGTH=32
+};
+
+/* get the pointer to an extension array from indexes[index] */
+#define UCNV_EXT_ARRAY(indexes, index, itemType) \
+ ((const itemType *)((const char *)(indexes)+(indexes)[index]))
+
+#define UCNV_GET_MAX_BYTES_PER_UCHAR(indexes) \
+ ((indexes)[UCNV_EXT_COUNT_BYTES]&0xff)
+
+/* internal API ------------------------------------------------------------- */
+
+U_CFUNC UBool
+ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
+ int32_t firstLength,
+ const char **src, const char *srcLimit,
+ UChar **target, const UChar *targetLimit,
+ int32_t **offsets, int32_t srcIndex,
+ UBool flush,
+ UErrorCode *pErrorCode);
+
+U_CFUNC UChar32
+ucnv_extSimpleMatchToU(const int32_t *cx,
+ const char *source, int32_t length,
+ UBool useFallback);
+
+U_CFUNC void
+ucnv_extContinueMatchToU(UConverter *cnv,
+ UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
+ UErrorCode *pErrorCode);
+
+
+U_CFUNC UBool
+ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
+ UChar32 cp,
+ const UChar **src, const UChar *srcLimit,
+ char **target, const char *targetLimit,
+ int32_t **offsets, int32_t srcIndex,
+ UBool flush,
+ UErrorCode *pErrorCode);
+
+U_CFUNC int32_t
+ucnv_extSimpleMatchFromU(const int32_t *cx,
+ UChar32 cp, uint32_t *pValue,
+ UBool useFallback);
+
+U_CFUNC void
+ucnv_extContinueMatchFromU(UConverter *cnv,
+ UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
+ UErrorCode *pErrorCode);
+
+U_CFUNC void
+ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
+ USetAdder *sa,
+ UConverterUnicodeSet which,
+ UErrorCode *pErrorCode);
+
+/* toUnicode helpers -------------------------------------------------------- */
+
+#define UCNV_EXT_TO_U_BYTE_SHIFT 24
+#define UCNV_EXT_TO_U_VALUE_MASK 0xffffff
+#define UCNV_EXT_TO_U_MIN_CODE_POINT 0x1f0000
+#define UCNV_EXT_TO_U_MAX_CODE_POINT 0x2fffff
+#define UCNV_EXT_TO_U_ROUNDTRIP_FLAG ((uint32_t)1<<23)
+#define UCNV_EXT_TO_U_INDEX_MASK 0x3ffff
+#define UCNV_EXT_TO_U_LENGTH_SHIFT 18
+#define UCNV_EXT_TO_U_LENGTH_OFFSET 12
+
+/* maximum number of indexed UChars */
+#define UCNV_EXT_MAX_UCHARS 19
+
+#define UCNV_EXT_TO_U_MAKE_WORD(byte, value) (((uint32_t)(byte)<<UCNV_EXT_TO_U_BYTE_SHIFT)|(value))
+
+#define UCNV_EXT_TO_U_GET_BYTE(word) ((word)>>UCNV_EXT_TO_U_BYTE_SHIFT)
+#define UCNV_EXT_TO_U_GET_VALUE(word) ((word)&UCNV_EXT_TO_U_VALUE_MASK)
+
+#define UCNV_EXT_TO_U_IS_PARTIAL(value) ((value)<UCNV_EXT_TO_U_MIN_CODE_POINT)
+#define UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value) (value)
+
+#define UCNV_EXT_TO_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_TO_U_ROUNDTRIP_FLAG)!=0)
+#define UCNV_EXT_TO_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_TO_U_ROUNDTRIP_FLAG)
+
+/* use after masking off the roundtrip flag */
+#define UCNV_EXT_TO_U_IS_CODE_POINT(value) ((value)<=UCNV_EXT_TO_U_MAX_CODE_POINT)
+#define UCNV_EXT_TO_U_GET_CODE_POINT(value) ((value)-UCNV_EXT_TO_U_MIN_CODE_POINT)
+
+#define UCNV_EXT_TO_U_GET_INDEX(value) ((value)&UCNV_EXT_TO_U_INDEX_MASK)
+#define UCNV_EXT_TO_U_GET_LENGTH(value) (((value)>>UCNV_EXT_TO_U_LENGTH_SHIFT)-UCNV_EXT_TO_U_LENGTH_OFFSET)
+
+/* fromUnicode helpers ------------------------------------------------------ */
+
+/* most trie constants are shared with ucnvmbcs.h */
+
+/* see similar utrie.h UTRIE_INDEX_SHIFT and UTRIE_DATA_GRANULARITY */
+#define UCNV_EXT_STAGE_2_LEFT_SHIFT 2
+#define UCNV_EXT_STAGE_3_GRANULARITY 4
+
+/* trie access, returns the stage 3 value=index to stage 3b; s1Index=c>>10 */
+#define UCNV_EXT_FROM_U(stage12, stage3, s1Index, c) \
+ (stage3)[ ((int32_t)(stage12)[ (stage12)[s1Index] +(((c)>>4)&0x3f) ]<<UCNV_EXT_STAGE_2_LEFT_SHIFT) +((c)&0xf) ]
+
+#define UCNV_EXT_FROM_U_LENGTH_SHIFT 24
+#define UCNV_EXT_FROM_U_ROUNDTRIP_FLAG ((uint32_t)1<<31)
+#define UCNV_EXT_FROM_U_RESERVED_MASK 0x60000000
+#define UCNV_EXT_FROM_U_DATA_MASK 0xffffff
+
+/* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */
+#define UCNV_EXT_FROM_U_SUBCHAR1 0x80000001
+
+/* at most 3 bytes in the lower part of the value */
+#define UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH 3
+
+/* maximum number of indexed bytes */
+#define UCNV_EXT_MAX_BYTES 0x1f
+
+#define UCNV_EXT_FROM_U_IS_PARTIAL(value) (((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)==0)
+#define UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value) (value)
+
+#define UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)!=0)
+#define UCNV_EXT_FROM_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)
+
+/* use after masking off the roundtrip flag */
+#define UCNV_EXT_FROM_U_GET_LENGTH(value) (int32_t)(((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)&UCNV_EXT_MAX_BYTES)
+
+/* get bytes or bytes index */
+#define UCNV_EXT_FROM_U_GET_DATA(value) ((value)&UCNV_EXT_FROM_U_DATA_MASK)
+
+#endif
+
+#endif
diff --git a/icuSources/common/ucnv_imp.h b/icuSources/common/ucnv_imp.h
index d697a398..0f06c1e2 100644
--- a/icuSources/common/ucnv_imp.h
+++ b/icuSources/common/ucnv_imp.h
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 1999-2003, International Business Machines
+* Copyright (C) 1999-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@@ -21,6 +21,10 @@
#define UCNV_IMP_H
#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
+#include "unicode/uloc.h"
#include "ucnv_bld.h"
/* figures out if we need to go to file to read in the data tables.
@@ -55,6 +59,21 @@ ucnv_createConverterFromSharedData(UConverter *myUConverter, UConverterSharedDat
UConverter* ucnv_createConverterFromPackage(const char *packageName, const char *converterName,
UErrorCode *err);
+typedef struct {
+ char cnvName[UCNV_MAX_CONVERTER_NAME_LENGTH], locale[ULOC_FULLNAME_CAPACITY];
+ const char *realName;
+ uint32_t options;
+} UConverterLookupData;
+
+/**
+ * Load a converter but do not create a UConverter object.
+ * Simply return the UConverterSharedData.
+ * Performs alias lookup etc.
+ * @internal
+ */
+UConverterSharedData *
+ucnv_loadSharedData(const char *converterName, UConverterLookupData *lookup, UErrorCode * err);
+
/**
* This may unload the shared data in a thread safe manner.
* This will only unload the data if no other converters are sharing it.
@@ -68,13 +87,6 @@ ucnv_unloadSharedDataIfReady(UConverterSharedData *sharedData);
void
ucnv_incrementRefCount(UConverterSharedData *sharedData);
-
-/* returns true if "name" is in algorithmicConverterNames
- * @param name The converter name.
- * @return TRUE if "name" is in algorithmicConverterNames.
- */
-UBool ucnv_isDataBasedConverter (const char *name);
-
/* Copy the string that is represented by the UConverterPlatform enum
* @param platformString An output buffer
* @param platform An enum representing a platform
@@ -82,5 +94,6 @@ UBool ucnv_isDataBasedConverter (const char *name);
*/
int32_t ucnv_copyPlatformString(char *platformString, UConverterPlatform platform);
+#endif
#endif /* _UCNV_IMP */
diff --git a/icuSources/common/ucnv_io.c b/icuSources/common/ucnv_io.c
index a313ef5e..a86af8f5 100644
--- a/icuSources/common/ucnv_io.c
+++ b/icuSources/common/ucnv_io.c
@@ -1,7 +1,7 @@
/*
******************************************************************************
*
-* Copyright (C) 1999-2003, International Business Machines
+* Copyright (C) 1999-2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@@ -28,11 +28,16 @@
*/
#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
#include "unicode/putil.h"
#include "unicode/ucnv.h" /* This file implements ucnv_xXXX() APIs */
#include "unicode/udata.h"
#include "umutex.h"
+#include "uarrsort.h"
+#include "udataswp.h"
#include "cstring.h"
#include "cmemory.h"
#include "ucnv_io.h"
@@ -51,14 +56,18 @@
* First there is the size of the Table of Contents (TOC). The TOC
* entries contain the size of each section. In order to find the offset
* you just need to sum up the previous offsets.
+ * The TOC length and entries are an array of uint32_t values.
+ * The first section after the TOC starts immediately after the TOC.
*
* 1) This section contains a list of converters. This list contains indexes
* into the string table for the converter name. The index of this list is
* also used by other sections, which are mentioned later on.
+ * This list is not sorted.
*
* 2) This section contains a list of tags. This list contains indexes
* into the string table for the tag name. The index of this list is
* also used by other sections, which are mentioned later on.
+ * This list is in priority order of standards.
*
* 3) This section contains a list of sorted unique aliases. This
* list contains indexes into the string table for the alias name. The
@@ -74,7 +83,7 @@
* an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
* and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
* the predigested form of the 5th section so that an alias lookup can be fast.
- *
+ *
* 5) This section contains a 2D array with indexes to the 6th section. This
* section is the full form of all alias mappings. The column index is the
* index into the converter list (column header). The row index is the index
@@ -130,18 +139,18 @@
* -------------------------------------------.
* T /##########################################/|
* a / # # /#
- * g / # ## ## ### # ### ### ### #/
- * s / # ##### #### ## ## #/#
- * / ### # # ## # # # ### # # #/##
+ * g / # ## ## ### # ### ### ### #/
+ * s / # ##### #### ## ## #/#
+ * / ### # # ## # # # ### # # #/##
* ------------------------------------------/# #
* A |### # # ## # # # ### # # #|# #
* l |# # # # # ## # #|# #
* i |# # # # # # #|#
* a |# #|#
* s | #|#
- * e
- * s
- *
+ * e
+ * s
+ *
*/
/**
@@ -157,6 +166,20 @@ static const char DATA_TYPE[] = "icu";
static UDataMemory *gAliasData=NULL;
+enum {
+ tocLengthIndex=0,
+ converterListIndex=1,
+ tagListIndex=2,
+ aliasListIndex=3,
+ untaggedConvArrayIndex=4,
+ taggedAliasArrayIndex=5,
+ taggedAliasListsIndex=6,
+ reservedIndex1=7,
+ stringTableIndex=8,
+ minTocLength=8, /* min. tocLength in the file, does not count the tocLengthIndex! */
+ offsetsCount /* length of the swapper's temporary offsets[] */
+};
+
static const uint16_t *gConverterList = NULL;
static const uint16_t *gTagList = NULL;
static const uint16_t *gAliasList = NULL;
@@ -196,6 +219,37 @@ isAcceptable(void *context,
pInfo->formatVersion[0]==3);
}
+static UBool U_CALLCONV ucnv_io_cleanup(void)
+{
+ if (gAliasData) {
+ udata_close(gAliasData);
+ gAliasData = NULL;
+ }
+
+ ucnv_io_flushAvailableConverterCache();
+
+ gConverterListSize = 0;
+ gTagListSize = 0;
+ gAliasListSize = 0;
+ gUntaggedConvArraySize = 0;
+ gTaggedAliasArraySize = 0;
+ gTaggedAliasListsSize = 0;
+ gStringTableSize = 0;
+
+ gConverterList = NULL;
+ gTagList = NULL;
+ gAliasList = NULL;
+ gUntaggedConvArray = NULL;
+ gTaggedAliasArray = NULL;
+ gTaggedAliasLists = NULL;
+ gStringTable = NULL;
+
+ gDefaultConverterName = NULL;
+ gDefaultConverterNameBuffer[0] = 0;
+
+ return TRUE; /* Everything was cleaned up */
+}
+
static UBool
haveAliasData(UErrorCode *pErrorCode) {
int haveData;
@@ -224,7 +278,7 @@ haveAliasData(UErrorCode *pErrorCode) {
table = (const uint16_t *)udata_getMemory(data);
tableStart = ((const uint32_t *)(table))[0];
- if (tableStart < 8) {
+ if (tableStart < minTocLength) {
*pErrorCode = U_INVALID_FORMAT_ERROR;
udata_close(data);
return FALSE;
@@ -269,6 +323,7 @@ haveAliasData(UErrorCode *pErrorCode) {
currOffset += reservedSize1;
gStringTable = table + currOffset;
+ ucln_common_registerCleanup(UCLN_COMMON_UCNV_IO, ucnv_io_cleanup);
}
umtx_unlock(NULL);
@@ -293,39 +348,6 @@ isAlias(const char *alias, UErrorCode *pErrorCode) {
}
}
-UBool
-ucnv_io_cleanup()
-{
- if (gAliasData) {
- udata_close(gAliasData);
- gAliasData = NULL;
- }
-
- ucnv_io_flushAvailableConverterCache();
-
- gConverterListSize = 0;
- gTagListSize = 0;
- gAliasListSize = 0;
- gUntaggedConvArraySize = 0;
- gTaggedAliasArraySize = 0;
- gTaggedAliasListsSize = 0;
- gStringTableSize = 0;
-
- gConverterList = NULL;
- gTagList = NULL;
- gAliasList = NULL;
- gUntaggedConvArray = NULL;
- gTaggedAliasArray = NULL;
- gTaggedAliasLists = NULL;
- gStringTable = NULL;
-
- gDefaultConverterName = NULL;
- gDefaultConverterNameBuffer[0] = 0;
-
- return TRUE; /* Everything was cleaned up */
-}
-
-
static uint32_t getTagNumber(const char *tagname) {
if (gTagList) {
uint32_t tagNum;
@@ -341,18 +363,36 @@ static uint32_t getTagNumber(const char *tagname) {
/* @see ucnv_compareNames */
U_CFUNC char * U_EXPORT2
-ucnv_io_stripForCompare(char *dst, const char *name) {
+ucnv_io_stripASCIIForCompare(char *dst, const char *name) {
char c1 = *name;
char *dstItr = dst;
while (c1) {
/* Ignore delimiters '-', '_', and ' ' */
- while ((c1 = *name) == '-' || c1 == '_' || c1 == ' ') {
+ while ((c1 = *name) == 0x2d || c1 == 0x5f || c1 == 0x20) {
++name;
}
/* lowercase for case-insensitive comparison */
- *(dstItr++) = uprv_tolower(c1);
+ *(dstItr++) = uprv_asciitolower(c1);
+ ++name;
+ }
+ return dst;
+}
+
+U_CFUNC char * U_EXPORT2
+ucnv_io_stripEBCDICForCompare(char *dst, const char *name) {
+ char c1 = *name;
+ char *dstItr = dst;
+
+ while (c1) {
+ /* Ignore delimiters '-', '_', and ' ' */
+ while ((c1 = *name) == 0x60 || c1 == 0x6d || c1 == 0x40) {
+ ++name;
+ }
+
+ /* lowercase for case-insensitive comparison */
+ *(dstItr++) = uprv_ebcdictolower(c1);
++name;
}
return dst;
@@ -363,12 +403,12 @@ ucnv_io_stripForCompare(char *dst, const char *name) {
* is case-insensitive. It also ignores the characters '-', '_', and
* ' ' (dash, underscore, and space). Thus the strings "UTF-8",
* "utf_8", and "Utf 8" are exactly equivalent.
- *
+ *
* This is a symmetrical (commutative) operation; order of arguments
* is insignificant. This is an important property for sorting the
* list (when the list is preprocessed into binary form) and for
* performing binary searches on it at run time.
- *
+ *
* @param name1 a converter name or alias, zero-terminated
* @param name2 a converter name or alias, zero-terminated
* @return 0 if the names match, or a negative value if the name1
@@ -395,7 +435,7 @@ ucnv_compareNames(const char *name1, const char *name2) {
if ((c1|c2)==0) {
return 0;
}
-
+
/* Case-insensitive comparison */
rc = (int)(unsigned char)uprv_tolower(c1) -
(int)(unsigned char)uprv_tolower(c2);
@@ -414,21 +454,21 @@ ucnv_compareNames(const char *name1, const char *name2) {
static U_INLINE uint32_t
findConverter(const char *alias, UErrorCode *pErrorCode) {
uint32_t mid, start, limit;
- uint32_t lastMid;
+ uint32_t lastMid;
int result;
/* do a binary search for the alias */
start = 0;
limit = gUntaggedConvArraySize;
mid = limit;
- lastMid = UINT32_MAX;
+ lastMid = UINT32_MAX;
for (;;) {
mid = (uint32_t)((start + limit) / 2);
- if (lastMid == mid) { /* Have we moved? */
- break; /* We haven't moved, and it wasn't found. */
- }
- lastMid = mid;
+ if (lastMid == mid) { /* Have we moved? */
+ break; /* We haven't moved, and it wasn't found. */
+ }
+ lastMid = mid;
result = ucnv_compareNames(alias, GET_STRING(gAliasList[mid]));
if (result < 0) {
@@ -454,7 +494,7 @@ findConverter(const char *alias, UErrorCode *pErrorCode) {
* Is this alias in this list?
* alias and listOffset should be non-NULL.
*/
-static U_INLINE UBool
+static U_INLINE UBool
isAliasInList(const char *alias, uint32_t listOffset) {
if (listOffset) {
uint32_t currAlias;
@@ -612,7 +652,7 @@ ucnv_io_nextStandardAliases(UEnumeration *enumerator,
if (myContext->listIdx < listCount) {
const char *myStr = GET_STRING(currList[myContext->listIdx++]);
if (resultLength) {
- *resultLength = uprv_strlen(myStr);
+ *resultLength = (int32_t)uprv_strlen(myStr);
}
return myStr;
}
@@ -848,6 +888,7 @@ static UBool haveAvailableConverterList(UErrorCode *pErrorCode) {
if (gAvailableConverters == NULL) {
gAvailableConverters = localConverterList;
gAvailableConverterCount = localConverterCount;
+ /* haveData should have already registered the cleanup function */
}
else {
uprv_free((char **)localConverterList);
@@ -891,7 +932,7 @@ ucnv_io_nextAllConverters(UEnumeration *enumerator,
if (*myContext < gConverterListSize) {
const char *myStr = GET_STRING(gConverterList[(*myContext)++]);
if (resultLength) {
- *resultLength = uprv_strlen(myStr);
+ *resultLength = (int32_t)uprv_strlen(myStr);
}
return myStr;
}
@@ -971,9 +1012,7 @@ ucnv_io_getDefaultConverterName() {
UConverter *cnv = NULL;
int32_t length = 0;
- umtx_lock(NULL);
name = uprv_getDefaultCodepage();
- umtx_unlock(NULL);
/* if the name is there, test it out and get the canonical name with options */
if(name != NULL) {
@@ -988,7 +1027,7 @@ ucnv_io_getDefaultConverterName() {
|| length>=sizeof(gDefaultConverterNameBuffer))
{
/* Panic time, let's use a fallback. */
-#if (U_CHARSET_FAMILY == U_ASCII_FAMILY)
+#if (U_CHARSET_FAMILY == U_ASCII_FAMILY)
name = "US-ASCII";
/* there is no 'algorithmic' converter for EBCDIC */
#elif defined(OS390)
@@ -1006,6 +1045,7 @@ ucnv_io_getDefaultConverterName() {
gDefaultConverterNameBuffer[length]=0;
gDefaultConverterName = gDefaultConverterNameBuffer;
name = gDefaultConverterName;
+ ucln_common_registerCleanup(UCLN_COMMON_UCNV_IO, ucnv_io_cleanup);
umtx_unlock(NULL);
/* The close may make the current name go away. */
@@ -1019,7 +1059,9 @@ U_CFUNC void
ucnv_io_setDefaultConverterName(const char *converterName) {
if(converterName==NULL) {
/* reset to the default codepage */
+ umtx_lock(NULL);
gDefaultConverterName=NULL;
+ umtx_unlock(NULL);
} else {
UErrorCode errorCode=U_ZERO_ERROR;
const char *name=ucnv_io_getConverterName(converterName, &errorCode);
@@ -1038,11 +1080,261 @@ ucnv_io_setDefaultConverterName(const char *converterName) {
gDefaultConverterName=gDefaultConverterNameBuffer;
}
}
-
umtx_unlock(NULL);
}
}
+/* alias table swapping ----------------------------------------------------- */
+
+typedef char * U_CALLCONV StripForCompareFn(char *dst, const char *name);
+
+/*
+ * row of a temporary array
+ *
+ * gets platform-endian charset string indexes and sorting indexes;
+ * after sorting this array by strings, the actual arrays are permutated
+ * according to the sorting indexes
+ */
+typedef struct TempRow {
+ uint16_t strIndex, sortIndex;
+} TempRow;
+
+typedef struct TempAliasTable {
+ const char *chars;
+ TempRow *rows;
+ uint16_t *resort;
+ StripForCompareFn *stripForCompare;
+} TempAliasTable;
+
+enum {
+ STACK_ROW_CAPACITY=500
+};
+
+static int32_t
+io_compareRows(const void *context, const void *left, const void *right) {
+ char strippedLeft[UCNV_MAX_CONVERTER_NAME_LENGTH],
+ strippedRight[UCNV_MAX_CONVERTER_NAME_LENGTH];
+
+ TempAliasTable *tempTable=(TempAliasTable *)context;
+ const char *chars=tempTable->chars;
+
+ return (int32_t)uprv_strcmp(tempTable->stripForCompare(strippedLeft, chars+2*((const TempRow *)left)->strIndex),
+ tempTable->stripForCompare(strippedRight, chars+2*((const TempRow *)right)->strIndex));
+}
+
+U_CAPI int32_t U_EXPORT2
+ucnv_swapAliases(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode) {
+ const UDataInfo *pInfo;
+ int32_t headerSize;
+
+ const uint16_t *inTable;
+ uint32_t toc[offsetsCount];
+ uint32_t offsets[offsetsCount]; /* 16-bit-addressed offsets from inTable/outTable */
+ uint32_t i, count, tocLength, topOffset;
+
+ TempRow rows[STACK_ROW_CAPACITY];
+ uint16_t resort[STACK_ROW_CAPACITY];
+ TempAliasTable tempTable;
+
+ /* udata_swapDataHeader checks the arguments */
+ headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+
+ /* check data format and format version */
+ pInfo=(const UDataInfo *)((const char *)inData+4);
+ if(!(
+ pInfo->dataFormat[0]==0x43 && /* dataFormat="CvAl" */
+ pInfo->dataFormat[1]==0x76 &&
+ pInfo->dataFormat[2]==0x41 &&
+ pInfo->dataFormat[3]==0x6c &&
+ pInfo->formatVersion[0]==3
+ )) {
+ udata_printError(ds, "ucnv_swapAliases(): data format %02x.%02x.%02x.%02x (format version %02x) is not an alias table\n",
+ pInfo->dataFormat[0], pInfo->dataFormat[1],
+ pInfo->dataFormat[2], pInfo->dataFormat[3],
+ pInfo->formatVersion[0]);
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+
+ /* an alias table must contain at least the table of contents array */
+ if(length>=0 && (length-headerSize)<4*(1+minTocLength)) {
+ udata_printError(ds, "ucnv_swapAliases(): too few bytes (%d after header) for an alias table\n",
+ length-headerSize);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ inTable=(const uint16_t *)((const char *)inData+headerSize);
+ toc[tocLengthIndex]=tocLength=ds->readUInt32(((const uint32_t *)inTable)[tocLengthIndex]);
+ if(tocLength<minTocLength) {
+ udata_printError(ds, "ucnv_swapAliases(): table of contents too short (%u sections)\n", tocLength);
+ *pErrorCode=U_INVALID_FORMAT_ERROR;
+ return 0;
+ }
+
+ /* read the known part of the table of contents */
+ for(i=converterListIndex; i<=minTocLength; ++i) {
+ toc[i]=ds->readUInt32(((const uint32_t *)inTable)[i]);
+ }
+
+ /* compute offsets */
+ offsets[tocLengthIndex]=0;
+ offsets[converterListIndex]=2*(1+tocLength); /* count two 16-bit units per toc entry */
+ for(i=tagListIndex; i<=stringTableIndex; ++i) {
+ offsets[i]=offsets[i-1]+toc[i-1];
+ }
+
+ /* compute the overall size of the after-header data, in numbers of 16-bit units */
+ topOffset=offsets[i-1]+toc[i-1];
+
+ if(length>=0) {
+ uint16_t *outTable;
+ const uint16_t *p, *p2;
+ uint16_t *q, *q2;
+ uint16_t oldIndex;
+
+ if((length-headerSize)<(2*(int32_t)topOffset)) {
+ udata_printError(ds, "ucnv_swapAliases(): too few bytes (%d after header) for an alias table\n",
+ length-headerSize);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ outTable=(uint16_t *)((char *)outData+headerSize);
+
+ /* swap the entire table of contents */
+ ds->swapArray32(ds, inTable, 4*(1+tocLength), outTable, pErrorCode);
+
+ /* swap strings */
+ ds->swapInvChars(ds, inTable+offsets[stringTableIndex], 2*(int32_t)toc[stringTableIndex],
+ outTable+offsets[stringTableIndex], pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ udata_printError(ds, "ucnv_swapAliases().swapInvChars(charset names) failed - %s\n",
+ u_errorName(*pErrorCode));
+ return 0;
+ }
+
+ if(ds->inCharset==ds->outCharset) {
+ /* no need to sort, just swap all 16-bit values together */
+ ds->swapArray16(ds,
+ inTable+offsets[converterListIndex],
+ 2*(int32_t)(offsets[stringTableIndex]-offsets[converterListIndex]),
+ outTable+offsets[converterListIndex],
+ pErrorCode);
+ } else {
+ /* allocate the temporary table for sorting */
+ count=toc[aliasListIndex];
+
+ tempTable.chars=(const char *)(outTable+offsets[stringTableIndex]); /* sort by outCharset */
+
+ if(count<=STACK_ROW_CAPACITY) {
+ tempTable.rows=rows;
+ tempTable.resort=resort;
+ } else {
+ tempTable.rows=(TempRow *)uprv_malloc(count*sizeof(TempRow)+count*2);
+ if(tempTable.rows==NULL) {
+ udata_printError(ds, "ucnv_swapAliases(): unable to allocate memory for sorting tables (max length: %u)\n",
+ count);
+ *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
+ tempTable.resort=(uint16_t *)(tempTable.rows+count);
+ }
+
+ if(ds->outCharset==U_ASCII_FAMILY) {
+ tempTable.stripForCompare=ucnv_io_stripASCIIForCompare;
+ } else /* U_EBCDIC_FAMILY */ {
+ tempTable.stripForCompare=ucnv_io_stripEBCDICForCompare;
+ }
+
+ /*
+ * Sort unique aliases+mapped names.
+ *
+ * We need to sort the list again by outCharset strings because they
+ * sort differently for different charset families.
+ * First we set up a temporary table with the string indexes and
+ * sorting indexes and sort that.
+ * Then we permutate and copy/swap the actual values.
+ */
+ p=inTable+offsets[aliasListIndex];
+ q=outTable+offsets[aliasListIndex];
+
+ p2=inTable+offsets[untaggedConvArrayIndex];
+ q2=outTable+offsets[untaggedConvArrayIndex];
+
+ for(i=0; i<count; ++i) {
+ tempTable.rows[i].strIndex=ds->readUInt16(p[i]);
+ tempTable.rows[i].sortIndex=(uint16_t)i;
+ }
+
+ uprv_sortArray(tempTable.rows, (int32_t)count, sizeof(TempRow),
+ io_compareRows, &tempTable,
+ FALSE, pErrorCode);
+
+ if(U_SUCCESS(*pErrorCode)) {
+ /* copy/swap/permutate items */
+ if(p!=q) {
+ for(i=0; i<count; ++i) {
+ oldIndex=tempTable.rows[i].sortIndex;
+ ds->swapArray16(ds, p+oldIndex, 2, q+i, pErrorCode);
+ ds->swapArray16(ds, p2+oldIndex, 2, q2+i, pErrorCode);
+ }
+ } else {
+ /*
+ * If we swap in-place, then the permutation must use another
+ * temporary array (tempTable.resort)
+ * before the results are copied to the outBundle.
+ */
+ uint16_t *r=tempTable.resort;
+
+ for(i=0; i<count; ++i) {
+ oldIndex=tempTable.rows[i].sortIndex;
+ ds->swapArray16(ds, p+oldIndex, 2, r+i, pErrorCode);
+ }
+ uprv_memcpy(q, r, 2*count);
+
+ for(i=0; i<count; ++i) {
+ oldIndex=tempTable.rows[i].sortIndex;
+ ds->swapArray16(ds, p2+oldIndex, 2, r+i, pErrorCode);
+ }
+ uprv_memcpy(q2, r, 2*count);
+ }
+ }
+
+ if(tempTable.rows!=rows) {
+ uprv_free(tempTable.rows);
+ }
+
+ if(U_FAILURE(*pErrorCode)) {
+ udata_printError(ds, "ucnv_swapAliases().uprv_sortArray(%u items) failed - %s\n",
+ count, u_errorName(*pErrorCode));
+ return 0;
+ }
+
+ /* swap remaining 16-bit values */
+ ds->swapArray16(ds,
+ inTable+offsets[converterListIndex],
+ 2*(int32_t)(offsets[aliasListIndex]-offsets[converterListIndex]),
+ outTable+offsets[converterListIndex],
+ pErrorCode);
+ ds->swapArray16(ds,
+ inTable+offsets[taggedAliasArrayIndex],
+ 2*(int32_t)(offsets[stringTableIndex]-offsets[taggedAliasArrayIndex]),
+ outTable+offsets[taggedAliasArrayIndex],
+ pErrorCode);
+ }
+ }
+
+ return headerSize+2*(int32_t)topOffset;
+}
+
+#endif
+
/*
* Hey, Emacs, please set the following:
*
@@ -1051,4 +1343,3 @@ ucnv_io_setDefaultConverterName(const char *converterName) {
* End:
*
*/
-
diff --git a/icuSources/common/ucnv_io.h b/icuSources/common/ucnv_io.h
index fe85950f..7ae6def4 100644
--- a/icuSources/common/ucnv_io.h
+++ b/icuSources/common/ucnv_io.h
@@ -1,6 +1,6 @@
/*
**********************************************************************
- * Copyright (C) 1999-2003, International Business Machines
+ * Copyright (C) 1999-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@@ -15,20 +15,36 @@
#include "unicode/utypes.h"
+#if !UCONFIG_NO_CONVERSION
+
+#include "udataswp.h"
+
#define UCNV_AMBIGUOUS_ALIAS_MAP_BIT 0x8000
#define UCNV_CONVERTER_INDEX_MASK 0xFFF
#define UCNV_NUM_RESERVED_TAGS 2
#define UCNV_NUM_HIDDEN_TAGS 1
/**
+ * \var ucnv_io_stripForCompare
* Remove the underscores, dashes and spaces from the name, and convert
* the name to lower case.
* @param dst The destination buffer, which is <= the buffer of name.
* @param dst The destination buffer, which is <= the buffer of name.
* @return the destination buffer.
*/
+#if U_CHARSET_FAMILY==U_ASCII_FAMILY
+# define ucnv_io_stripForCompare ucnv_io_stripASCIIForCompare
+#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
+# define ucnv_io_stripForCompare ucnv_io_stripEBCDICForCompare
+#else
+# error U_CHARSET_FAMILY is not valid
+#endif
+
+U_CFUNC char * U_EXPORT2
+ucnv_io_stripASCIIForCompare(char *dst, const char *name);
+
U_CFUNC char * U_EXPORT2
-ucnv_io_stripForCompare(char *dst, const char *name);
+ucnv_io_stripEBCDICForCompare(char *dst, const char *name);
/**
* Map a converter alias name to a canonical converter name.
@@ -137,6 +153,17 @@ ucnv_io_getDefaultConverterName(void);
U_CFUNC void
ucnv_io_setDefaultConverterName(const char *name);
+/**
+ * Swap an ICU converter alias table. See ucnv_io.c.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+ucnv_swapAliases(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode);
+
+#endif
+
#endif /* _UCNV_IO */
/*
diff --git a/icuSources/common/ucnv_lmb.c b/icuSources/common/ucnv_lmb.c
index 465040f8..e9942a4b 100644
--- a/icuSources/common/ucnv_lmb.c
+++ b/icuSources/common/ucnv_lmb.c
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 2000-2003, International Business Machines
+* Copyright (C) 2000-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_lmb.cpp
@@ -25,14 +25,20 @@
#include "unicode/utypes.h"
-#if !UCONFIG_NO_LEGACY_CONVERSION
+#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
-#include "cmemory.h"
#include "unicode/ucnv_err.h"
-#include "ucnv_bld.h"
#include "unicode/ucnv.h"
+#include "unicode/uset.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "uassert.h"
+#include "ucnv_imp.h"
+#include "ucnv_bld.h"
#include "ucnv_cnv.h"
+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+
/*
LMBCS
@@ -218,7 +224,13 @@ Because of the extensive use of other character sets, the LMBCS converter
keeps a mapping between optimization groups and IBM character sets, so that
ICU converters can be created and used as needed. */
-static const char * const OptGroupByteToCPName[ULMBCS_CTRLOFFSET] = {
+/* As you can see, even though any byte below 0x20 could be an optimization
+byte, only those at 0x13 or below can map to an actual converter. To limit
+some loops and searches, we define a value for that last group converter:*/
+
+#define ULMBCS_GRP_LAST 0x13 /* last LMBCS group that has a converter */
+
+static const char * const OptGroupByteToCPName[ULMBCS_GRP_LAST + 1] = {
/* 0x0000 */ "lmb-excp", /* internal home for the LOTUS exceptions list */
/* 0x0001 */ "ibm-850",
/* 0x0002 */ "ibm-851",
@@ -244,12 +256,6 @@ static const char * const OptGroupByteToCPName[ULMBCS_CTRLOFFSET] = {
and 0x0019, the 1-2-3 system range control char */
};
-/* As you can see, even though any byte below 0x20 could be an optimization
-byte, only those at 0x13 or below can map to an actual converter. To limit
-some loops and searches, we define a value for that last group converter:*/
-
-#define ULMBCS_GRP_LAST 0x13 /* last LMBCS group that has a converter */
-
/* That's approximately all the data that's needed for translating
LMBCS to Unicode.
@@ -480,7 +486,7 @@ FindLMBCSLocale(const char *LocaleID)
if (*pTable->LocaleID == *LocaleID) /* Check only first char for speed */
{
/* First char matches - check whole name, for entry-length */
- if (strncmp(pTable->LocaleID, LocaleID, strlen(pTable->LocaleID)) == 0)
+ if (uprv_strncmp(pTable->LocaleID, LocaleID, strlen(pTable->LocaleID)) == 0)
return pTable->OptGroup;
}
else
@@ -505,6 +511,13 @@ FindLMBCSLocale(const char *LocaleID)
the definitions of these structures, see unicode\ucnv_bld.h
*/
+typedef struct
+ {
+ UConverterSharedData *OptGrpConverter[ULMBCS_GRP_LAST+1]; /* Converter per Opt. grp. */
+ uint8_t OptGroup; /* default Opt. grp. for this LMBCS session */
+ uint8_t localeConverterIndex; /* reasonable locale match for index */
+ }
+UConverterDataLMBCS;
#define DECLARE_LMBCS_DATA(n) \
@@ -518,17 +531,17 @@ static const UConverterImpl _LMBCSImpl##n={\
_LMBCSToUnicodeWithOffsets,\
_LMBCSFromUnicode,\
_LMBCSFromUnicode,\
- _LMBCSGetNextUChar,\
NULL,\
NULL,\
NULL,\
NULL,\
- ucnv_getCompleteUnicodeSet\
+ _LMBCSSafeClone,\
+ _LMBCSGetUnicodeSet\
};\
static const UConverterStaticData _LMBCSStaticData##n={\
sizeof(UConverterStaticData),\
"LMBCS-" #n,\
- 0, UCNV_IBM, UCNV_LMBCS_##n, 1, 2,\
+ 0, UCNV_IBM, UCNV_LMBCS_##n, 1, 3,\
{ 0x3f, 0, 0, 0 },1,FALSE,FALSE,0,0,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} \
};\
const UConverterSharedData _LMBCSData##n={\
@@ -558,21 +571,32 @@ _LMBCSOpenWorker(UConverter* _this,
ulmbcs_byte_t OptGroup
)
{
- UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS*)uprv_malloc (sizeof (UConverterDataLMBCS));
- if(extraInfo != NULL)
+ UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS*)uprv_malloc (sizeof (UConverterDataLMBCS));
+ if(extraInfo != NULL)
{
- ulmbcs_byte_t i;
- ulmbcs_byte_t imax;
- imax = sizeof(extraInfo->OptGrpConverter)/sizeof(extraInfo->OptGrpConverter[0]);
-
- for (i=0; i < imax; i++)
- {
- extraInfo->OptGrpConverter[i] =
- (OptGroupByteToCPName[i] != NULL) ?
- ucnv_open(OptGroupByteToCPName[i], err) : NULL;
- }
- extraInfo->OptGroup = OptGroup;
- extraInfo->localeConverterIndex = FindLMBCSLocale(locale);
+ ulmbcs_byte_t i;
+
+ uprv_memset(extraInfo, 0, sizeof(UConverterDataLMBCS));
+
+ for (i=0; i <= ULMBCS_GRP_LAST && U_SUCCESS(*err); i++)
+ {
+ if(OptGroupByteToCPName[i] != NULL) {
+ extraInfo->OptGrpConverter[i] = ucnv_loadSharedData(OptGroupByteToCPName[i], NULL, err);
+ }
+ }
+
+ if(U_SUCCESS(*err)) {
+ extraInfo->OptGroup = OptGroup;
+ extraInfo->localeConverterIndex = FindLMBCSLocale(locale);
+ } else {
+ /* one of the subconverters could not be loaded, unload the previous ones */
+ while(i > 0) {
+ if(extraInfo->OptGrpConverter[--i] != NULL) {
+ ucnv_unloadSharedDataIfReady(extraInfo->OptGrpConverter[i]);
+ extraInfo->OptGrpConverter[i] = NULL;
+ }
+ }
+ }
}
else
{
@@ -584,30 +608,69 @@ _LMBCSOpenWorker(UConverter* _this,
static void
_LMBCSClose(UConverter * _this)
{
- if (_this->extraInfo != NULL && !_this->isExtraLocal)
+ if (_this->extraInfo != NULL)
{
ulmbcs_byte_t Ix;
UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo;
- for (Ix=0; Ix < ULMBCS_GRP_UNICODE; Ix++)
+ for (Ix=0; Ix <= ULMBCS_GRP_LAST; Ix++)
{
if (extraInfo->OptGrpConverter[Ix] != NULL)
- ucnv_close (extraInfo->OptGrpConverter[Ix]);
+ ucnv_unloadSharedDataIfReady(extraInfo->OptGrpConverter[Ix]);
+ }
+ if (!_this->isExtraLocal) {
+ uprv_free (_this->extraInfo);
}
- uprv_free (_this->extraInfo);
}
}
-/*
-Here's an all-crash stop for debugging, since ICU does not have asserts.
-Turn this on by defining LMBCS_DEBUG, or by changing it to
-#if 1
-*/
-#if LMBCS_DEBUG
-#define MyAssert(b) {if (!(b)) {*(char *)0 = 1;}}
-#else
-#define MyAssert(b)
-#endif
+typedef struct LMBCSClone {
+ UConverter cnv;
+ UConverterDataLMBCS lmbcs;
+} LMBCSClone;
+
+static UConverter *
+_LMBCSSafeClone(const UConverter *cnv,
+ void *stackBuffer,
+ int32_t *pBufferSize,
+ UErrorCode *status) {
+ LMBCSClone *newLMBCS;
+ UConverterDataLMBCS *extraInfo;
+ int32_t i;
+
+ if(*pBufferSize<=0) {
+ *pBufferSize=(int32_t)sizeof(LMBCSClone);
+ return NULL;
+ }
+
+ extraInfo=(UConverterDataLMBCS *)cnv->extraInfo;
+ newLMBCS=(LMBCSClone *)stackBuffer;
+
+ /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
+
+ uprv_memcpy(&newLMBCS->lmbcs, extraInfo, sizeof(UConverterDataLMBCS));
+
+ /* share the subconverters */
+ for(i = 0; i <= ULMBCS_GRP_LAST; ++i) {
+ if(extraInfo->OptGrpConverter[i] != NULL) {
+ ucnv_incrementRefCount(extraInfo->OptGrpConverter[i]);
+ }
+ }
+
+ newLMBCS->cnv.extraInfo = &newLMBCS->lmbcs;
+ newLMBCS->cnv.isExtraLocal = TRUE;
+ return &newLMBCS->cnv;
+}
+
+static void
+_LMBCSGetUnicodeSet(const UConverter *cnv,
+ USetAdder *sa,
+ UConverterUnicodeSet which,
+ UErrorCode *pErrorCode) {
+ /* all but U+F6xx, see LMBCS explanation above (search for F6xx) */
+ sa->addRange(sa->set, 0, 0xf5ff);
+ sa->addRange(sa->set, 0xf700, 0x10ffff);
+}
/*
Here's the basic helper function that we use when converting from
@@ -627,33 +690,21 @@ LMBCSConversionWorker (
)
{
ulmbcs_byte_t * pLMBCS = pStartLMBCS;
- UConverter * xcnv = extraInfo->OptGrpConverter[group];
+ UConverterSharedData * xcnv = extraInfo->OptGrpConverter[group];
int bytesConverted;
uint32_t value;
ulmbcs_byte_t firstByte;
- MyAssert(xcnv);
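- MyAssert(group<ULMBCS_GRP_UNICODE);
+ U_ASSERT(xcnv);
+ U_ASSERT(group<ULMBCS_GRP_UNICODE);
- bytesConverted = _MBCSFromUChar32(xcnv->sharedData, *pUniChar, &value, FALSE);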
+ bytesConverted = ucnv_MBCSFromUChar32(xcnv, *pUniChar, &value, FALSE);
/* get the first result byte */
- switch(bytesConverted)
- {
- case 4:
- firstByte = (ulmbcs_byte_t)(value >> 24);
- break;
- case 3:
- firstByte = (ulmbcs_byte_t)(value >> 16);
- break;
- case 2:
- firstByte = (ulmbcs_byte_t)(value >> 8);
- break;
- case 1:
- firstByte = (ulmbcs_byte_t)value;
- break;
- default:
+ if(bytesConverted > 0) {
+ firstByte = (ulmbcs_byte_t)(value >> ((bytesConverted - 1) * 8));
+ } else {
/* most common failure mode is an unassigned character */
groups_tried[group] = TRUE;
return 0;
@@ -664,7 +715,7 @@ LMBCSConversionWorker (
/* All initial byte values in lower ascii range should have been caught by now,
except with the exception group.
*/
- MyAssert((firstByte <= ULMBCS_C0END) || (firstByte >= ULMBCS_C1START) || (group == ULMBCS_GRP_EXCEPT));
+ U_ASSERT((firstByte <= ULMBCS_C0END) || (firstByte >= ULMBCS_C1START) || (group == ULMBCS_GRP_EXCEPT));
/* use converted data: first write 0, 1 or two group bytes */
if (group != ULMBCS_GRP_EXCEPT && extraInfo->OptGroup != group)
@@ -826,7 +877,7 @@ _LMBCSFromUnicode(UConverterFromUnicodeArgs* args,
}
if (!bytes_written) /* the ambiguous group cases (Strategy 3) */
{
- memset(groups_tried, 0, sizeof(groups_tried));
+ uprv_memset(groups_tried, 0, sizeof(groups_tried));
/* check for non-default optimization group (Strategy 3A )*/
if (extraInfo->OptGroup != 1
@@ -930,16 +981,6 @@ _LMBCSFromUnicode(UConverterFromUnicodeArgs* args,
/* Now, the Unicode from LMBCS section */
-/*
- Special codes for the getNextUnicodeWorker -- usually as the result of
- special error-callback behavior:
- ULMBCS_SKIP To control skipping over LMBCS sequences
- ULMBCS_MULTI To indicate that a single LMBCS char translates to
- multiple uniChars
-*/
-#define ULMBCS_SKIP U_ERROR_LIMIT
-#define ULMBCS_MULTI ULMBCS_SKIP+1
-
/* A function to call when we are looking at the Unicode group byte in LMBCS */
static UChar
GetUniFromLMBCSUni(char const ** ppLMBCSin) /* Called with LMBCS-style Unicode byte stream */
@@ -958,26 +999,22 @@ GetUniFromLMBCSUni(char const ** ppLMBCSin) /* Called with LMBCS-style Unicode
/* CHECK_SOURCE_LIMIT: Helper macro to verify that there are at least'index'
- bytes left in source up to sourceLimit.Errors appropriately if not
+ bytes left in source up to sourceLimit.Errors appropriately if not.
+ If we reach the limit, then update the source pointer to there to consume
+ all input as required by ICU converter semantics.
*/
#define CHECK_SOURCE_LIMIT(index) \
if (args->source+index > args->sourceLimit){\
*err = U_TRUNCATED_CHAR_FOUND;\
- args->source = saveSource;\
+ args->source = args->sourceLimit;\
return 0xffff;}
-/* Return the Unicode representation for the current LMBCS character
-
- This worker function is used by both ucnv_getNextUChar() and ucnv_ToUnicode().
- The last parameter says whether the return value should be treated as UTF-16 or
- UTF-32. The only difference is in surrogate handling
-*/
+/* Return the Unicode representation for the current LMBCS character */
static UChar32
_LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
- UErrorCode* err,
- UBool returnUTF32)
+ UErrorCode* err)
{
UChar32 uniChar = 0; /* an output UNICODE char */
ulmbcs_byte_t CurByte; /* A byte from the input stream */
@@ -1015,7 +1052,7 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
{
UConverterDataLMBCS * extraInfo;
ulmbcs_byte_t group;
- UConverter* cnv;
+ UConverterSharedData *cnv;
if (CurByte == ULMBCS_GRP_CTRL) /* Control character group - no opt group update */
{
@@ -1027,27 +1064,16 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
else
if (CurByte == ULMBCS_GRP_UNICODE) /* Unicode compatibility group: BigEndian UTF16 */
{
- UChar second;
CHECK_SOURCE_LIMIT(2);
- uniChar = GetUniFromLMBCSUni(&(args->source));
-
- /* at this point we are usually done, but we need to make sure we are not in
- a situation where we can successfully put together a surrogate pair */
-
- if(returnUTF32 && UTF_IS_FIRST_SURROGATE(uniChar) && (args->source+3 <= args->sourceLimit)
- && *(args->source)++ == ULMBCS_GRP_UNICODE
- && UTF_IS_SECOND_SURROGATE(second = GetUniFromLMBCSUni(&(args->source))))
- {
- uniChar = UTF16_GET_PAIR_VALUE(uniChar, second);
- }
+ /* don't check for error indicators fffe/ffff below */
+ return GetUniFromLMBCSUni(&(args->source));
}
else if (CurByte <= ULMBCS_CTRLOFFSET)
{
group = CurByte; /* group byte is in the source */
extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
- cnv = extraInfo->OptGrpConverter[group];
- if (!cnv)
+ if (group > ULMBCS_GRP_LAST || (cnv = extraInfo->OptGrpConverter[group]) == NULL)
{
/* this is not a valid group byte - no converter*/
*err = U_INVALID_CHAR_FOUND;
@@ -1061,12 +1087,12 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
if (*args->source == group) {
/* single byte */
++args->source;
- uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 1, FALSE);
+ uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source, 1, FALSE);
+ ++args->source;
} else {
/* double byte */
- const char *newLimit = args->source + 2;
- uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, newLimit, FALSE);
- args->source = newLimit; /* set the correct limit even in case of an error */
+ uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source, 2, FALSE);
+ args->source += 2;
}
}
else { /* single byte conversion */
@@ -1075,14 +1101,13 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
if (CurByte >= ULMBCS_C1START)
{
- uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv->sharedData, CurByte);
+ uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv, CurByte);
}
else
{
/* The non-optimizable oddballs where there is an explicit byte
* AND the second byte is not in the upper ascii range
*/
- const char *s;
char bytes[2];
extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
@@ -1091,8 +1116,7 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
/* Lookup value must include opt group */
bytes[0] = group;
bytes[1] = CurByte;
- s = bytes;
- uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &s, bytes + 2, FALSE);
+ uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, bytes, 2, FALSE);
}
}
}
@@ -1103,92 +1127,31 @@ _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs* args,
cnv = extraInfo->OptGrpConverter[group];
if (group >= ULMBCS_DOUBLEOPTGROUP_START) /* double byte conversion */
{
- if (!_MBCSIsLeadByte(cnv->sharedData, CurByte))
+ if (!ucnv_MBCSIsLeadByte(cnv, CurByte))
{
CHECK_SOURCE_LIMIT(0);
/* let the MBCS conversion consume CurByte again */
- --args->source;
- uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 1, FALSE);
+ uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source - 1, 1, FALSE);
}
else
{
CHECK_SOURCE_LIMIT(1);
/* let the MBCS conversion consume CurByte again */
- --args->source;
- /* since we know that we start at a lead byte, args->source _will_ be incremented by 2 */
- uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 2, FALSE);
+ uniChar = ucnv_MBCSSimpleGetNextUChar(cnv, args->source - 1, 2, FALSE);
+ ++args->source;
}
}
else /* single byte conversion */
{
- uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv->sharedData, CurByte);
+ uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv, CurByte);
}
}
}
- if (((uint32_t)uniChar - 0xfffe) <= 1) /* 0xfffe<=uniChar<=0xffff */
- {
- UConverterToUnicodeArgs cbArgs = *args;
- UConverterCallbackReason reason;
- UChar UCh;
-
- if (uniChar == 0xfffe)
- {
- reason = UCNV_UNASSIGNED;
- *err = U_INVALID_CHAR_FOUND;
- }
- else
- {
- reason = UCNV_ILLEGAL;
- *err = U_ILLEGAL_CHAR_FOUND;
- }
-
- cbArgs.target = &UCh;
- cbArgs.targetLimit = &UCh + 1;
- cbArgs.converter->fromCharErrorBehaviour(cbArgs.converter->toUContext,
- &cbArgs,
- saveSource,
- args->source - saveSource,
- reason,
- err);
-
- if (cbArgs.target != &UCh)
- {
- uniChar = (UChar32) UCh;
- }
- /* Did error functor skip */
- if (U_SUCCESS(*err) && cbArgs.target == &UCh)
- {
- *err = ULMBCS_SKIP;
- }
- /* Did error functor try to write multiple UChars? */
- else if (*err == U_BUFFER_OVERFLOW_ERROR)
- {
- *err = ULMBCS_MULTI;
- }
- }
return uniChar;
}
-/* The exported function that gets one UTF32 character from a LMBCS stream
-*/
-static UChar32
-_LMBCSGetNextUChar(UConverterToUnicodeArgs* args,
- UErrorCode* err)
-{
- UChar32 nextUChar;
- do {
- nextUChar = _LMBCSGetNextUCharWorker(args, err, TRUE);
- } while (*err == ULMBCS_SKIP);
-
- if (*err == ULMBCS_MULTI)
- {
- *err = U_ZERO_ERROR;
- }
- return nextUChar;
-}
-
/* The exported function that converts lmbcs to one or more
UChars - currently UTF-16
*/
@@ -1196,50 +1159,44 @@ static void
_LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args,
UErrorCode* err)
{
+ char LMBCS [ULMBCS_CHARSIZE_MAX];
UChar uniChar; /* one output UNICODE char */
- const char * saveSource = args->source; /* beginning of current code point */
+ const char * saveSource; /* beginning of current code point */
const char * pStartLMBCS = args->source; /* beginning of whole string */
+ const char * errSource = NULL; /* pointer to actual input in case an error occurs */
+ int8_t savebytes = 0;
- if (args->targetLimit == args->target) /* error check may belong in common code */
- {
- *err = U_BUFFER_OVERFLOW_ERROR;
- return;
- }
-
/* Process from source to limit, or until error */
- while (!*err && args->sourceLimit > args->source && args->targetLimit > args->target)
+ while (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit > args->target)
{
saveSource = args->source; /* beginning of current code point */
- if (args->converter->invalidCharLength) /* reassemble char from previous call */
+ if (args->converter->toULength) /* reassemble char from previous call */
{
- char LMBCS [ULMBCS_CHARSIZE_MAX];
- const char *pLMBCS = LMBCS, *saveSourceLimit;
- size_t size_old = args->converter->invalidCharLength;
+ const char *saveSourceLimit;
+ size_t size_old = args->converter->toULength;
- /* limit from source is either reminder of temp buffer, or user limit on source */
+ /* limit from source is either remainder of temp buffer, or user limit on source */
size_t size_new_maybe_1 = sizeof(LMBCS) - size_old;
size_t size_new_maybe_2 = args->sourceLimit - args->source;
size_t size_new = (size_new_maybe_1 < size_new_maybe_2) ? size_new_maybe_1 : size_new_maybe_2;
- uprv_memcpy(LMBCS, args->converter->invalidCharBuffer, size_old);
+ uprv_memcpy(LMBCS, args->converter->toUBytes, size_old);
uprv_memcpy(LMBCS + size_old, args->source, size_new);
saveSourceLimit = args->sourceLimit;
- args->source = pLMBCS;
- args->sourceLimit = pLMBCS+size_old+size_new;
- uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err, FALSE);
- pLMBCS = args->source;
- args->source =saveSource;
+ args->source = errSource = LMBCS;
+ args->sourceLimit = LMBCS+size_old+size_new;
+ savebytes = (int8_t)(size_old+size_new);
+ uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err);
+ args->source = saveSource + ((args->source - LMBCS) - size_old);
args->sourceLimit = saveSourceLimit;
- args->source += (pLMBCS - LMBCS - size_old);
- if (*err == U_TRUNCATED_CHAR_FOUND && !args->flush)
+ if (*err == U_TRUNCATED_CHAR_FOUND)
{
/* evil special case: source buffers so small a char spans more than 2 buffers */
- int8_t savebytes = (int8_t)(size_old+size_new);
- args->converter->invalidCharLength = savebytes;
- uprv_memcpy(args->converter->invalidCharBuffer, LMBCS, savebytes);
+ args->converter->toULength = savebytes;
+ uprv_memcpy(args->converter->toUBytes, LMBCS, savebytes);
args->source = args->sourceLimit;
*err = U_ZERO_ERROR;
return;
@@ -1247,12 +1204,14 @@ _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args,
else
{
/* clear the partial-char marker */
- args->converter->invalidCharLength = 0;
+ args->converter->toULength = 0;
}
}
else
{
- uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err, FALSE);
+ errSource = saveSource;
+ uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err);
+ savebytes = (int8_t)(args->source - saveSource);
}
if (U_SUCCESS(*err))
{
@@ -1273,53 +1232,22 @@ _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs* args,
*err = U_ILLEGAL_CHAR_FOUND;
}
}
- else if (*err == ULMBCS_MULTI)
- {
- UChar * pUChar = args->converter->UCharErrorBuffer;
- int8_t BufferLength = args->converter->UCharErrorBufferLength;
-
- *err = U_ZERO_ERROR;
- do
- { /* error functor wants to write multiple UniChars */
- *(args->target)++ = uniChar;
- if(args->offsets)
- {
- *(args->offsets)++ = saveSource - pStartLMBCS;
- }
- uniChar = *pUChar++;
- }
- while(BufferLength-- && args->targetLimit > args->target);
-
- if (++BufferLength > 0)
- { /* fix up remaining UChars that can't fit in caller's buffer */
- uprv_memmove( args->converter->UCharErrorBuffer,
- args->converter->UCharErrorBuffer + args->converter->UCharErrorBufferLength - BufferLength,
- sizeof(UChar) * BufferLength);
- }
- args->converter->UCharErrorBufferLength = BufferLength;
- }
- else if (*err == ULMBCS_SKIP)
- {
- *err = U_ZERO_ERROR; /* and just go around again..*/
- }
}
/* if target ran out before source, return U_BUFFER_OVERFLOW_ERROR */
if (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit <= args->target)
{
*err = U_BUFFER_OVERFLOW_ERROR;
}
-
- /* If character incomplete, store away partial char if more to come */
- if (*err == U_TRUNCATED_CHAR_FOUND)
+ else if (U_FAILURE(*err))
{
- args->source = args->sourceLimit;
- if (!args->flush )
- {
- int8_t savebytes = (int8_t)(args->sourceLimit - saveSource);
- args->converter->invalidCharLength = (int8_t)savebytes;
- uprv_memcpy(args->converter->invalidCharBuffer, saveSource, savebytes);
- *err = U_ZERO_ERROR;
- }
+ /* If character incomplete or unmappable/illegal, store it in toUBytes[] */
+ args->converter->toULength = savebytes;
+ if (savebytes > 0) {
+ uprv_memcpy(args->converter->toUBytes, errSource, savebytes);
+ }
+ if (*err == U_TRUNCATED_CHAR_FOUND) {
+ *err = U_ZERO_ERROR;
+ }
}
}
diff --git a/icuSources/common/ucnv_set.c b/icuSources/common/ucnv_set.c
new file mode 100644
index 00000000..c192ea4f
--- /dev/null
+++ b/icuSources/common/ucnv_set.c
@@ -0,0 +1,66 @@
+/*
+*******************************************************************************
+*
+* Copyright (C) 2003-2004, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: ucnv_set.c
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2004sep07
+* created by: Markus W. Scherer
+*
+* Conversion API functions using USet (ucnv_getUnicodeSet())
+* moved here from ucnv.c for removing the dependency of other ucnv_
+* implementation functions on the USet implementation.
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/uset.h"
+#include "unicode/ucnv.h"
+#include "ucnv_bld.h"
+#include "uset_imp.h"
+
+#if !UCONFIG_NO_CONVERSION
+
+U_CAPI void U_EXPORT2
+ucnv_getUnicodeSet(const UConverter *cnv,
+ USet *setFillIn,
+ UConverterUnicodeSet whichSet,
+ UErrorCode *pErrorCode) {
+ /* argument checking */
+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+ return;
+ }
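+ if(cnv==NULL || setFillIn==NULL || whichSet<UCNV_ROUNDTRIP_SET || UCNV_SET_COUNT<=whichSet) {
+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
+
+ /* does this converter support this function? */
+ if(cnv->sharedData->impl->getUnicodeSet==NULL) {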
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return;
+ }
+
+ {
+ USetAdder sa={
+ NULL,
+ uset_add,
+ uset_addRange,
+ uset_addString
+ };
+ sa.set=setFillIn;
+
+ /* empty the set */
+ uset_clear(setFillIn);
+
+ /* call the converter to add the code points it supports */
+ cnv->sharedData->impl->getUnicodeSet(cnv, &sa, whichSet, pErrorCode);
+ }
+}
+
+#endif
diff --git a/icuSources/common/ucnv_u16.c b/icuSources/common/ucnv_u16.c
index ab64edf8..86fd1999 100644
--- a/icuSources/common/ucnv_u16.c
+++ b/icuSources/common/ucnv_u16.c
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 2002-2003, International Business Machines
+* Copyright (C) 2002-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_u16.c
@@ -15,411 +15,520 @@
*/
#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
#include "unicode/ucnv.h"
-#include "unicode/ucnv_err.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "cmemory.h"
-/* UTF-16 Platform Endian --------------------------------------------------- */
+/* UTF-16BE ----------------------------------------------------------------- */
+
+#if U_IS_BIG_ENDIAN
+# define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
+#else
+# define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
+#endif
static void
-_UTF16PEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
- UErrorCode *pErrorCode) {
- UConverter *cnv = pArgs->converter;
- const uint8_t *source = (const uint8_t *)pArgs->source;
- UChar *target = pArgs->target;
- int32_t *offsets = pArgs->offsets;
- int32_t targetCapacity = pArgs->targetLimit - pArgs->target;
- int32_t length = (const uint8_t *)pArgs->sourceLimit - source;
- int32_t count;
- int32_t sourceIndex = 0;
-
- if(length <= 0 && cnv->toUnicodeStatus == 0) {
+_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
+ UErrorCode *pErrorCode) {
+ UConverter *cnv;
+ const UChar *source;
+ uint8_t *target;
+ int32_t *offsets;
+
+ int32_t targetCapacity, length, count, sourceIndex;
+ UChar c, trail;
+ char overflow[4];
+
+ source=pArgs->source;
+ length=pArgs->sourceLimit-source;
+ if(length<=0) {
/* no input, nothing to do */
return;
}
- if(targetCapacity <= 0) {
+ targetCapacity=pArgs->targetLimit-pArgs->target;
+ if(targetCapacity<=0) {
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
return;
}
- /* complete a partial UChar from the last call */
- if(length != 0 && cnv->toUnicodeStatus != 0) {
- /*
- * copy the byte from the last call and the first one here into the target,
- * byte-wise to keep the platform endianness
- */
- uint8_t *p = (uint8_t *)target++;
- *p++ = (uint8_t)cnv->toUnicodeStatus;
- cnv->toUnicodeStatus = 0;
- *p = *source++;
+ cnv=pArgs->converter;
+ target=(uint8_t *)pArgs->target;
+ offsets=pArgs->offsets;
+ sourceIndex=0;
+
+ /* c!=0 indicates in several places outside the main loops that a surrogate was found */
+
+ if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
+ /* the last buffer ended with a lead surrogate, output the surrogate pair */
+ ++source;
--length;
- --targetCapacity;
- if(offsets != NULL) {
- *offsets++ = -1;
+ target[0]=(uint8_t)(c>>8);
+ target[1]=(uint8_t)c;
+ target[2]=(uint8_t)(trail>>8);
+ target[3]=(uint8_t)trail;
+ target+=4;
+ targetCapacity-=4;
+ if(offsets!=NULL) {
+ *offsets++=-1;
+ *offsets++=-1;
+ *offsets++=-1;
+ *offsets++=-1;
}
+ sourceIndex=1;
+ cnv->fromUChar32=c=0;
}
/* copy an even number of bytes for complete UChars */
- count = 2 * targetCapacity;
- if(count > length) {
- count = length & ~1;
- }
- if(count > 0) {
- uprv_memcpy(target, source, count);
- source += count;
- length -= count;
- count >>= 1;
- target += count;
- targetCapacity -= count;
- if(offsets != NULL) {
- while(count > 0) {
- *offsets++ = sourceIndex;
- sourceIndex += 2;
+ count=2*length;
+ if(count>targetCapacity) {
+ count=targetCapacity&~1;
+ }
+ /* count is even */
+ if(c==0) {
+ targetCapacity-=count;
+ count>>=1;
+ length-=count;
+
+ if(offsets==NULL) {
+ while(count>0) {
+ c=*source++;
+ if(U16_IS_SINGLE(c)) {
+ target[0]=(uint8_t)(c>>8);
+ target[1]=(uint8_t)c;
+ target+=2;
+ } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
+ ++source;
+ --count;
+ target[0]=(uint8_t)(c>>8);
+ target[1]=(uint8_t)c;
+ target[2]=(uint8_t)(trail>>8);
+ target[3]=(uint8_t)trail;
+ target+=4;
+ } else {
+ break;
+ }
+ --count;
+ }
+ } else {
+ while(count>0) {
+ c=*source++;
+ if(U16_IS_SINGLE(c)) {
+ target[0]=(uint8_t)(c>>8);
+ target[1]=(uint8_t)c;
+ target+=2;
+ *offsets++=sourceIndex;
+ *offsets++=sourceIndex++;
+ } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
+ ++source;
+ --count;
+ target[0]=(uint8_t)(c>>8);
+ target[1]=(uint8_t)c;
+ target[2]=(uint8_t)(trail>>8);
+ target[3]=(uint8_t)trail;
+ target+=4;
+ *offsets++=sourceIndex;
+ *offsets++=sourceIndex;
+ *offsets++=sourceIndex;
+ *offsets++=sourceIndex;
+ sourceIndex+=2;
+ } else {
+ break;
+ }
--count;
}
}
- }
- /* check for a remaining source byte and store the status */
- if(length >= 2) {
- /* it must be targetCapacity==0 because otherwise the above would have copied more */
- *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
- } else if(length == 1) {
- if(pArgs->flush) {
- /* a UChar remains incomplete */
- *pErrorCode = U_TRUNCATED_CHAR_FOUND;
+ if(count==0) {
+ /* done with the loop for complete UChars */
+ if(length>0 && targetCapacity>0) {
+ /*
+ * there is more input and some target capacity -
+ * it must be targetCapacity==1 because otherwise
+ * the above would have copied more;
+ * prepare for overflow output
+ */
+ if(U16_IS_SINGLE(c=*source++)) {
+ overflow[0]=(char)(c>>8);
+ overflow[1]=(char)c;
+ length=2; /* 2 bytes to output */
+ c=0;
+ /* } else { keep c for surrogate handling, length will be set there */
+ }
+ } else {
+ length=0;
+ c=0;
+ }
} else {
- /* consume the last byte and store it, making sure that it will never set the status to 0 */
- cnv->toUnicodeStatus = *source++ | 0x100;
+ /* keep c for surrogate handling, length will be set there */
+ targetCapacity+=2*count;
}
- } else /* length==0 */ if(cnv->toUnicodeStatus!=0 && pArgs->flush) {
- /* a UChar remains incomplete */
- *pErrorCode = U_TRUNCATED_CHAR_FOUND;
- }
-
- /* write back the updated pointers */
- pArgs->source = (const char *)source;
- pArgs->target = target;
- pArgs->offsets = offsets;
-}
-
-static void
-_UTF16PEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
- UErrorCode *pErrorCode) {
- UConverter *cnv = pArgs->converter;
- const UChar *source = pArgs->source;
- uint8_t *target = (uint8_t *)pArgs->target;
- int32_t *offsets = pArgs->offsets;
- int32_t targetCapacity = pArgs->targetLimit - pArgs->target;
- int32_t length = pArgs->sourceLimit - source;
- int32_t count;
- int32_t sourceIndex = 0;
-
- if(length <= 0 && cnv->fromUnicodeStatus == 0) {
- /* no input, nothing to do */
- return;
- }
-
- if(targetCapacity <= 0) {
- *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
- return;
+ } else {
+ length=0; /* from here on, length counts the bytes in overflow[] */
}
-
- /* complete a partial UChar from the last call */
- if(cnv->fromUnicodeStatus != 0) {
- *target++ = (uint8_t)cnv->fromUnicodeStatus;
- cnv->fromUnicodeStatus = 0;
- --targetCapacity;
- if(offsets != NULL) {
- *offsets++ = -1;
+
+ if(c!=0) {
+ /*
+ * c is a surrogate, and
+ * - source or target too short
+ * - or the surrogate is unmatched
+ */
+ length=0;
+ if(U16_IS_SURROGATE_LEAD(c)) {
+ if(source<pArgs->sourceLimit) {
+ if(U16_IS_TRAIL(trail=*source)) {
+ /* output the surrogate pair, will overflow (see conditions comment above) */
+ ++source;
+ overflow[0]=(char)(c>>8);
+ overflow[1]=(char)c;
+ overflow[2]=(char)(trail>>8);
+ overflow[3]=(char)trail;
+ length=4; /* 4 bytes to output */
+ c=0;
+ } else {
+ /* unmatched lead surrogate */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ }
+ } else {
+ /* see if the trail surrogate is in the next buffer */
+ }
+ } else {
+ /* unmatched trail surrogate */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
}
+ cnv->fromUChar32=c;
}
- /* copy an even number of bytes for complete UChars */
- count = 2 * length;
- if(count > targetCapacity) {
- count = targetCapacity & ~1;
- }
- if(count>0) {
- uprv_memcpy(target, source, count);
- target += count;
- targetCapacity -= count;
- count >>= 1;
- source += count;
- length -= count;
- if(offsets != NULL) {
- while(count > 0) {
- *offsets++ = sourceIndex;
- *offsets++ = sourceIndex++;
- --count;
- }
- }
+ if(length>0) {
+ /* output length bytes with overflow (length>targetCapacity>0) */
+ ucnv_fromUWriteBytes(cnv,
+ overflow, length,
+ (char **)&target, pArgs->targetLimit,
+ &offsets, sourceIndex,
+ pErrorCode);
+ targetCapacity=pArgs->targetLimit-(char *)target;
}
- if(length > 0) {
- /* it must be targetCapacity<=1 because otherwise the above would have copied more */
- *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
- if(targetCapacity > 0) /* targetCapacity==1 */ {
- /* copy one byte and keep the other in the status */
- const uint8_t *p = (const uint8_t *)source++;
- *target++ = *p++;
- cnv->fromUnicodeStatus = *p | 0x100;
- if(offsets != NULL) {
- *offsets++ = sourceIndex;
- }
- }
+ if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
/* write back the updated pointers */
- pArgs->source = source;
- pArgs->target = (char *)target;
- pArgs->offsets = offsets;
+ pArgs->source=source;
+ pArgs->target=(char *)target;
+ pArgs->offsets=offsets;
}
-/* UTF-16 Opposite Endian --------------------------------------------------- */
-
-/*
- * For opposite-endian UTF-16, we keep a byte pointer to the UChars
- * and copy two bytes at a time and reverse them.
- */
-
static void
-_UTF16OEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
- UConverter *cnv = pArgs->converter;
- const uint8_t *source = (const uint8_t *)pArgs->source;
- UChar *target = pArgs->target;
- uint8_t *target8 = (uint8_t *)target; /* byte pointer to the target */
- int32_t *offsets = pArgs->offsets;
- int32_t targetCapacity = pArgs->targetLimit - pArgs->target;
- int32_t length = (const uint8_t *)pArgs->sourceLimit - source;
- int32_t count;
- int32_t sourceIndex = 0;
-
- if(length <= 0 && cnv->toUnicodeStatus == 0) {
+ UConverter *cnv;
+ const uint8_t *source;
+ UChar *target;
+ int32_t *offsets;
+
+ int32_t targetCapacity, length, count, sourceIndex;
+ UChar c, trail;
+
+ cnv=pArgs->converter;
+ source=(const uint8_t *)pArgs->source;
+ length=(const uint8_t *)pArgs->sourceLimit-source;
+ if(length<=0 && cnv->toUnicodeStatus==0) {
/* no input, nothing to do */
return;
}
- if(targetCapacity <= 0) {
- *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
+ targetCapacity=pArgs->targetLimit-pArgs->target;
+ if(targetCapacity<=0) {
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
return;
}
- /* complete a partial UChar from the last call */
- if(length != 0 && cnv->toUnicodeStatus != 0) {
+ target=pArgs->target;
+ offsets=pArgs->offsets;
+ sourceIndex=0;
+ c=0;
+
+ /* complete a partial UChar or pair from the last call */
+ if(cnv->toUnicodeStatus!=0) {
/*
- * copy the byte from the last call and the first one here into the target,
- * byte-wise, reversing the platform endianness
+ * special case: single byte from a previous buffer,
+ * where the byte turned out not to belong to a trail surrogate
+ * and the preceding, unmatched lead surrogate was put into toUBytes[]
+ * for error handling
*/
- *target8++ = *source++;
- *target8++ = (uint8_t)cnv->toUnicodeStatus;
- cnv->toUnicodeStatus = 0;
- ++target;
- --length;
- --targetCapacity;
- if(offsets != NULL) {
- *offsets++ = -1;
- }
+ cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
+ cnv->toULength=1;
+ cnv->toUnicodeStatus=0;
}
-
- /* copy an even number of bytes for complete UChars */
- count = 2 * targetCapacity;
- if(count > length) {
- count = length & ~1;
- }
- if(count>0) {
- length -= count;
- count >>= 1;
- targetCapacity -= count;
- if(offsets == NULL) {
- while(count > 0) {
- target8[1] = *source++;
- target8[0] = *source++;
- target8 += 2;
- --count;
- }
- } else {
- while(count>0) {
- target8[1] = *source++;
- target8[0] = *source++;
- target8 += 2;
- *offsets++ = sourceIndex;
- sourceIndex += 2;
- --count;
+ if((count=cnv->toULength)!=0) {
+ uint8_t *p=cnv->toUBytes;
+ do {
+ p[count++]=*source++;
+ ++sourceIndex;
+ --length;
+ if(count==2) {
+ c=((UChar)p[0]<<8)|p[1];
+ if(U16_IS_SINGLE(c)) {
+ /* output the BMP code point */
+ *target++=c;
+ if(offsets!=NULL) {
+ *offsets++=-1;
+ }
+ --targetCapacity;
+ count=0;
+ c=0;
+ break;
+ } else if(U16_IS_SURROGATE_LEAD(c)) {
+ /* continue collecting bytes for the trail surrogate */
+ c=0; /* avoid unnecessary surrogate handling below */
+ } else {
+ /* fall through to error handling for an unmatched trail surrogate */
+ break;
+ }
+ } else if(count==4) {
+ c=((UChar)p[0]<<8)|p[1];
+ trail=((UChar)p[2]<<8)|p[3];
+ if(U16_IS_TRAIL(trail)) {
+ /* output the surrogate pair */
+ *target++=c;
+ if(targetCapacity>=2) {
+ *target++=trail;
+ if(offsets!=NULL) {
+ *offsets++=-1;
+ *offsets++=-1;
+ }
+ targetCapacity-=2;
+ } else /* targetCapacity==1 */ {
+ targetCapacity=0;
+ cnv->UCharErrorBuffer[0]=trail;
+ cnv->UCharErrorBufferLength=1;
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ }
+ count=0;
+ c=0;
+ break;
+ } else {
+ /* unmatched lead surrogate, handle here for consistent toUBytes[] */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+
+ /* back out reading the code unit after it */
+ if(((const uint8_t *)pArgs->source-source)>=2) {
+ source-=2;
+ } else {
+ /*
+ * if the trail unit's first byte was in a previous buffer, then
+ * we need to put it into a special place because toUBytes[] will be
+ * used for the lead unit's bytes
+ */
+ cnv->toUnicodeStatus=0x100|p[2];
+ --source;
+ }
+ cnv->toULength=2;
+
+ /* write back the updated pointers */
+ pArgs->source=(const char *)source;
+ pArgs->target=target;
+ pArgs->offsets=offsets;
+ return;
+ }
}
- }
- target=(UChar *)target8;
+ } while(length>0);
+ cnv->toULength=(int8_t)count;
}
- /* check for a remaining source byte and store the status */
- if(length >= 2) {
- /* it must be targetCapacity==0 because otherwise the above would have copied more */
- *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
- } else if(length == 1) {
- if(pArgs->flush) {
- /* a UChar remains incomplete */
- *pErrorCode = U_TRUNCATED_CHAR_FOUND;
+ /* copy an even number of bytes for complete UChars */
+ count=2*targetCapacity;
+ if(count>length) {
+ count=length&~1;
+ }
+ if(c==0 && count>0) {
+ length-=count;
+ count>>=1;
+ targetCapacity-=count;
+ if(offsets==NULL) {
+ do {
+ c=((UChar)source[0]<<8)|source[1];
+ source+=2;
+ if(U16_IS_SINGLE(c)) {
+ *target++=c;
+ } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
+ U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
+ ) {
+ source+=2;
+ --count;
+ *target++=c;
+ *target++=trail;
+ } else {
+ break;
+ }
+ } while(--count>0);
} else {
- /* consume the last byte and store it, making sure that it will never set the status to 0 */
- cnv->toUnicodeStatus = *source++ | 0x100;
+ do {
+ c=((UChar)source[0]<<8)|source[1];
+ source+=2;
+ if(U16_IS_SINGLE(c)) {
+ *target++=c;
+ *offsets++=sourceIndex;
+ sourceIndex+=2;
+ } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
+ U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
+ ) {
+ source+=2;
+ --count;
+ *target++=c;
+ *target++=trail;
+ *offsets++=sourceIndex;
+ *offsets++=sourceIndex;
+ sourceIndex+=4;
+ } else {
+ break;
+ }
+ } while(--count>0);
}
- } else /* length==0 */ if(cnv->toUnicodeStatus!=0 && pArgs->flush) {
- /* a UChar remains incomplete */
- *pErrorCode = U_TRUNCATED_CHAR_FOUND;
- }
-
- /* write back the updated pointers */
- pArgs->source = (const char *)source;
- pArgs->target = target;
- pArgs->offsets = offsets;
-}
-
-static void
-_UTF16OEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
- UErrorCode *pErrorCode) {
- UConverter *cnv = pArgs->converter;
- const UChar *source = pArgs->source;
- const uint8_t *source8 = (const uint8_t *)source; /* byte pointer to the source */
- uint8_t *target = (uint8_t *)pArgs->target;
- int32_t *offsets = pArgs->offsets;
- int32_t targetCapacity = pArgs->targetLimit - pArgs->target;
- int32_t length = pArgs->sourceLimit - source;
- int32_t count;
- int32_t sourceIndex = 0;
-
- if(length <= 0 && cnv->fromUnicodeStatus == 0) {
- /* no input, nothing to do */
- return;
- }
- if(targetCapacity <= 0) {
- *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
- return;
- }
-
- /* complete a partial UChar from the last call */
- if(cnv->fromUnicodeStatus != 0) {
- *target++ = (uint8_t)cnv->fromUnicodeStatus;
- cnv->fromUnicodeStatus = 0;
- --targetCapacity;
- if(offsets != NULL) {
- *offsets++ = -1;
+ if(count==0) {
+ /* done with the loop for complete UChars */
+ c=0;
+ } else {
+ /* keep c for surrogate handling, trail will be set there */
+ length+=2*(count-1); /* one more byte pair was consumed than count decremented */
+ targetCapacity+=count;
}
}
- /* copy an even number of bytes for complete UChars */
- count = 2 * length;
- if(count > targetCapacity) {
- count = targetCapacity & ~1;
- }
- if(count > 0) {
- targetCapacity -= count;
- count >>= 1;
- length -= count;
- if(offsets == NULL) {
- while(count > 0) {
- target[1] = *source8++;
- target[0] = *source8++;
- target += 2;
- --count;
+ if(c!=0) {
+ /*
+ * c is a surrogate, and
+ * - source or target too short
+ * - or the surrogate is unmatched
+ */
+ cnv->toUBytes[0]=(uint8_t)(c>>8);
+ cnv->toUBytes[1]=(uint8_t)c;
+ cnv->toULength=2;
+
+ if(U16_IS_SURROGATE_LEAD(c)) {
+ if(length>=2) {
+ if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
+ /* output the surrogate pair, will overflow (see conditions comment above) */
+ source+=2;
+ length-=2;
+ *target++=c;
+ if(offsets!=NULL) {
+ *offsets++=sourceIndex;
+ }
+ cnv->UCharErrorBuffer[0]=trail;
+ cnv->UCharErrorBufferLength=1;
+ cnv->toULength=0;
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ } else {
+ /* unmatched lead surrogate */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ }
+ } else {
+ /* see if the trail surrogate is in the next buffer */
}
} else {
- while(count>0) {
- target[1] = *source8++;
- target[0] = *source8++;
- target += 2;
- *offsets++ = sourceIndex;
- *offsets++ = sourceIndex++;
- --count;
- }
+ /* unmatched trail surrogate */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
}
- source=(const UChar *)source8;
}
- if(length > 0) {
- /* it must be targetCapacity<=1 because otherwise the above would have copied more */
- *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
- if(targetCapacity > 0) /* targetCapacity==1 */ {
- /* copy one byte and keep the other in the status */
- cnv->fromUnicodeStatus = *source8++ | 0x100;
- *target++ = *source8;
- ++source;
- if(offsets != NULL) {
- *offsets++ = sourceIndex;
+ if(U_SUCCESS(*pErrorCode)) {
+ /* check for a remaining source byte */
+ if(length>0) {
+ if(targetCapacity==0) {
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ } else {
+ /* it must be length==1 because otherwise the above would have copied more */
+ cnv->toUBytes[cnv->toULength++]=*source++;
}
}
}
/* write back the updated pointers */
- pArgs->source = source;
- pArgs->target = (char *)target;
- pArgs->offsets = offsets;
+ pArgs->source=(const char *)source;
+ pArgs->target=target;
+ pArgs->offsets=offsets;
}
-/* UTF-16BE ----------------------------------------------------------------- */
+static UChar32
+_UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
+ const uint8_t *s, *sourceLimit;
+ UChar32 c;
-#if U_IS_BIG_ENDIAN
-# define _UTF16BEToUnicodeWithOffsets _UTF16PEToUnicodeWithOffsets
-# define _UTF16LEToUnicodeWithOffsets _UTF16OEToUnicodeWithOffsets
-# define _UTF16BEFromUnicodeWithOffsets _UTF16PEFromUnicodeWithOffsets
-# define _UTF16LEFromUnicodeWithOffsets _UTF16OEFromUnicodeWithOffsets
-#else
-# define _UTF16BEToUnicodeWithOffsets _UTF16OEToUnicodeWithOffsets
-# define _UTF16LEToUnicodeWithOffsets _UTF16PEToUnicodeWithOffsets
-# define _UTF16BEFromUnicodeWithOffsets _UTF16OEFromUnicodeWithOffsets
-# define _UTF16LEFromUnicodeWithOffsets _UTF16PEFromUnicodeWithOffsets
-#endif
+ s=(const uint8_t *)pArgs->source;
+ sourceLimit=(const uint8_t *)pArgs->sourceLimit;
-static UChar32 T_UConverter_getNextUChar_UTF16_BE(UConverterToUnicodeArgs* args,
- UErrorCode* err)
-{
- UChar32 myUChar;
- uint16_t first;
- /*Checks boundaries and set appropriate error codes*/
- if (args->source+2 > args->sourceLimit)
- {
- if (args->source >= args->sourceLimit)
- {
- /*Either caller has reached the end of the byte stream*/
- *err = U_INDEX_OUTOFBOUNDS_ERROR;
- }
- else
- {
- /* a character was cut in half*/
- *err = U_TRUNCATED_CHAR_FOUND;
- }
+ if(s>=sourceLimit) {
+ /* no input */
+ *err=U_INDEX_OUTOFBOUNDS_ERROR;
return 0xffff;
}
- /*Gets the corresponding codepoint*/
- first = (uint16_t)(((uint16_t)(*(args->source)) << 8) |((uint8_t)*((args->source)+1)));
- myUChar = first;
- args->source += 2;
-
- if(UTF_IS_FIRST_SURROGATE(first)) {
- uint16_t second;
+ if(s+2>sourceLimit) {
+ /* only one byte: truncated UChar */
+ pArgs->converter->toUBytes[0]=*s++;
+ pArgs->converter->toULength=1;
+ pArgs->source=(const char *)s;
+ *err = U_TRUNCATED_CHAR_FOUND;
+ return 0xffff;
+ }
- if (args->source+2 > args->sourceLimit) {
- *err = U_TRUNCATED_CHAR_FOUND;
- return 0xffff;
+ /* get one UChar */
+ c=((UChar32)*s<<8)|s[1];
+ s+=2;
+
+ /* check for a surrogate pair */
+ if(U_IS_SURROGATE(c)) {
+ if(U16_IS_SURROGATE_LEAD(c)) {
+ if(s+2<=sourceLimit) {
+ UChar trail;
+
+ /* get a second UChar and see if it is a trail surrogate */
+ trail=((UChar)*s<<8)|s[1];
+ if(U16_IS_TRAIL(trail)) {
+ c=U16_GET_SUPPLEMENTARY(c, trail);
+ s+=2;
+ } else {
+ /* unmatched lead surrogate */
+ c=-2;
+ }
+ } else {
+ /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
+ uint8_t *bytes=pArgs->converter->toUBytes;
+ s-=2;
+ pArgs->converter->toULength=(int8_t)(sourceLimit-s);
+ do {
+ *bytes++=*s++;
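+ } while(s<sourceLimit);
+
+ c=0xffff;
+ *err=U_TRUNCATED_CHAR_FOUND;
+ }
+ } else {
+ /* unmatched trail surrogate */
+ c=-2;
+ }
- }
- /* get the second surrogate and assemble the code point */
- second = (uint16_t)(((uint16_t)(*(args->source)) << 8) |((uint8_t)*(args->source+1)));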
+ if(c<0) {
+ /* write the unmatched surrogate */
+ uint8_t *bytes=pArgs->converter->toUBytes;
+ pArgs->converter->toULength=2;
+ *bytes=*(s-2);
+ bytes[1]=*(s-1);
- /* ignore unmatched surrogates and just deliver the first one in such a case */
- if(UTF_IS_SECOND_SURROGATE(second)) {
- /* matched pair, get pair value */
- myUChar = UTF16_GET_PAIR_VALUE(first, second);
- args->source += 2;
+ c=0xffff;
+ *err=U_ILLEGAL_CHAR_FOUND;
}
}
- return myUChar;
+ pArgs->source=(const char *)s;
+ return c;
}
static const UConverterImpl _UTF16BEImpl={
@@ -436,7 +545,7 @@ static const UConverterImpl _UTF16BEImpl={
_UTF16BEToUnicodeWithOffsets,
_UTF16BEFromUnicodeWithOffsets,
_UTF16BEFromUnicodeWithOffsets,
- T_UConverter_getNextUChar_UTF16_BE,
+ _UTF16BEGetNextUChar,
NULL,
NULL,
@@ -445,7 +554,6 @@ static const UConverterImpl _UTF16BEImpl={
ucnv_getCompleteUnicodeSet
};
-/* The 1200 CCSID refers to any version of Unicode with any endianess of UTF-16 */
static const UConverterStaticData _UTF16BEStaticData={
sizeof(UConverterStaticData),
"UTF-16BE",
@@ -465,57 +573,504 @@ const UConverterSharedData _UTF16BEData={
/* UTF-16LE ----------------------------------------------------------------- */
-static UChar32 T_UConverter_getNextUChar_UTF16_LE(UConverterToUnicodeArgs* args,
- UErrorCode* err)
-{
- UChar32 myUChar;
- uint16_t first;
- /*Checks boundaries and set appropriate error codes*/
- if (args->source+2 > args->sourceLimit)
- {
- if (args->source >= args->sourceLimit)
- {
- /*Either caller has reached the end of the byte stream*/
- *err = U_INDEX_OUTOFBOUNDS_ERROR;
+static void
+_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
+ UErrorCode *pErrorCode) {
+ UConverter *cnv;
+ const UChar *source;
+ uint8_t *target;
+ int32_t *offsets;
+
+ int32_t targetCapacity, length, count, sourceIndex;
+ UChar c, trail;
+ char overflow[4];
+
+ source=pArgs->source;
+ length=pArgs->sourceLimit-source;
+ if(length<=0) {
+ /* no input, nothing to do */
+ return;
+ }
+
+ targetCapacity=pArgs->targetLimit-pArgs->target;
+ if(targetCapacity<=0) {
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ return;
+ }
+
+ cnv=pArgs->converter;
+ target=(uint8_t *)pArgs->target;
+ offsets=pArgs->offsets;
+ sourceIndex=0;
+
+ /* c!=0 indicates in several places outside the main loops that a surrogate was found */
+
+ if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
+ /* the last buffer ended with a lead surrogate, output the surrogate pair */
+ ++source;
+ --length;
+ target[0]=(uint8_t)c;
+ target[1]=(uint8_t)(c>>8);
+ target[2]=(uint8_t)trail;
+ target[3]=(uint8_t)(trail>>8);
+ target+=4;
+ targetCapacity-=4;
+ if(offsets!=NULL) {
+ *offsets++=-1;
+ *offsets++=-1;
+ *offsets++=-1;
+ *offsets++=-1;
}
- else
- {
- /* a character was cut in half*/
- *err = U_TRUNCATED_CHAR_FOUND;
+ sourceIndex=1;
+ cnv->fromUChar32=c=0;
+ }
+
+ /* copy an even number of bytes for complete UChars */
+ count=2*length;
+ if(count>targetCapacity) {
+ count=targetCapacity&~1;
+ }
+ /* count is even */
+ if(c==0) {
+ targetCapacity-=count;
+ count>>=1;
+ length-=count;
+
+ if(offsets==NULL) {
+ while(count>0) {
+ c=*source++;
+ if(U16_IS_SINGLE(c)) {
+ target[0]=(uint8_t)c;
+ target[1]=(uint8_t)(c>>8);
+ target+=2;
+ } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
+ ++source;
+ --count;
+ target[0]=(uint8_t)c;
+ target[1]=(uint8_t)(c>>8);
+ target[2]=(uint8_t)trail;
+ target[3]=(uint8_t)(trail>>8);
+ target+=4;
+ } else {
+ break;
+ }
+ --count;
+ }
+ } else {
+ while(count>0) {
+ c=*source++;
+ if(U16_IS_SINGLE(c)) {
+ target[0]=(uint8_t)c;
+ target[1]=(uint8_t)(c>>8);
+ target+=2;
+ *offsets++=sourceIndex;
+ *offsets++=sourceIndex++;
+ } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
+ ++source;
+ --count;
+ target[0]=(uint8_t)c;
+ target[1]=(uint8_t)(c>>8);
+ target[2]=(uint8_t)trail;
+ target[3]=(uint8_t)(trail>>8);
+ target+=4;
+ *offsets++=sourceIndex;
+ *offsets++=sourceIndex;
+ *offsets++=sourceIndex;
+ *offsets++=sourceIndex;
+ sourceIndex+=2;
+ } else {
+ break;
+ }
+ --count;
+ }
}
- return 0xffff;
+ if(count==0) {
+ /* done with the loop for complete UChars */
+ if(length>0 && targetCapacity>0) {
+ /*
+ * there is more input and some target capacity -
+ * it must be targetCapacity==1 because otherwise
+ * the above would have copied more;
+ * prepare for overflow output
+ */
+ if(U16_IS_SINGLE(c=*source++)) {
+ overflow[0]=(char)c;
+ overflow[1]=(char)(c>>8);
+ length=2; /* 2 bytes to output */
+ c=0;
+ /* } else { keep c for surrogate handling, length will be set there */
+ }
+ } else {
+ length=0;
+ c=0;
+ }
+ } else {
+ /* keep c for surrogate handling, length will be set there */
+ targetCapacity+=2*count;
+ }
+ } else {
+ length=0; /* from here on, length counts the bytes in overflow[] */
+ }
+
+ if(c!=0) {
+ /*
+ * c is a surrogate, and
+ * - source or target too short
+ * - or the surrogate is unmatched
+ */
+ length=0;
+ if(U16_IS_SURROGATE_LEAD(c)) {
+ if(source<pArgs->sourceLimit) {
+ if(U16_IS_TRAIL(trail=*source)) {
+ /* output the surrogate pair, will overflow (see conditions comment above) */
+ ++source;
+ overflow[0]=(char)c;
+ overflow[1]=(char)(c>>8);
+ overflow[2]=(char)trail;
+ overflow[3]=(char)(trail>>8);
+ length=4; /* 4 bytes to output */
+ c=0;
+ } else {
+ /* unmatched lead surrogate */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ }
+ } else {
+ /* see if the trail surrogate is in the next buffer */
+ }
+ } else {
+ /* unmatched trail surrogate */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ }
+ cnv->fromUChar32=c;
}
- /*Gets the corresponding codepoint*/
- first = (uint16_t)(((uint16_t)*((args->source)+1) << 8) | ((uint8_t)(*(args->source))));
- myUChar=first;
- /*updates the source*/
- args->source += 2;
+ if(length>0) {
+ /* output length bytes with overflow (length>targetCapacity>0) */
+ ucnv_fromUWriteBytes(cnv,
+ overflow, length,
+ (char **)&target, pArgs->targetLimit,
+ &offsets, sourceIndex,
+ pErrorCode);
+ targetCapacity=pArgs->targetLimit-(char *)target;
+ }
- if (UTF_IS_FIRST_SURROGATE(first))
- {
- uint16_t second;
+ if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ }
+
+ /* write back the updated pointers */
+ pArgs->source=source;
+ pArgs->target=(char *)target;
+ pArgs->offsets=offsets;
+}
- if (args->source+2 > args->sourceLimit)
- {
- *err = U_TRUNCATED_CHAR_FOUND;
- return 0xffff;
+static void
+_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+ UErrorCode *pErrorCode) {
+ UConverter *cnv;
+ const uint8_t *source;
+ UChar *target;
+ int32_t *offsets;
+
+ int32_t targetCapacity, length, count, sourceIndex;
+ UChar c, trail;
+
+ cnv=pArgs->converter;
+ source=(const uint8_t *)pArgs->source;
+ length=(const uint8_t *)pArgs->sourceLimit-source;
+ if(length<=0 && cnv->toUnicodeStatus==0) {
+ /* no input, nothing to do */
+ return;
+ }
+
+ targetCapacity=pArgs->targetLimit-pArgs->target;
+ if(targetCapacity<=0) {
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ return;
+ }
+
+ target=pArgs->target;
+ offsets=pArgs->offsets;
+ sourceIndex=0;
+ c=0;
+
+ /* complete a partial UChar or pair from the last call */
+ if(cnv->toUnicodeStatus!=0) {
+ /*
+ * special case: single byte from a previous buffer,
+ * where the byte turned out not to belong to a trail surrogate
+ * and the preceding, unmatched lead surrogate was put into toUBytes[]
+ * for error handling
+ */
+ cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
+ cnv->toULength=1;
+ cnv->toUnicodeStatus=0;
+ }
+ if((count=cnv->toULength)!=0) {
+ uint8_t *p=cnv->toUBytes;
+ do {
+ p[count++]=*source++;
+ ++sourceIndex;
+ --length;
+ if(count==2) {
+ c=((UChar)p[1]<<8)|p[0];
+ if(U16_IS_SINGLE(c)) {
+ /* output the BMP code point */
+ *target++=c;
+ if(offsets!=NULL) {
+ *offsets++=-1;
+ }
+ --targetCapacity;
+ count=0;
+ c=0;
+ break;
+ } else if(U16_IS_SURROGATE_LEAD(c)) {
+ /* continue collecting bytes for the trail surrogate */
+ c=0; /* avoid unnecessary surrogate handling below */
+ } else {
+ /* fall through to error handling for an unmatched trail surrogate */
+ break;
+ }
+ } else if(count==4) {
+ c=((UChar)p[1]<<8)|p[0];
+ trail=((UChar)p[3]<<8)|p[2];
+ if(U16_IS_TRAIL(trail)) {
+ /* output the surrogate pair */
+ *target++=c;
+ if(targetCapacity>=2) {
+ *target++=trail;
+ if(offsets!=NULL) {
+ *offsets++=-1;
+ *offsets++=-1;
+ }
+ targetCapacity-=2;
+ } else /* targetCapacity==1 */ {
+ targetCapacity=0;
+ cnv->UCharErrorBuffer[0]=trail;
+ cnv->UCharErrorBufferLength=1;
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ }
+ count=0;
+ c=0;
+ break;
+ } else {
+ /* unmatched lead surrogate, handle here for consistent toUBytes[] */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+
+ /* back out reading the code unit after it */
+ if(((const uint8_t *)pArgs->source-source)>=2) {
+ source-=2;
+ } else {
+ /*
+ * if the trail unit's first byte was in a previous buffer, then
+ * we need to put it into a special place because toUBytes[] will be
+ * used for the lead unit's bytes
+ */
+ cnv->toUnicodeStatus=0x100|p[2];
+ --source;
+ }
+ cnv->toULength=2;
+
+ /* write back the updated pointers */
+ pArgs->source=(const char *)source;
+ pArgs->target=target;
+ pArgs->offsets=offsets;
+ return;
+ }
+ }
+ } while(length>0);
+ cnv->toULength=(int8_t)count;
+ }
+
+ /* copy an even number of bytes for complete UChars */
+ count=2*targetCapacity;
+ if(count>length) {
+ count=length&~1;
+ }
+ if(c==0 && count>0) {
+ length-=count;
+ count>>=1;
+ targetCapacity-=count;
+ if(offsets==NULL) {
+ do {
+ c=((UChar)source[1]<<8)|source[0];
+ source+=2;
+ if(U16_IS_SINGLE(c)) {
+ *target++=c;
+ } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
+ U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
+ ) {
+ source+=2;
+ --count;
+ *target++=c;
+ *target++=trail;
+ } else {
+ break;
+ }
+ } while(--count>0);
+ } else {
+ do {
+ c=((UChar)source[1]<<8)|source[0];
+ source+=2;
+ if(U16_IS_SINGLE(c)) {
+ *target++=c;
+ *offsets++=sourceIndex;
+ sourceIndex+=2;
+ } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
+ U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
+ ) {
+ source+=2;
+ --count;
+ *target++=c;
+ *target++=trail;
+ *offsets++=sourceIndex;
+ *offsets++=sourceIndex;
+ sourceIndex+=4;
+ } else {
+ break;
+ }
+ } while(--count>0);
}
- /* get the second surrogate and assemble the code point */
- second = (uint16_t)(((uint16_t)*(args->source+1) << 8) |((uint8_t)(*(args->source))));
+ if(count==0) {
+ /* done with the loop for complete UChars */
+ c=0;
+ } else {
+ /* keep c for surrogate handling, trail will be set there */
+ length+=2*(count-1); /* one more byte pair was consumed than count decremented */
+ targetCapacity+=count;
+ }
+ }
- /* ignore unmatched surrogates and just deliver the first one in such a case */
- if(UTF_IS_SECOND_SURROGATE(second))
- {
- /* matched pair, get pair value */
- myUChar = UTF16_GET_PAIR_VALUE(first, second);
- args->source += 2;
+ if(c!=0) {
+ /*
+ * c is a surrogate, and
+ * - source or target too short
+ * - or the surrogate is unmatched
+ */
+ cnv->toUBytes[0]=(uint8_t)c;
+ cnv->toUBytes[1]=(uint8_t)(c>>8);
+ cnv->toULength=2;
+
+ if(U16_IS_SURROGATE_LEAD(c)) {
+ if(length>=2) {
+ if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
+ /* output the surrogate pair, will overflow (see conditions comment above) */
+ source+=2;
+ length-=2;
+ *target++=c;
+ if(offsets!=NULL) {
+ *offsets++=sourceIndex;
+ }
+ cnv->UCharErrorBuffer[0]=trail;
+ cnv->UCharErrorBufferLength=1;
+ cnv->toULength=0;
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ } else {
+ /* unmatched lead surrogate */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ }
+ } else {
+ /* see if the trail surrogate is in the next buffer */
+ }
+ } else {
+ /* unmatched trail surrogate */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
}
}
- return myUChar;
+ if(U_SUCCESS(*pErrorCode)) {
+ /* check for a remaining source byte */
+ if(length>0) {
+ if(targetCapacity==0) {
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+ } else {
+ /* it must be length==1 because otherwise the above would have copied more */
+ cnv->toUBytes[cnv->toULength++]=*source++;
+ }
+ }
+ }
+
+ /* write back the updated pointers */
+ pArgs->source=(const char *)source;
+ pArgs->target=target;
+ pArgs->offsets=offsets;
+}
+
+static UChar32
+_UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
+ const uint8_t *s, *sourceLimit;
+ UChar32 c;
+
+ s=(const uint8_t *)pArgs->source;
+ sourceLimit=(const uint8_t *)pArgs->sourceLimit;
+
+ if(s>=sourceLimit) {
+ /* no input */
+ *err=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0xffff;
+ }
+
+ if(s+2>sourceLimit) {
+ /* only one byte: truncated UChar */
+ pArgs->converter->toUBytes[0]=*s++;
+ pArgs->converter->toULength=1;
+ pArgs->source=(const char *)s;
+ *err = U_TRUNCATED_CHAR_FOUND;
+ return 0xffff;
+ }
+
+ /* get one UChar */
+ c=((UChar32)s[1]<<8)|*s;
+ s+=2;
+
+ /* check for a surrogate pair */
+ if(U_IS_SURROGATE(c)) {
+ if(U16_IS_SURROGATE_LEAD(c)) {
+ if(s+2<=sourceLimit) {
+ UChar trail;
+
+ /* get a second UChar and see if it is a trail surrogate */
+ trail=((UChar)s[1]<<8)|*s;
+ if(U16_IS_TRAIL(trail)) {
+ c=U16_GET_SUPPLEMENTARY(c, trail);
+ s+=2;
+ } else {
+ /* unmatched lead surrogate */
+ c=-2;
+ }
+ } else {
+ /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
+ uint8_t *bytes=pArgs->converter->toUBytes;
+ s-=2;
+ pArgs->converter->toULength=(int8_t)(sourceLimit-s);
+ do {
+ *bytes++=*s++;
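+ } while(s<sourceLimit);
+
+ c=0xffff;
+ *err=U_TRUNCATED_CHAR_FOUND;
+ }
+ } else {
+ /* unmatched trail surrogate */
+ c=-2;
+ }
+
+ if(c<0) {
+ /* write the unmatched surrogate */
+ uint8_t *bytes=pArgs->converter->toUBytes;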
+ pArgs->converter->toULength=2;
+ *bytes=*(s-2);
+ bytes[1]=*(s-1);
+
+ c=0xffff;
+ *err=U_ILLEGAL_CHAR_FOUND;
+ }
+ }
+
+ pArgs->source=(const char *)s;
+ return c;
}
static const UConverterImpl _UTF16LEImpl={
@@ -532,7 +1087,7 @@ static const UConverterImpl _UTF16LEImpl={
_UTF16LEToUnicodeWithOffsets,
_UTF16LEFromUnicodeWithOffsets,
_UTF16LEFromUnicodeWithOffsets,
- T_UConverter_getNextUChar_UTF16_LE,
+ _UTF16LEGetNextUChar,
NULL,
NULL,
@@ -542,7 +1097,6 @@ static const UConverterImpl _UTF16LEImpl={
};
-/* The 1200 CCSID refers to any version of Unicode with any endianess of UTF-16 */
static const UConverterStaticData _UTF16LEStaticData={
sizeof(UConverterStaticData),
"UTF-16LE",
@@ -727,12 +1281,12 @@ _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
_UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
pArgs->source=source;
pArgs->sourceLimit=sourceLimit;
+ state=8;
break;
}
- cnv->mode=0; /* reset */
- } else {
- cnv->mode=state;
}
+
+ cnv->mode=state;
}
static UChar32
@@ -740,11 +1294,11 @@ _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
switch(pArgs->converter->mode) {
case 8:
- return T_UConverter_getNextUChar_UTF16_BE(pArgs, pErrorCode);
+ return _UTF16BEGetNextUChar(pArgs, pErrorCode);
case 9:
- return T_UConverter_getNextUChar_UTF16_LE(pArgs, pErrorCode);
+ return _UTF16LEGetNextUChar(pArgs, pErrorCode);
default:
- return ucnv_getNextUCharFromToUImpl(pArgs, _UTF16ToUnicodeWithOffsets, TRUE, pErrorCode);
+ return UCNV_GET_NEXT_UCHAR_USE_TO_U;
}
}
@@ -792,3 +1346,5 @@ const UConverterSharedData _UTF16Data = {
NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl,
0
};
+
+#endif
diff --git a/icuSources/common/ucnv_u32.c b/icuSources/common/ucnv_u32.c
index 82c15d78..1a37e96f 100644
--- a/icuSources/common/ucnv_u32.c
+++ b/icuSources/common/ucnv_u32.c
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 2002-2003, International Business Machines
+* Copyright (C) 2002-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_u32.c
@@ -15,82 +15,25 @@
*/
#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
#include "unicode/ucnv.h"
-#include "unicode/ucnv_err.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "cmemory.h"
#define MAXIMUM_UCS2 0x0000FFFF
#define MAXIMUM_UTF 0x0010FFFF
-#define MAXIMUM_UCS4 0x7FFFFFFF
#define HALF_SHIFT 10
#define HALF_BASE 0x0010000
#define HALF_MASK 0x3FF
#define SURROGATE_HIGH_START 0xD800
-#define SURROGATE_HIGH_END 0xDBFF
#define SURROGATE_LOW_START 0xDC00
-#define SURROGATE_LOW_END 0xDFFF
/* -SURROGATE_LOW_START + HALF_BASE */
#define SURROGATE_LOW_BASE 9216
-/**
- * Calls invalid char callback when an invalid character sequence is encountered.
- * It presumes that the converter has a callback to call.
- *
- * @returns true when callback fails
- */
-static UBool
-T_UConverter_toUnicode_InvalidChar_Callback(UConverterToUnicodeArgs * args,
- UConverterCallbackReason reason,
- UErrorCode *err)
-{
- UConverter *converter = args->converter;
-
- if (U_SUCCESS(*err))
- {
- if (reason == UCNV_ILLEGAL) {
- *err = U_ILLEGAL_CHAR_FOUND;
- } else {
- *err = U_INVALID_CHAR_FOUND;
- }
- }
-
- /* copy the toUBytes to the invalidCharBuffer */
- uprv_memcpy(converter->invalidCharBuffer,
- converter->toUBytes,
- converter->invalidCharLength);
-
- /* Call the ErrorFunction */
- args->converter->fromCharErrorBehaviour(converter->toUContext,
- args,
- converter->invalidCharBuffer,
- converter->invalidCharLength,
- reason,
- err);
-
- return (UBool)U_FAILURE(*err);
-}
-
-static UBool
-T_UConverter_toUnicode_InvalidChar_OffsetCallback(UConverterToUnicodeArgs * args,
- int32_t currentOffset,
- UConverterCallbackReason reason,
- UErrorCode *err)
-{
- int32_t *saveOffsets = args->offsets;
- UBool result;
-
- result = T_UConverter_toUnicode_InvalidChar_Callback(args, reason, err);
-
- while (saveOffsets < args->offsets)
- {
- *(saveOffsets++) = currentOffset;
- }
- return result;
-}
-
/* UTF-32BE ----------------------------------------------------------------- */
static void
@@ -105,8 +48,7 @@ T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
uint32_t ch, i;
/* UTF-8 returns here for only non-offset, this needs to change.*/
- if (args->converter->toUnicodeStatus && myTarget < targetLimit)
- {
+ if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
i = args->converter->toULength; /* restore # of bytes consumed */
ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
@@ -114,58 +56,39 @@ T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
goto morebytes;
}
- while (mySource < sourceLimit && myTarget < targetLimit)
- {
+ while (mySource < sourceLimit && myTarget < targetLimit) {
i = 0;
ch = 0;
morebytes:
- while (i < sizeof(uint32_t))
- {
- if (mySource < sourceLimit)
- {
+ while (i < sizeof(uint32_t)) {
+ if (mySource < sourceLimit) {
ch = (ch << 8) | (uint8_t)(*mySource);
toUBytes[i++] = (char) *(mySource++);
}
- else
- {
- if (args->flush)
- {
- if (U_SUCCESS(*err))
- {
- *err = U_TRUNCATED_CHAR_FOUND;
- args->converter->toUnicodeStatus = MAXIMUM_UCS4;
- }
- }
- else
- { /* stores a partially calculated target*/
- /* + 1 to make 0 a valid character */
- args->converter->toUnicodeStatus = ch + 1;
- args->converter->toULength = (int8_t) i;
- }
+ else {
+ /* stores a partially calculated target*/
+ /* + 1 to make 0 a valid character */
+ args->converter->toUnicodeStatus = ch + 1;
+ args->converter->toULength = (int8_t) i;
goto donefornow;
}
}
- if (ch <= MAXIMUM_UTF)
- {
+ if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
if (ch <= MAXIMUM_UCS2)
{
/* fits in 16 bits */
*(myTarget++) = (UChar) ch;
}
- else
- {
+ else {
/* write out the surrogates */
- ch -= HALF_BASE;
- *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
- ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
- if (myTarget < targetLimit)
- {
+ *(myTarget++) = U16_LEAD(ch);
+ ch = U16_TRAIL(ch);
+ if (myTarget < targetLimit) {
*(myTarget++) = (UChar)ch;
}
- else
- {
+ else {
/* Put in overflow buffer (not handled here) */
args->converter->UCharErrorBuffer[0] = (UChar) ch;
args->converter->UCharErrorBufferLength = 1;
@@ -174,25 +97,15 @@ morebytes:
}
}
}
- else
- {
- args->source = (const char *) mySource;
- args->target = myTarget;
- args->converter->invalidCharLength = (int8_t)i;
- if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err))
- {
- /* Stop if the error wasn't handled */
- break;
- }
- args->converter->invalidCharLength = 0;
- mySource = (unsigned char *) args->source;
- myTarget = args->target;
+ else {
+ args->converter->toULength = (int8_t)i;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
}
}
donefornow:
- if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
- {
+ if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
/* End of target buffer */
*err = U_BUFFER_OVERFLOW_ERROR;
}
@@ -214,8 +127,7 @@ T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
uint32_t ch, i;
int32_t offsetNum = 0;
- if (args->converter->toUnicodeStatus && myTarget < targetLimit)
- {
+ if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
i = args->converter->toULength; /* restore # of bytes consumed */
ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
@@ -223,61 +135,42 @@ T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
goto morebytes;
}
- while (mySource < sourceLimit && myTarget < targetLimit)
- {
+ while (mySource < sourceLimit && myTarget < targetLimit) {
i = 0;
ch = 0;
morebytes:
- while (i < sizeof(uint32_t))
- {
- if (mySource < sourceLimit)
- {
+ while (i < sizeof(uint32_t)) {
+ if (mySource < sourceLimit) {
ch = (ch << 8) | (uint8_t)(*mySource);
toUBytes[i++] = (char) *(mySource++);
}
- else
- {
- if (args->flush)
- {
- if (U_SUCCESS(*err))
- {
- *err = U_TRUNCATED_CHAR_FOUND;
- args->converter->toUnicodeStatus = MAXIMUM_UCS4;
- }
- }
- else
- { /* stores a partially calculated target*/
- /* + 1 to make 0 a valid character */
- args->converter->toUnicodeStatus = ch + 1;
- args->converter->toULength = (int8_t) i;
- }
+ else {
+ /* stores a partially calculated target*/
+ /* + 1 to make 0 a valid character */
+ args->converter->toUnicodeStatus = ch + 1;
+ args->converter->toULength = (int8_t) i;
goto donefornow;
}
}
- if (ch <= MAXIMUM_UTF)
- {
+ if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
- if (ch <= MAXIMUM_UCS2)
- {
+ if (ch <= MAXIMUM_UCS2) {
/* fits in 16 bits */
*(myTarget++) = (UChar) ch;
*(myOffsets++) = offsetNum;
}
- else
- {
+ else {
/* write out the surrogates */
- ch -= HALF_BASE;
- *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
+ *(myTarget++) = U16_LEAD(ch);
*myOffsets++ = offsetNum;
- ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
+ ch = U16_TRAIL(ch);
if (myTarget < targetLimit)
{
*(myTarget++) = (UChar)ch;
*(myOffsets++) = offsetNum;
}
- else
- {
+ else {
/* Put in overflow buffer (not handled here) */
args->converter->UCharErrorBuffer[0] = (UChar) ch;
args->converter->UCharErrorBufferLength = 1;
@@ -286,21 +179,10 @@ morebytes:
}
}
}
- else
- {
- args->source = (const char *) mySource;
- args->target = myTarget;
- args->converter->invalidCharLength = (int8_t)i;
- args->offsets = myOffsets;
- if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args, offsetNum, UCNV_ILLEGAL, err))
- {
- /* Stop if the error wasn't handled */
- break;
- }
- args->converter->invalidCharLength = 0;
- mySource = (unsigned char *) args->source;
- myTarget = args->target;
- myOffsets = args->offsets;
+ else {
+ args->converter->toULength = (int8_t)i;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
}
offsetNum += i;
}
@@ -331,33 +213,48 @@ T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
temp[0] = 0;
- if (args->converter->fromUnicodeStatus)
- {
- ch = args->converter->fromUnicodeStatus;
- args->converter->fromUnicodeStatus = 0;
+ if (args->converter->fromUChar32) {
+ ch = args->converter->fromUChar32;
+ args->converter->fromUChar32 = 0;
goto lowsurogate;
}
- while (mySource < sourceLimit && myTarget < targetLimit)
- {
+ while (mySource < sourceLimit && myTarget < targetLimit) {
ch = *(mySource++);
- if (SURROGATE_HIGH_START <= ch && ch < SURROGATE_LOW_START)
- {
+ if (UTF_IS_SURROGATE(ch)) {
+ if (U_IS_LEAD(ch)) {
lowsurogate:
- if (mySource < sourceLimit)
- {
- ch2 = *mySource;
- if (SURROGATE_LOW_START <= ch2 && ch2 <= SURROGATE_LOW_END)
- {
- ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
- mySource++;
+ if (mySource < sourceLimit) {
+ ch2 = *mySource;
+ if (U_IS_TRAIL(ch2)) {
+ ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
+ mySource++;
+ }
+ else {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ args->converter->fromUChar32 = ch;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
+ }
+ }
+ else {
+ /* ran out of source */
+ args->converter->fromUChar32 = ch;
+ if (args->flush) {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ *err = U_ILLEGAL_CHAR_FOUND;
+ }
+ break;
}
}
- else if (!args->flush)
- {
- /* ran out of source */
- args->converter->fromUnicodeStatus = ch;
+ else {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ args->converter->fromUChar32 = ch;
+ *err = U_ILLEGAL_CHAR_FOUND;
break;
}
}
@@ -367,22 +264,18 @@ lowsurogate:
temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
- for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
- {
- if (myTarget < targetLimit)
- {
+ for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
+ if (myTarget < targetLimit) {
*(myTarget++) = temp[indexToWrite];
}
- else
- {
+ else {
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
*err = U_BUFFER_OVERFLOW_ERROR;
}
}
}
- if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
- {
+ if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
*err = U_BUFFER_OVERFLOW_ERROR;
}
@@ -406,33 +299,48 @@ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
temp[0] = 0;
- if (args->converter->fromUnicodeStatus)
- {
- ch = args->converter->fromUnicodeStatus;
- args->converter->fromUnicodeStatus = 0;
+ if (args->converter->fromUChar32) {
+ ch = args->converter->fromUChar32;
+ args->converter->fromUChar32 = 0;
goto lowsurogate;
}
- while (mySource < sourceLimit && myTarget < targetLimit)
- {
+ while (mySource < sourceLimit && myTarget < targetLimit) {
ch = *(mySource++);
- if (SURROGATE_HIGH_START <= ch && ch < SURROGATE_LOW_START)
- {
+ if (UTF_IS_SURROGATE(ch)) {
+ if (U_IS_LEAD(ch)) {
lowsurogate:
- if (mySource < sourceLimit)
- {
- ch2 = *mySource;
- if (SURROGATE_LOW_START <= ch2 && ch2 <= SURROGATE_LOW_END)
- {
- ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
- mySource++;
+ if (mySource < sourceLimit) {
+ ch2 = *mySource;
+ if (U_IS_TRAIL(ch2)) {
+ ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
+ mySource++;
+ }
+ else {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ args->converter->fromUChar32 = ch;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
+ }
+ }
+ else {
+ /* ran out of source */
+ args->converter->fromUChar32 = ch;
+ if (args->flush) {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ *err = U_ILLEGAL_CHAR_FOUND;
+ }
+ break;
}
}
- else if (!args->flush)
- {
- /* ran out of source */
- args->converter->fromUnicodeStatus = ch;
+ else {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ args->converter->fromUChar32 = ch;
+ *err = U_ILLEGAL_CHAR_FOUND;
break;
}
}
@@ -442,15 +350,12 @@ lowsurogate:
temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
- for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
- {
- if (myTarget < targetLimit)
- {
+ for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
+ if (myTarget < targetLimit) {
*(myTarget++) = temp[indexToWrite];
*(myOffsets++) = offsetNum;
}
- else
- {
+ else {
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
*err = U_BUFFER_OVERFLOW_ERROR;
}
@@ -458,8 +363,7 @@ lowsurogate:
offsetNum++;
}
- if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
- {
+ if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
*err = U_BUFFER_OVERFLOW_ERROR;
}
@@ -472,65 +376,44 @@ static UChar32
T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
UErrorCode* err)
{
- UChar myUCharBuf[2];
- UChar *myUCharPtr;
- const unsigned char *mySource;
+ const uint8_t *mySource;
UChar32 myUChar;
int32_t length;
- while (args->source < args->sourceLimit)
+ mySource = (const uint8_t *)args->source;
+ if (mySource >= (const uint8_t *)args->sourceLimit)
{
- if (args->source + 4 > args->sourceLimit)
- {
- /* got a partial character */
- *err = U_TRUNCATED_CHAR_FOUND;
- return 0xffff;
- }
+ /* no input */
+ *err = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0xffff;
+ }
- /* Don't even try to do a direct cast because the value may be on an odd address. */
- mySource = (unsigned char *) args->source;
- myUChar = (mySource[0] << 24)
- | (mySource[1] << 16)
- | (mySource[2] << 8)
- | (mySource[3]);
+ length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
+ if (length < 4)
+ {
+ /* got a partial character */
+ uprv_memcpy(args->converter->toUBytes, mySource, length);
+ args->converter->toULength = (int8_t)length;
+ args->source = (const char *)(mySource + length);
+ *err = U_TRUNCATED_CHAR_FOUND;
+ return 0xffff;
+ }
- args->source = (const char *)(mySource + 4);
- if (myUChar <= MAXIMUM_UTF && myUChar >= 0) {
- return myUChar;
- }
+ /* Don't even try to do a direct cast because the value may be on an odd address. */
+ myUChar = ((UChar32)mySource[0] << 24)
+ | ((UChar32)mySource[1] << 16)
+ | ((UChar32)mySource[2] << 8)
+ | ((UChar32)mySource[3]);
- uprv_memcpy(args->converter->invalidCharBuffer, mySource, 4);
- args->converter->invalidCharLength = 4;
-
- myUCharPtr = myUCharBuf;
- *err = U_ILLEGAL_CHAR_FOUND;
- args->target = myUCharPtr;
- args->targetLimit = myUCharBuf + 2;
- args->converter->fromCharErrorBehaviour(args->converter->toUContext,
- args,
- (const char *)mySource,
- 4,
- UCNV_ILLEGAL,
- err);
-
- if(U_SUCCESS(*err)) {
- length = (uint16_t)(args->target - myUCharBuf);
- if(length > 0) {
- return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, length);
- }
- /* else (callback did not write anything) continue */
- } else if(*err == U_BUFFER_OVERFLOW_ERROR) {
- *err = U_ZERO_ERROR;
- return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, 2);
- } else {
- /* break on error */
- /* ### what if a callback set an error but _also_ generated output?! */
- return 0xffff;
- }
+ args->source = (const char *)(mySource + 4);
+ if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
+ return myUChar;
}
- /* no input or only skipping callbacks */
- *err = U_INDEX_OUTOFBOUNDS_ERROR;
+ uprv_memcpy(args->converter->toUBytes, mySource, 4);
+ args->converter->toULength = 4;
+
+ *err = U_ILLEGAL_CHAR_FOUND;
return 0xffff;
}
@@ -613,44 +496,28 @@ morebytes:
}
else
{
- if (args->flush)
- {
- if (U_SUCCESS(*err))
- {
- *err = U_TRUNCATED_CHAR_FOUND;
- args->converter->toUnicodeStatus = 0;
- }
- }
- else
- { /* stores a partially calculated target*/
- /* + 1 to make 0 a valid character */
- args->converter->toUnicodeStatus = ch + 1;
- args->converter->toULength = (int8_t) i;
- }
+ /* stores a partially calculated target*/
+ /* + 1 to make 0 a valid character */
+ args->converter->toUnicodeStatus = ch + 1;
+ args->converter->toULength = (int8_t) i;
goto donefornow;
}
}
- if (ch <= MAXIMUM_UTF)
- {
+ if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
- if (ch <= MAXIMUM_UCS2)
- {
+ if (ch <= MAXIMUM_UCS2) {
/* fits in 16 bits */
*(myTarget++) = (UChar) ch;
}
- else
- {
+ else {
/* write out the surrogates */
- ch -= HALF_BASE;
- *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
- ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
- if (myTarget < targetLimit)
- {
+ *(myTarget++) = U16_LEAD(ch);
+ ch = U16_TRAIL(ch);
+ if (myTarget < targetLimit) {
*(myTarget++) = (UChar)ch;
}
- else
- {
+ else {
/* Put in overflow buffer (not handled here) */
args->converter->UCharErrorBuffer[0] = (UChar) ch;
args->converter->UCharErrorBufferLength = 1;
@@ -659,19 +526,10 @@ morebytes:
}
}
}
- else
- {
- args->source = (const char *) mySource;
- args->target = myTarget;
- args->converter->invalidCharLength = (int8_t)i;
- if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err))
- {
- /* Stop if the error wasn't handled */
- break;
- }
- args->converter->invalidCharLength = 0;
- mySource = (unsigned char *) args->source;
- myTarget = args->target;
+ else {
+ args->converter->toULength = (int8_t)i;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
}
}
@@ -724,25 +582,15 @@ morebytes:
}
else
{
- if (args->flush)
- {
- if (U_SUCCESS(*err))
- {
- *err = U_TRUNCATED_CHAR_FOUND;
- args->converter->toUnicodeStatus = 0;
- }
- }
- else
- { /* stores a partially calculated target*/
- /* + 1 to make 0 a valid character */
- args->converter->toUnicodeStatus = ch + 1;
- args->converter->toULength = (int8_t) i;
- }
+ /* stores a partially calculated target*/
+ /* + 1 to make 0 a valid character */
+ args->converter->toUnicodeStatus = ch + 1;
+ args->converter->toULength = (int8_t) i;
goto donefornow;
}
}
- if (ch <= MAXIMUM_UTF)
+ if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
{
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
if (ch <= MAXIMUM_UCS2)
@@ -751,13 +599,11 @@ morebytes:
*(myTarget++) = (UChar) ch;
*(myOffsets++) = offsetNum;
}
- else
- {
+ else {
/* write out the surrogates */
- ch -= HALF_BASE;
- *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
+ *(myTarget++) = U16_LEAD(ch);
*(myOffsets++) = offsetNum;
- ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
+ ch = U16_TRAIL(ch);
if (myTarget < targetLimit)
{
*(myTarget++) = (UChar)ch;
@@ -775,19 +621,9 @@ morebytes:
}
else
{
- args->source = (const char *) mySource;
- args->target = myTarget;
- args->converter->invalidCharLength = (int8_t)i;
- args->offsets = myOffsets;
- if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args, offsetNum, UCNV_ILLEGAL, err))
- {
- /* Stop if the error wasn't handled */
- break;
- }
- args->converter->invalidCharLength = 0;
- mySource = (unsigned char *) args->source;
- myTarget = args->target;
- myOffsets = args->offsets;
+ args->converter->toULength = (int8_t)i;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
}
offsetNum += i;
}
@@ -818,10 +654,10 @@ T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
temp[3] = 0;
- if (args->converter->fromUnicodeStatus)
+ if (args->converter->fromUChar32)
{
- ch = args->converter->fromUnicodeStatus;
- args->converter->fromUnicodeStatus = 0;
+ ch = args->converter->fromUChar32;
+ args->converter->fromUChar32 = 0;
goto lowsurogate;
}
@@ -829,22 +665,41 @@ T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
{
ch = *(mySource++);
- if (SURROGATE_HIGH_START <= ch && ch < SURROGATE_LOW_START)
- {
-lowsurogate:
- if (mySource < sourceLimit)
+ if (UTF_IS_SURROGATE(ch)) {
+ if (U_IS_LEAD(ch))
{
- ch2 = *mySource;
- if (SURROGATE_LOW_START <= ch2 && ch2 <= SURROGATE_LOW_END)
+lowsurogate:
+ if (mySource < sourceLimit)
{
- ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
- mySource++;
+ ch2 = *mySource;
+ if (U_IS_TRAIL(ch2)) {
+ ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
+ mySource++;
+ }
+ else {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ args->converter->fromUChar32 = ch;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
+ }
+ }
+ else {
+ /* ran out of source */
+ args->converter->fromUChar32 = ch;
+ if (args->flush) {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ *err = U_ILLEGAL_CHAR_FOUND;
+ }
+ break;
}
}
- else if (!args->flush)
- {
- /* ran out of source */
- args->converter->fromUnicodeStatus = ch;
+ else {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ args->converter->fromUChar32 = ch;
+ *err = U_ILLEGAL_CHAR_FOUND;
break;
}
}
@@ -893,10 +748,10 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
temp[3] = 0;
- if (args->converter->fromUnicodeStatus)
+ if (args->converter->fromUChar32)
{
- ch = args->converter->fromUnicodeStatus;
- args->converter->fromUnicodeStatus = 0;
+ ch = args->converter->fromUChar32;
+ args->converter->fromUChar32 = 0;
goto lowsurogate;
}
@@ -904,22 +759,42 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
{
ch = *(mySource++);
- if (SURROGATE_HIGH_START <= ch && ch < SURROGATE_LOW_START)
- {
-lowsurogate:
- if (mySource < sourceLimit)
+ if (UTF_IS_SURROGATE(ch)) {
+ if (U_IS_LEAD(ch))
{
- ch2 = *mySource;
- if (SURROGATE_LOW_START <= ch2 && ch2 <= SURROGATE_LOW_END)
+lowsurogate:
+ if (mySource < sourceLimit)
{
- ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
- mySource++;
+ ch2 = *mySource;
+ if (U_IS_TRAIL(ch2))
+ {
+ ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
+ mySource++;
+ }
+ else {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ args->converter->fromUChar32 = ch;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
+ }
+ }
+ else {
+ /* ran out of source */
+ args->converter->fromUChar32 = ch;
+ if (args->flush) {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ *err = U_ILLEGAL_CHAR_FOUND;
+ }
+ break;
}
}
- else if (!args->flush)
- {
- /* ran out of source */
- args->converter->fromUnicodeStatus = ch;
+ else {
+ /* this is an unmatched trail code unit (2nd surrogate) */
+ /* callback(illegal) */
+ args->converter->fromUChar32 = ch;
+ *err = U_ILLEGAL_CHAR_FOUND;
break;
}
}
@@ -959,65 +834,44 @@ static UChar32
T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
UErrorCode* err)
{
- UChar myUCharBuf[2];
- UChar *myUCharPtr;
- const unsigned char *mySource;
+ const uint8_t *mySource;
UChar32 myUChar;
int32_t length;
- while (args->source < args->sourceLimit)
+ mySource = (const uint8_t *)args->source;
+ if (mySource >= (const uint8_t *)args->sourceLimit)
{
- if (args->source + 4 > args->sourceLimit)
- {
- /* got a partial character */
- *err = U_TRUNCATED_CHAR_FOUND;
- return 0xffff;
- }
+ /* no input */
+ *err = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0xffff;
+ }
- /* Don't even try to do a direct cast because the value may be on an odd address. */
- mySource = (unsigned char *) args->source;
- myUChar = (mySource[0])
- | (mySource[1] << 8)
- | (mySource[2] << 16)
- | (mySource[3] << 24);
+ length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
+ if (length < 4)
+ {
+ /* got a partial character */
+ uprv_memcpy(args->converter->toUBytes, mySource, length);
+ args->converter->toULength = (int8_t)length;
+ args->source = (const char *)(mySource + length);
+ *err = U_TRUNCATED_CHAR_FOUND;
+ return 0xffff;
+ }
- args->source = (const char *)(mySource + 4);
- if (myUChar <= MAXIMUM_UTF && myUChar >= 0) {
- return myUChar;
- }
+ /* Don't even try to do a direct cast because the value may be on an odd address. */
+ myUChar = ((UChar32)mySource[3] << 24)
+ | ((UChar32)mySource[2] << 16)
+ | ((UChar32)mySource[1] << 8)
+ | ((UChar32)mySource[0]);
- uprv_memcpy(args->converter->invalidCharBuffer, mySource, 4);
- args->converter->invalidCharLength = 4;
-
- myUCharPtr = myUCharBuf;
- *err = U_ILLEGAL_CHAR_FOUND;
- args->target = myUCharPtr;
- args->targetLimit = myUCharBuf + 2;
- args->converter->fromCharErrorBehaviour(args->converter->toUContext,
- args,
- (const char *)mySource,
- 4,
- UCNV_ILLEGAL,
- err);
-
- if(U_SUCCESS(*err)) {
- length = (uint16_t)(args->target - myUCharBuf);
- if(length > 0) {
- return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, length);
- }
- /* else (callback did not write anything) continue */
- } else if(*err == U_BUFFER_OVERFLOW_ERROR) {
- *err = U_ZERO_ERROR;
- return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, 2);
- } else {
- /* break on error */
- /* ### what if a callback set an error but _also_ generated output?! */
- return 0xffff;
- }
+ args->source = (const char *)(mySource + 4);
+ if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
+ return myUChar;
}
- /* no input or only skipping callbacks */
- *err = U_INDEX_OUTOFBOUNDS_ERROR;
+ uprv_memcpy(args->converter->toUBytes, mySource, 4);
+ args->converter->toULength = 4;
+
+ *err = U_ILLEGAL_CHAR_FOUND;
return 0xffff;
}
@@ -1253,12 +1107,12 @@ _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
pArgs->source=source;
pArgs->sourceLimit=sourceLimit;
+ state=8;
break;
}
- cnv->mode=0; /* reset */
- } else {
- cnv->mode=state;
}
+
+ cnv->mode=state;
}
static UChar32
@@ -1270,7 +1124,7 @@ _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
case 9:
return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
default:
- return ucnv_getNextUCharFromToUImpl(pArgs, _UTF32ToUnicodeWithOffsets, FALSE, pErrorCode);
+ return UCNV_GET_NEXT_UCHAR_USE_TO_U;
}
}
@@ -1323,3 +1177,5 @@ const UConverterSharedData _UTF32Data = {
NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
0
};
+
+#endif
diff --git a/icuSources/common/ucnv_u7.c b/icuSources/common/ucnv_u7.c
index 202edefd..f15da503 100644
--- a/icuSources/common/ucnv_u7.c
+++ b/icuSources/common/ucnv_u7.c
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 2002-2003, International Business Machines
+* Copyright (C) 2002-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_u7.c
@@ -15,14 +15,15 @@
*/
#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
#include "unicode/ucnv.h"
-#include "unicode/ucnv_err.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
/* UTF-7 -------------------------------------------------------------------- */
-/* ### TODO: in user guide, document version option (=1 for escaping set O characters) */
/*
* UTF-7 is a stateful encoding of Unicode.
* It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
@@ -247,7 +248,6 @@ _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
sourceIndex=byteIndex==0 ? 0 : -1;
nextSourceIndex=0;
-loop:
if(inDirectMode) {
directMode:
/*
@@ -270,8 +270,8 @@ directMode:
/* illegal */
bytes[0]=b;
byteIndex=1;
- nextSourceIndex=sourceIndex+1;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
} else if(b!=PLUS) {
/* write directly encoded character */
*target++=b;
@@ -312,7 +312,8 @@ unicodeMode:
if(b>=126) {
/* illegal - test other illegal US-ASCII values by base64Value==-3 */
inDirectMode=TRUE;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
} else if((base64Value=fromBase64[b])>=0) {
/* collect base64 bytes into UChars */
switch(base64Counter) {
@@ -377,7 +378,8 @@ unicodeMode:
/* absorb the minus and leave the Unicode Mode */
if(bits!=0) {
/* bits are illegally left over, a UChar is incomplete */
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
}
}
sourceIndex=nextSourceIndex;
@@ -392,7 +394,8 @@ unicodeMode:
bytes[0]=PLUS;
bytes[1]=b;
byteIndex=2;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
} else if(bits==0) {
/* un-read the character in case it is a plus sign */
--source;
@@ -400,12 +403,14 @@ unicodeMode:
goto directMode;
} else {
/* bits are illegally left over, a UChar is incomplete */
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
}
} else /* base64Value==-3 for illegal characters */ {
/* illegal */
inDirectMode=TRUE;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
}
} else {
/* target is full */
@@ -414,91 +419,26 @@ unicodeMode:
}
}
}
-endloop:
- if(pArgs->flush && source>=sourceLimit) {
- /* reset the state for the next conversion */
- if(!inDirectMode && bits!=0 && U_SUCCESS(*pErrorCode)) {
- /* a character byte sequence remains incomplete */
- *pErrorCode=U_TRUNCATED_CHAR_FOUND;
- }
- cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
- cnv->toULength=0;
- } else {
- /* set the converter state back into UConverter */
- cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
- cnv->toULength=byteIndex;
+ if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
+ /*
+ * if we are in Unicode mode, then the byteIndex might not be 0,
+ * but that is ok if bits==0
+ * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
+ * (not true for IMAP-mailbox-name where we must end in direct mode)
+ */
+ byteIndex=0;
}
-finish:
+ /* set the converter state back into UConverter */
+ cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
+ cnv->toULength=byteIndex;
+
/* write back the updated pointers */
pArgs->source=(const char *)source;
pArgs->target=target;
pArgs->offsets=offsets;
return;
-
-callback:
- /* call the callback function with all the preparations and post-processing */
- /* update the arguments structure */
- pArgs->source=(const char *)source;
- pArgs->target=target;
- pArgs->offsets=offsets;
-
- /* copy the current bytes to invalidCharBuffer */
- for(b=0; b<(uint8_t)byteIndex; ++b) {
- cnv->invalidCharBuffer[b]=(char)bytes[b];
- }
- cnv->invalidCharLength=byteIndex;
-
- /* set the converter state in UConverter to deal with the next character */
- cnv->toUnicodeStatus=(uint32_t)inDirectMode<<24;
- cnv->toULength=0;
-
- /* call the callback function */
- *pErrorCode=U_ILLEGAL_CHAR_FOUND;
- cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode);
-
- /* get the converter state from UConverter */
- {
- uint32_t status=cnv->toUnicodeStatus;
- inDirectMode=(UBool)((status>>24)&1);
- base64Counter=(int8_t)(status>>16);
- bits=(uint16_t)status;
- }
- byteIndex=cnv->toULength;
-
- /* update target and deal with offsets if necessary */
- offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
- target=pArgs->target;
-
- /* update the source pointer and index */
- sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
- source=(const uint8_t *)pArgs->source;
-
- /*
- * If the callback overflowed the target, then we need to
- * stop here with an overflow indication.
- */
- if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
- goto endloop;
- } else if(cnv->UCharErrorBufferLength>0) {
- /* target is full */
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
- goto endloop;
- } else if(U_FAILURE(*pErrorCode)) {
- /* break on error */
- cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
- cnv->toULength=0;
- goto finish;
- } else {
- goto loop;
- }
-}
-
-static UChar32
-_UTF7GetNextUChar(UConverterToUnicodeArgs *pArgs,
- UErrorCode *pErrorCode) {
- return ucnv_getNextUCharFromToUImpl(pArgs, pArgs->converter->sharedData->impl->toUnicode, TRUE, pErrorCode);
}
static void
@@ -788,7 +728,7 @@ static const UConverterImpl _UTF7Impl={
_UTF7ToUnicodeWithOffsets,
_UTF7FromUnicodeWithOffsets,
_UTF7FromUnicodeWithOffsets,
- _UTF7GetNextUChar,
+ NULL,
NULL,
_UTF7GetName,
@@ -967,7 +907,6 @@ _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
sourceIndex=byteIndex==0 ? 0 : -1;
nextSourceIndex=0;
-loop:
if(inDirectMode) {
directMode:
/*
@@ -989,8 +928,8 @@ directMode:
/* illegal */
bytes[0]=b;
byteIndex=1;
- nextSourceIndex=sourceIndex+1;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
} else if(b!=AMPERSAND) {
/* write directly encoded character */
*target++=b;
@@ -1032,7 +971,8 @@ unicodeMode:
if(b>0x7e) {
/* illegal - test other illegal US-ASCII values by base64Value==-3 */
inDirectMode=TRUE;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
} else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
/* collect base64 bytes into UChars */
switch(base64Counter) {
@@ -1053,7 +993,8 @@ unicodeMode:
if(isLegalIMAP(c)) {
/* illegal */
inDirectMode=TRUE;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ goto endloop;
}
*target++=c;
if(offsets!=NULL) {
@@ -1070,7 +1011,8 @@ unicodeMode:
if(isLegalIMAP(c)) {
/* illegal */
inDirectMode=TRUE;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ goto endloop;
}
*target++=c;
if(offsets!=NULL) {
@@ -1087,7 +1029,8 @@ unicodeMode:
if(isLegalIMAP(c)) {
/* illegal */
inDirectMode=TRUE;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ goto endloop;
}
*target++=c;
if(offsets!=NULL) {
@@ -1116,7 +1059,8 @@ unicodeMode:
if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
/* bits are illegally left over, a UChar is incomplete */
/* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
}
}
sourceIndex=nextSourceIndex;
@@ -1134,7 +1078,8 @@ unicodeMode:
/* base64Value==-3 for illegal characters */
/* illegal */
inDirectMode=TRUE;
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ break;
}
} else {
/* target is full */
@@ -1145,83 +1090,41 @@ unicodeMode:
}
endloop:
- if(pArgs->flush && source>=sourceLimit) {
- /* reset the state for the next conversion */
- if(!inDirectMode && U_SUCCESS(*pErrorCode)) {
- /* a character byte sequence remains incomplete - IMAP must end in ASCII/direct mode */
- *pErrorCode=U_TRUNCATED_CHAR_FOUND;
+ /*
+ * the end of the input stream and detection of truncated input
+ * are handled by the framework, but here we must check if we are in Unicode
+ * mode and byteIndex==0 because we must end in direct mode
+ *
+ * conditions:
+ * successful
+ * in Unicode mode and byteIndex==0
+ * end of input and no truncated input
+ */
+ if( U_SUCCESS(*pErrorCode) &&
+ !inDirectMode && byteIndex==0 &&
+ pArgs->flush && source>=sourceLimit
+ ) {
+ if(base64Counter==-1) {
+ /* & at the very end of the input */
+ /* make the ampersand the reported sequence */
+ bytes[0]=AMPERSAND;
+ byteIndex=1;
}
- cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
- cnv->toULength=0;
- } else {
- /* set the converter state back into UConverter */
- cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
- cnv->toULength=byteIndex;
+ /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
+
+ inDirectMode=TRUE; /* avoid looping */
+ *pErrorCode=U_TRUNCATED_CHAR_FOUND;
}
-finish:
+ /* set the converter state back into UConverter */
+ cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
+ cnv->toULength=byteIndex;
+
/* write back the updated pointers */
pArgs->source=(const char *)source;
pArgs->target=target;
pArgs->offsets=offsets;
return;
-
-callback:
- /* call the callback function with all the preparations and post-processing */
- /* update the arguments structure */
- pArgs->source=(const char *)source;
- pArgs->target=target;
- pArgs->offsets=offsets;
-
- /* copy the current bytes to invalidCharBuffer */
- for(b=0; b<(uint8_t)byteIndex; ++b) {
- cnv->invalidCharBuffer[b]=(char)bytes[b];
- }
- cnv->invalidCharLength=byteIndex;
-
- /* set the converter state in UConverter to deal with the next character */
- cnv->toUnicodeStatus=(uint32_t)inDirectMode<<24;
- cnv->toULength=0;
-
- /* call the callback function */
- *pErrorCode=U_ILLEGAL_CHAR_FOUND;
- cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode);
-
- /* get the converter state from UConverter */
- {
- uint32_t status=cnv->toUnicodeStatus;
- inDirectMode=(UBool)((status>>24)&1);
- base64Counter=(int8_t)(status>>16);
- bits=(uint16_t)status;
- }
- byteIndex=cnv->toULength;
-
- /* update target and deal with offsets if necessary */
- offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
- target=pArgs->target;
-
- /* update the source pointer and index */
- sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
- source=(const uint8_t *)pArgs->source;
-
- /*
- * If the callback overflowed the target, then we need to
- * stop here with an overflow indication.
- */
- if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
- goto endloop;
- } else if(cnv->UCharErrorBufferLength>0) {
- /* target is full */
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
- goto endloop;
- } else if(U_FAILURE(*pErrorCode)) {
- /* break on error */
- cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
- cnv->toULength=0;
- goto finish;
- } else {
- goto loop;
- }
}
static void
@@ -1525,7 +1428,7 @@ static const UConverterImpl _IMAPImpl={
_IMAPToUnicodeWithOffsets,
_IMAPFromUnicodeWithOffsets,
_IMAPFromUnicodeWithOffsets,
- _UTF7GetNextUChar,
+ NULL,
NULL,
NULL,
@@ -1537,7 +1440,7 @@ static const UConverterImpl _IMAPImpl={
static const UConverterStaticData _IMAPStaticData={
sizeof(UConverterStaticData),
"IMAP-mailbox-name",
- 0, /* TODO CCSID for UTF-7 */
+ 0, /* TODO CCSID for IMAP-mailbox-name */
UCNV_IBM, UCNV_IMAP_MAILBOX,
1, 4,
{ 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
@@ -1552,3 +1455,5 @@ const UConverterSharedData _IMAPData={
NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl,
0
};
+
+#endif
diff --git a/icuSources/common/ucnv_u8.c b/icuSources/common/ucnv_u8.c
index 411701aa..a8635783 100644
--- a/icuSources/common/ucnv_u8.c
+++ b/icuSources/common/ucnv_u8.c
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 2002-2003, International Business Machines
+* Copyright (C) 2002-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_u8.c
@@ -19,8 +19,10 @@
*/
#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
#include "unicode/ucnv.h"
-#include "unicode/ucnv_err.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "cmemory.h"
@@ -29,16 +31,10 @@
/* Keep these here to make finicky compilers happy */
-U_CFUNC void T_UConverter_toUnicode_UTF8(UConverterToUnicodeArgs *args,
- UErrorCode *err);
-U_CFUNC void T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
- UErrorCode *err);
-U_CFUNC void T_UConverter_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
+U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
UErrorCode *err);
-U_CFUNC void T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
+U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
UErrorCode *err);
-U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
- UErrorCode *err);
/* UTF-8 -------------------------------------------------------------------- */
@@ -88,64 +84,7 @@ static const int8_t bytesFromUTF8[256] = {
static const uint32_t
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
-/**
- * Calls invalid char callback when an invalid character sequence is encountered.
- * It presumes that the converter has a callback to call.
- *
- * @returns true when callback fails
- */
-static UBool
-T_UConverter_toUnicode_InvalidChar_Callback(UConverterToUnicodeArgs * args,
- UConverterCallbackReason reason,
- UErrorCode *err)
-{
- UConverter *converter = args->converter;
-
- if (U_SUCCESS(*err))
- {
- if (reason == UCNV_ILLEGAL) {
- *err = U_ILLEGAL_CHAR_FOUND;
- } else {
- *err = U_INVALID_CHAR_FOUND;
- }
- }
-
- /* copy the toUBytes to the invalidCharBuffer */
- uprv_memcpy(converter->invalidCharBuffer,
- converter->toUBytes,
- converter->toULength);
- converter->invalidCharLength = converter->toULength;
-
- /* Call the ErrorFunction */
- args->converter->fromCharErrorBehaviour(converter->toUContext,
- args,
- converter->invalidCharBuffer,
- converter->invalidCharLength,
- reason,
- err);
-
- return (UBool)U_FAILURE(*err);
-}
-
-static UBool
-T_UConverter_toUnicode_InvalidChar_OffsetCallback(UConverterToUnicodeArgs * args,
- int32_t currentOffset,
- UConverterCallbackReason reason,
- UErrorCode *err)
-{
- int32_t *saveOffsets = args->offsets;
- UBool result;
-
- result = T_UConverter_toUnicode_InvalidChar_Callback(args, reason, err);
-
- while (saveOffsets < args->offsets)
- {
- *(saveOffsets++) = currentOffset;
- }
- return result;
-}
-
-U_CFUNC void T_UConverter_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
+static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
UErrorCode * err)
{
const unsigned char *mySource = (unsigned char *) args->source;
@@ -158,7 +97,6 @@ U_CFUNC void T_UConverter_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
int32_t i, inBytes;
/* Restore size of current sequence */
-start:
if (args->converter->toUnicodeStatus && myTarget < targetLimit)
{
inBytes = args->converter->mode; /* restore # of bytes to consume */
@@ -200,19 +138,10 @@ morebytes:
}
else
{
- if (args->flush)
- {
- if (U_SUCCESS(*err))
- {
- *err = U_TRUNCATED_CHAR_FOUND;
- }
- }
- else
- { /* stores a partially calculated target*/
- args->converter->toUnicodeStatus = ch;
- args->converter->mode = inBytes;
- args->converter->toULength = (int8_t) i;
- }
+ /* stores a partially calculated target*/
+ args->converter->toUnicodeStatus = ch;
+ args->converter->mode = inBytes;
+ args->converter->toULength = (int8_t) i;
goto donefornow;
}
}
@@ -236,6 +165,7 @@ morebytes:
(isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
{
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
+ args->converter->toULength = 0;
if (ch <= MAXIMUM_UCS2)
{
/* fits in 16 bits */
@@ -263,22 +193,9 @@ morebytes:
}
else
{
- args->source = (const char *) mySource;
- args->target = myTarget;
-
args->converter->toULength = (int8_t)i;
- if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err))
- {
- /* Stop if the error wasn't handled */
- /* args and err should already be set properly */
- return;
- }
-
- mySource = (unsigned char *) args->source;
- myTarget = args->target;
-
- /* goto the start to handle state left behind by the callback */
- goto start;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
}
}
}
@@ -294,7 +211,7 @@ donefornow:
args->source = (const char *) mySource;
}
-U_CFUNC void T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
+static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
UErrorCode * err)
{
const unsigned char *mySource = (unsigned char *) args->source;
@@ -309,7 +226,6 @@ U_CFUNC void T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs
int32_t i, inBytes;
/* Restore size of current sequence */
-start:
if (args->converter->toUnicodeStatus && myTarget < targetLimit)
{
inBytes = args->converter->mode; /* restore # of bytes to consume */
@@ -350,20 +266,9 @@ morebytes:
}
else
{
- if (args->flush)
- {
- if (U_SUCCESS(*err))
- {
- *err = U_TRUNCATED_CHAR_FOUND;
- args->converter->toUnicodeStatus = 0;
- }
- }
- else
- {
- args->converter->toUnicodeStatus = ch;
- args->converter->mode = inBytes;
- args->converter->toULength = (int8_t)i;
- }
+ args->converter->toUnicodeStatus = ch;
+ args->converter->mode = inBytes;
+ args->converter->toULength = (int8_t)i;
goto donefornow;
}
}
@@ -387,6 +292,7 @@ morebytes:
(isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
{
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
+ args->converter->toULength = 0;
if (ch <= MAXIMUM_UCS2)
{
/* fits in 16 bits */
@@ -416,26 +322,9 @@ morebytes:
}
else
{
- args->source = (const char *) mySource;
- args->target = myTarget;
- args->offsets = myOffsets;
-
args->converter->toULength = (int8_t)i;
- if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args,
- offsetNum, UCNV_ILLEGAL, err))
- {
- /* Stop if the error wasn't handled */
- /* args and err should already be set properly */
- return;
- }
-
- offsetNum += i + ((unsigned char *) args->source - mySource);
- mySource = (unsigned char *) args->source;
- myTarget = args->target;
- myOffsets = args->offsets;
-
- /* goto the start to handle state left behind by the callback */
- goto start;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
}
}
}
@@ -451,7 +340,7 @@ donefornow:
args->offsets = myOffsets;
}
-U_CFUNC void T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
+U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
UErrorCode * err)
{
UConverter *cnv = args->converter;
@@ -460,14 +349,14 @@ U_CFUNC void T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
const UChar *sourceLimit = args->sourceLimit;
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
- uint32_t ch, ch2;
+ UChar32 ch, ch2;
int16_t indexToWrite;
char temp[4];
- if (cnv->fromUSurrogateLead && myTarget < targetLimit)
+ if (cnv->fromUChar32 && myTarget < targetLimit)
{
- ch = cnv->fromUSurrogateLead;
- cnv->fromUSurrogateLead = 0;
+ ch = cnv->fromUChar32;
+ cnv->fromUChar32 = 0;
goto lowsurrogate;
}
@@ -511,63 +400,21 @@ lowsurrogate:
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
- ch2 = ch;
+ cnv->fromUChar32 = ch;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
}
} else {
/* no more input */
- cnv->fromUSurrogateLead = (UChar)ch;
+ cnv->fromUChar32 = ch;
break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
- ch2 = ch;
- }
-
- if(ch2 != 0) {
- /* call the callback function with all the preparations and post-processing */
+ cnv->fromUChar32 = ch;
*err = U_ILLEGAL_CHAR_FOUND;
-
- /* update the arguments structure */
- args->source=mySource;
- args->target=(char *)myTarget;
-
- /* write the code point as code units */
- cnv->invalidUCharBuffer[0] = (UChar)ch2;
- cnv->invalidUCharLength = 1;
-
- /* call the callback function */
- cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err);
-
- /* get the converter state from UConverter */
- ch = cnv->fromUSurrogateLead;
- cnv->fromUSurrogateLead = 0;
-
- myTarget=(uint8_t *)args->target;
- mySource=args->source;
-
- /*
- * If the callback overflowed the target, then we need to
- * stop here with an overflow indication.
- */
- if(*err==U_BUFFER_OVERFLOW_ERROR) {
- break;
- } else if(U_FAILURE(*err)) {
- /* break on error */
- break;
- } else if(cnv->charErrorBufferLength>0) {
- /* target is full */
- *err=U_BUFFER_OVERFLOW_ERROR;
- break;
- /*
- * } else if(ch != 0) { ...
- * ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c)
- * does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 .
- * We would have to check myTargetflush && mySource >= sourceLimit && cnv->fromUSurrogateLead != 0 && U_SUCCESS(*err)) {
- /* a Unicode code point remains incomplete (only a first surrogate) */
- *err = U_TRUNCATED_CHAR_FOUND;
- cnv->fromUSurrogateLead = 0;
- }
args->target = (char *) myTarget;
args->source = mySource;
}
-U_CFUNC void T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
+U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
UErrorCode * err)
{
UConverter *cnv = args->converter;
@@ -624,15 +466,15 @@ U_CFUNC void T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeA
const UChar *sourceLimit = args->sourceLimit;
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
- uint32_t ch, ch2;
+ UChar32 ch, ch2;
int32_t offsetNum, nextSourceIndex;
int16_t indexToWrite;
char temp[4];
- if (cnv->fromUSurrogateLead && myTarget < targetLimit)
+ if (cnv->fromUChar32 && myTarget < targetLimit)
{
- ch = cnv->fromUSurrogateLead;
- cnv->fromUSurrogateLead = 0;
+ ch = cnv->fromUChar32;
+ cnv->fromUChar32 = 0;
offsetNum = -1;
nextSourceIndex = 0;
goto lowsurrogate;
@@ -686,69 +528,21 @@ lowsurrogate:
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
- ch2 = ch;
+ cnv->fromUChar32 = ch;
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
}
} else {
/* no more input */
- cnv->fromUSurrogateLead = (UChar)ch;
+ cnv->fromUChar32 = ch;
break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
- ch2 = ch;
- }
-
- if(ch2 != 0) {
- /* call the callback function with all the preparations and post-processing */
+ cnv->fromUChar32 = ch;
*err = U_ILLEGAL_CHAR_FOUND;
-
- /* update the arguments structure */
- args->source=mySource;
- args->target=(char *)myTarget;
- args->offsets=myOffsets;
-
- /* write the code point as code units */
- cnv->invalidUCharBuffer[0] = (UChar)ch2;
- cnv->invalidUCharLength = 1;
-
- /* call the callback function */
- cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err);
-
- /* get the converter state from UConverter */
- ch = cnv->fromUSurrogateLead;
- cnv->fromUSurrogateLead = 0;
-
- /* update target and deal with offsets if necessary */
- myOffsets=ucnv_updateCallbackOffsets(myOffsets, ((uint8_t *)args->target)-myTarget, offsetNum);
- myTarget=(uint8_t *)args->target;
-
- /* update the source pointer and index */
- offsetNum=nextSourceIndex+(args->source-mySource);
- mySource=args->source;
-
- /*
- * If the callback overflowed the target, then we need to
- * stop here with an overflow indication.
- */
- if(*err==U_BUFFER_OVERFLOW_ERROR) {
- break;
- } else if(U_FAILURE(*err)) {
- /* break on error */
- break;
- } else if(cnv->charErrorBufferLength>0) {
- /* target is full */
- *err=U_BUFFER_OVERFLOW_ERROR;
- break;
- /*
- * } else if(ch != 0) { ...
- * ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c)
- * does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 .
- * We would have to check myTargetflush && mySource >= sourceLimit && cnv->fromUSurrogateLead != 0 && U_SUCCESS(*err)) {
- /* a Unicode code point remains incomplete (only a first surrogate) */
- *err = U_TRUNCATED_CHAR_FOUND;
- cnv->fromUSurrogateLead = 0;
- }
args->target = (char *) myTarget;
args->source = mySource;
args->offsets = myOffsets;
}
-U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
+static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
UErrorCode *err) {
- UChar buffer[2];
- const char *sourceInitial;
+ UConverter *cnv;
+ const uint8_t *sourceInitial;
const uint8_t *source;
- UChar* myUCharPtr;
uint16_t extraBytesToWrite;
uint8_t myByte;
UChar32 ch;
- int8_t isLegalSequence;
- UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
+ int8_t i, isLegalSequence;
- while (args->source < args->sourceLimit)
+ /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
+
+ cnv = args->converter;
+ sourceInitial = source = (const uint8_t *)args->source;
+ if (source >= (const uint8_t *)args->sourceLimit)
{
- sourceInitial = args->source;
- myByte = (uint8_t)*(args->source++);
- if (myByte < 0x80)
- {
- return (UChar32)myByte;
- }
+ /* no input */
+ *err = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0xffff;
+ }
- extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
- if (extraBytesToWrite == 0) {
- isLegalSequence = FALSE;
- ch = 0;
- goto CALL_ERROR_FUNCTION;
- }
+ myByte = (uint8_t)*(source++);
+ if (myByte < 0x80)
+ {
+ args->source = (const char *)source;
+ return (UChar32)myByte;
+ }
- /*The byte sequence is longer than the buffer area passed*/
- source = (const uint8_t *)args->source;
- if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
- {
- *err = U_TRUNCATED_CHAR_FOUND;
- return 0xffff;
- }
- else
- {
- isLegalSequence = 1;
- ch = myByte << 6;
- switch(extraBytesToWrite)
- {
- /* note: code falls through cases! (sic)*/
- case 6:
- ch += (myByte = *source++);
- ch <<= 6;
- if (!UTF8_IS_TRAIL(myByte))
- {
- isLegalSequence = 0;
- break;
- }
- case 5:
- ch += (myByte = *source++);
- ch <<= 6;
- if (!UTF8_IS_TRAIL(myByte))
- {
- isLegalSequence = 0;
- break;
- }
- case 4:
- ch += (myByte = *source++);
- ch <<= 6;
- if (!UTF8_IS_TRAIL(myByte))
- {
- isLegalSequence = 0;
- break;
- }
- case 3:
- ch += (myByte = *source++);
- ch <<= 6;
- if (!UTF8_IS_TRAIL(myByte))
- {
- isLegalSequence = 0;
- break;
- }
- case 2:
- ch += (myByte = *source++);
- if (!UTF8_IS_TRAIL(myByte))
- {
- isLegalSequence = 0;
- }
- };
- }
- ch -= offsetsFromUTF8[extraBytesToWrite];
+ extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
+ if (extraBytesToWrite == 0) {
+ cnv->toUBytes[0] = myByte;
+ cnv->toULength = 1;
+ *err = U_ILLEGAL_CHAR_FOUND;
args->source = (const char *)source;
+ return 0xffff;
+ }
- /*
- * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
- * - use only trail bytes after a lead byte (checked above)
- * - use the right number of trail bytes for a given lead byte
- * - encode a code point <= U+10ffff
- * - use the fewest possible number of bytes for their code points
- * - use at most 4 bytes (for i>=5 it is 0x10ffff= utf8_minChar32[extraBytesToWrite]) {
- if(isCESU8) {
- if(extraBytesToWrite <= 3) {
- if( UTF_IS_FIRST_SURROGATE(ch) &&
- (const char *)(source + 3) <= args->sourceLimit &&
- source[0] == 0xed && (source[1] & 0xf0) == 0xb0 && (source[2] & 0xc0) == 0x80
- ) {
- /* ch is a lead surrogate followed by a trail surrogate */
- ch = (ch << 10) +
- ((source[1] & 0xf) << 6) + (source[2] & 0x3f) -
- ((0xd800 << 10) - 0x10000);
- args->source = (const char *)(source + 3);
- }
- return ch; /* return the code point */
- }
- /* illegal CESU-8 */
+ /*The byte sequence is longer than the buffer area passed*/
+ if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
+ {
+ /* check if all of the remaining bytes are trail bytes */
+ cnv->toUBytes[0] = myByte;
+ i = 1;
+ *err = U_TRUNCATED_CHAR_FOUND;
+ while(source < (const uint8_t *)args->sourceLimit) {
+ if(U8_IS_TRAIL(myByte = *source)) {
+ cnv->toUBytes[i++] = myByte;
+ ++source;
} else {
- if(!UTF_IS_SURROGATE(ch)) {
- return ch; /* return the code point */
- }
- /* illegal UTF-8 */
+ /* error even before we run out of input */
+ *err = U_ILLEGAL_CHAR_FOUND;
+ break;
}
}
+ cnv->toULength = i;
+ args->source = (const char *)source;
+ return 0xffff;
+ }
-CALL_ERROR_FUNCTION:
- extraBytesToWrite = (uint16_t)(args->source - sourceInitial);
- args->converter->invalidCharLength = (uint8_t)extraBytesToWrite;
- uprv_memcpy(args->converter->invalidCharBuffer, sourceInitial, extraBytesToWrite);
-
- myUCharPtr = buffer;
- *err = U_ILLEGAL_CHAR_FOUND;
- args->target = myUCharPtr;
- args->targetLimit = buffer + 2;
- args->converter->fromCharErrorBehaviour(args->converter->toUContext,
- args,
- sourceInitial,
- extraBytesToWrite,
- UCNV_ILLEGAL,
- err);
-
- if(U_SUCCESS(*err)) {
- extraBytesToWrite = (uint16_t)(args->target - buffer);
- if(extraBytesToWrite > 0) {
- return ucnv_getUChar32KeepOverflow(args->converter, buffer, extraBytesToWrite);
- }
- /* else (callback did not write anything) continue */
- } else if(*err == U_BUFFER_OVERFLOW_ERROR) {
- *err = U_ZERO_ERROR;
- return ucnv_getUChar32KeepOverflow(args->converter, buffer, 2);
- } else {
- /* break on error */
- /* ### what if a callback set an error but _also_ generated output?! */
- return 0xffff;
+ isLegalSequence = 1;
+ ch = myByte << 6;
+ switch(extraBytesToWrite)
+ {
+ /* note: code falls through cases! (sic)*/
+ case 6:
+ ch += (myByte = *source);
+ ch <<= 6;
+ if (!UTF8_IS_TRAIL(myByte))
+ {
+ isLegalSequence = 0;
+ break;
}
+ ++source;
+ case 5:
+ ch += (myByte = *source);
+ ch <<= 6;
+ if (!UTF8_IS_TRAIL(myByte))
+ {
+ isLegalSequence = 0;
+ break;
+ }
+ ++source;
+ case 4:
+ ch += (myByte = *source);
+ ch <<= 6;
+ if (!UTF8_IS_TRAIL(myByte))
+ {
+ isLegalSequence = 0;
+ break;
+ }
+ ++source;
+ case 3:
+ ch += (myByte = *source);
+ ch <<= 6;
+ if (!UTF8_IS_TRAIL(myByte))
+ {
+ isLegalSequence = 0;
+ break;
+ }
+ ++source;
+ case 2:
+ ch += (myByte = *source);
+ if (!UTF8_IS_TRAIL(myByte))
+ {
+ isLegalSequence = 0;
+ break;
+ }
+ ++source;
+ };
+ ch -= offsetsFromUTF8[extraBytesToWrite];
+ args->source = (const char *)source;
+
+ /*
+ * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
+ * - use only trail bytes after a lead byte (checked above)
+ * - use the right number of trail bytes for a given lead byte
+ * - encode a code point <= U+10ffff
+ * - use the fewest possible number of bytes for their code points
+ * - use at most 4 bytes (for i>=5 it is 0x10ffff= utf8_minChar32[extraBytesToWrite] &&
+ !U_IS_SURROGATE(ch)
+ ) {
+ return ch; /* return the code point */
}
- /* no input or only skipping callback calls */
- *err = U_INDEX_OUTOFBOUNDS_ERROR;
+ for(i = 0; sourceInitial < source; ++i) {
+ cnv->toUBytes[i] = *sourceInitial++;
+ }
+ cnv->toULength = i;
+ *err = U_ILLEGAL_CHAR_FOUND;
return 0xffff;
}
@@ -968,11 +738,11 @@ static const UConverterImpl _UTF8Impl={
NULL,
NULL,
- T_UConverter_toUnicode_UTF8,
- T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC,
- T_UConverter_fromUnicode_UTF8,
- T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC,
- T_UConverter_getNextUChar_UTF8,
+ ucnv_toUnicode_UTF8,
+ ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
+ ucnv_fromUnicode_UTF8,
+ ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
+ ucnv_getNextUChar_UTF8,
NULL,
NULL,
@@ -985,7 +755,8 @@ static const UConverterImpl _UTF8Impl={
static const UConverterStaticData _UTF8StaticData={
sizeof(UConverterStaticData),
"UTF-8",
- 1208, UCNV_IBM, UCNV_UTF8, 1, 4,
+ 1208, UCNV_IBM, UCNV_UTF8,
+ 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
{ 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
0,
0,
@@ -1001,6 +772,29 @@ const UConverterSharedData _UTF8Data={
/* CESU-8 converter data ---------------------------------------------------- */
+static const UConverterImpl _CESU8Impl={
+ UCNV_CESU8,
+
+ NULL,
+ NULL,
+
+ NULL,
+ NULL,
+ NULL,
+
+ ucnv_toUnicode_UTF8,
+ ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
+ ucnv_fromUnicode_UTF8,
+ ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
+ NULL,
+
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ ucnv_getCompleteUnicodeSet
+};
+
static const UConverterStaticData _CESU8StaticData={
sizeof(UConverterStaticData),
"CESU-8",
@@ -1014,6 +808,8 @@ static const UConverterStaticData _CESU8StaticData={
const UConverterSharedData _CESU8Data={
sizeof(UConverterSharedData), ~((uint32_t) 0),
- NULL, NULL, &_CESU8StaticData, FALSE, &_UTF8Impl,
+ NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
0
};
+
+#endif
diff --git a/icuSources/common/ucnvbocu.c b/icuSources/common/ucnvbocu.c
index 3db6caeb..dacd6c16 100644
--- a/icuSources/common/ucnvbocu.c
+++ b/icuSources/common/ucnvbocu.c
@@ -1,7 +1,7 @@
/*
******************************************************************************
*
-* Copyright (C) 2002-2003, International Business Machines
+* Copyright (C) 2002-2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@@ -14,10 +14,13 @@
* created by: Markus W. Scherer
*
* This is an implementation of the Binary Ordered Compression for Unicode,
-* in its MIME-friendly form as defined in ### TODO http://... 1. doc/papers 2. design
+* in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
*/
#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
#include "unicode/ucnv.h"
#include "unicode/ucnv_cb.h"
#include "ucnv_bld.h"
@@ -402,7 +405,7 @@ U_ALIGN_CODE(16)
offsets=pArgs->offsets;
/* get the converter state from UConverter */
- c=cnv->fromUSurrogateLead;
+ c=cnv->fromUChar32;
prev=(int32_t)cnv->fromUnicodeStatus;
if(prev==0) {
prev=BOCU1_ASCII_PREV;
@@ -424,47 +427,25 @@ fastSingle:
if(targetCapacity>diff) {
targetCapacity=diff;
}
- /* ### TODO if WithOffsets is never used without offsets, then remove all offsets==NULL branches and checks */
- if(offsets==NULL) {
- while(targetCapacity>0 && (c=*source)<0x3000) {
- if(c<=0x20) {
- if(c!=0x20) {
- prev=BOCU1_ASCII_PREV;
- }
- *target++=(uint8_t)c;
- } else {
- diff=c-prev;
- if(DIFF_IS_SINGLE(diff)) {
- prev=BOCU1_SIMPLE_PREV(c);
- *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
- } else {
- break;
- }
+ while(targetCapacity>0 && (c=*source)<0x3000) {
+ if(c<=0x20) {
+ if(c!=0x20) {
+ prev=BOCU1_ASCII_PREV;
}
+ *target++=(uint8_t)c;
+ *offsets++=nextSourceIndex++;
++source;
--targetCapacity;
- }
- } else {
- while(targetCapacity>0 && (c=*source)<0x3000) {
- if(c<=0x20) {
- if(c!=0x20) {
- prev=BOCU1_ASCII_PREV;
- }
- *target++=(uint8_t)c;
+ } else {
+ diff=c-prev;
+ if(DIFF_IS_SINGLE(diff)) {
+ prev=BOCU1_SIMPLE_PREV(c);
+ *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
*offsets++=nextSourceIndex++;
++source;
--targetCapacity;
} else {
- diff=c-prev;
- if(DIFF_IS_SINGLE(diff)) {
- prev=BOCU1_SIMPLE_PREV(c);
- *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
- *offsets++=nextSourceIndex++;
- ++source;
- --targetCapacity;
- } else {
- break;
- }
+ break;
}
}
}
@@ -488,9 +469,7 @@ fastSingle:
prev=BOCU1_ASCII_PREV;
}
*target++=(uint8_t)c;
- if(offsets!=NULL) {
- *offsets++=sourceIndex;
- }
+ *offsets++=sourceIndex;
--targetCapacity;
sourceIndex=nextSourceIndex;
@@ -527,9 +506,7 @@ getTrail:
prev=BOCU1_PREV(c);
if(DIFF_IS_SINGLE(diff)) {
*target++=(uint8_t)PACK_SINGLE_DIFF(diff);
- if(offsets!=NULL) {
- *offsets++=sourceIndex;
- }
+ *offsets++=sourceIndex;
--targetCapacity;
sourceIndex=nextSourceIndex;
if(c<0x3000) {
@@ -551,10 +528,8 @@ getTrail:
}
*target++=(uint8_t)diff;
*target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
- if(offsets!=NULL) {
- *offsets++=sourceIndex;
- *offsets++=sourceIndex;
- }
+ *offsets++=sourceIndex;
+ *offsets++=sourceIndex;
targetCapacity-=2;
sourceIndex=nextSourceIndex;
} else {
@@ -566,40 +541,23 @@ getTrail:
/* write the output character bytes from diff and length */
/* from the first if in the loop we know that targetCapacity>0 */
if(length<=targetCapacity) {
- if(offsets==NULL) {
- switch(length) {
- /* each branch falls through to the next one */
- case 4:
- *target++=(uint8_t)(diff>>24);
- case 3:
- *target++=(uint8_t)(diff>>16);
- /* case 2: handled above */
- *target++=(uint8_t)(diff>>8);
- /* case 1: handled above */
- *target++=(uint8_t)diff;
- default:
- /* will never occur */
- break;
- }
- } else {
- switch(length) {
- /* each branch falls through to the next one */
- case 4:
- *target++=(uint8_t)(diff>>24);
- *offsets++=sourceIndex;
- case 3:
- *target++=(uint8_t)(diff>>16);
- *offsets++=sourceIndex;
- case 2:
- *target++=(uint8_t)(diff>>8);
- *offsets++=sourceIndex;
- /* case 1: handled above */
- *target++=(uint8_t)diff;
- *offsets++=sourceIndex;
- default:
- /* will never occur */
- break;
- }
+ switch(length) {
+ /* each branch falls through to the next one */
+ case 4:
+ *target++=(uint8_t)(diff>>24);
+ *offsets++=sourceIndex;
+ case 3:
+ *target++=(uint8_t)(diff>>16);
+ *offsets++=sourceIndex;
+ case 2:
+ *target++=(uint8_t)(diff>>8);
+ *offsets++=sourceIndex;
+ /* case 1: handled above */
+ *target++=(uint8_t)diff;
+ *offsets++=sourceIndex;
+ default:
+ /* will never occur */
+ break;
}
targetCapacity-=length;
sourceIndex=nextSourceIndex;
@@ -635,19 +593,13 @@ getTrail:
/* each branch falls through to the next one */
case 3:
*target++=(uint8_t)(diff>>16);
- if(offsets!=NULL) {
- *offsets++=sourceIndex;
- }
+ *offsets++=sourceIndex;
case 2:
*target++=(uint8_t)(diff>>8);
- if(offsets!=NULL) {
- *offsets++=sourceIndex;
- }
+ *offsets++=sourceIndex;
case 1:
*target++=(uint8_t)diff;
- if(offsets!=NULL) {
- *offsets++=sourceIndex;
- }
+ *offsets++=sourceIndex;
default:
/* will never occur */
break;
@@ -666,19 +618,9 @@ getTrail:
}
}
- if(pArgs->flush && source>=sourceLimit) {
- /* reset the state for the next conversion */
- if(c<0 && U_SUCCESS(*pErrorCode)) {
- /* a Unicode code point remains incomplete (only a first surrogate) */
- *pErrorCode=U_TRUNCATED_CHAR_FOUND;
- }
- cnv->fromUSurrogateLead=0;
- cnv->fromUnicodeStatus=BOCU1_ASCII_PREV;
- } else {
- /* set the converter state back into UConverter */
- cnv->fromUSurrogateLead= c<0 ? (UChar)-c : 0;
- cnv->fromUnicodeStatus=(uint32_t)prev;
- }
+ /* set the converter state back into UConverter */
+ cnv->fromUChar32= c<0 ? -c : 0;
+ cnv->fromUnicodeStatus=(uint32_t)prev;
/* write back the updated pointers */
pArgs->source=source;
@@ -711,7 +653,7 @@ _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
targetCapacity=pArgs->targetLimit-pArgs->target;
/* get the converter state from UConverter */
- c=cnv->fromUSurrogateLead;
+ c=cnv->fromUChar32;
prev=(int32_t)cnv->fromUnicodeStatus;
if(prev==0) {
prev=BOCU1_ASCII_PREV;
@@ -897,19 +839,9 @@ getTrail:
}
}
- if(pArgs->flush && source>=sourceLimit) {
- /* reset the state for the next conversion */
- if(c<0 && U_SUCCESS(*pErrorCode)) {
- /* a Unicode code point remains incomplete (only a first surrogate) */
- *pErrorCode=U_TRUNCATED_CHAR_FOUND;
- }
- cnv->fromUSurrogateLead=0;
- cnv->fromUnicodeStatus=BOCU1_ASCII_PREV;
- } else {
- /* set the converter state back into UConverter */
- cnv->fromUSurrogateLead= c<0 ? (UChar)-c : 0;
- cnv->fromUnicodeStatus=(uint32_t)prev;
- }
+ /* set the converter state back into UConverter */
+ cnv->fromUChar32= c<0 ? -c : 0;
+ cnv->fromUnicodeStatus=(uint32_t)prev;
/* write back the updated pointers */
pArgs->source=source;
@@ -1039,7 +971,6 @@ _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
nextSourceIndex=0;
/* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
-loop:
if(count>0 && byteIndex>0 && targetdiff) {
count=diff;
}
- if(offsets==NULL) {
- while(count>0) {
- if(BOCU1_START_NEG_2<=(c=*source) && c0) {
- if(BOCU1_START_NEG_2<=(c=*source) && c0) {
+ if(BOCU1_START_NEG_2<=(c=*source) && c0x10ffff) {
- goto callback;
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ goto endloop;
}
break;
}
@@ -1201,23 +1107,17 @@ getTrail:
prev=BOCU1_PREV(c);
if(c<=0xffff) {
*target++=(UChar)c;
- if(offsets!=NULL) {
- *offsets++=sourceIndex;
- }
+ *offsets++=sourceIndex;
} else {
/* output surrogate pair */
*target++=UTF16_LEAD(c);
if(targetUCharErrorBuffer[0]=UTF16_TRAIL(c);
cnv->UCharErrorBufferLength=1;
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
@@ -1228,90 +1128,22 @@ getTrail:
}
endloop:
- if(pArgs->flush && source>=sourceLimit) {
- /* reset the state for the next conversion */
- if(byteIndex>0 && U_SUCCESS(*pErrorCode)) {
- /* a character byte sequence remains incomplete */
- *pErrorCode=U_TRUNCATED_CHAR_FOUND;
- }
+ if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
+ /* set the converter state in UConverter to deal with the next character */
cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
cnv->mode=0;
- cnv->toULength=0;
} else {
/* set the converter state back into UConverter */
cnv->toUnicodeStatus=(uint32_t)prev;
cnv->mode=(diff<<2)|count;
- cnv->toULength=byteIndex;
}
+ cnv->toULength=byteIndex;
-finish:
/* write back the updated pointers */
pArgs->source=(const char *)source;
pArgs->target=target;
pArgs->offsets=offsets;
return;
-
-callback:
- /* call the callback function with all the preparations and post-processing */
- /* update the arguments structure */
- pArgs->source=(const char *)source;
- pArgs->target=target;
- pArgs->offsets=offsets;
-
- /* copy the current bytes to invalidCharBuffer */
- cnv->invalidCharBuffer[0]=bytes[0];
- cnv->invalidCharBuffer[1]=bytes[1];
- cnv->invalidCharBuffer[2]=bytes[2];
- cnv->invalidCharBuffer[3]=bytes[3];
- cnv->invalidCharLength=(int8_t)byteIndex;
-
- /* set the converter state in UConverter to deal with the next character */
- cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
- cnv->mode=0;
- cnv->toULength=0;
-
- /* call the callback function */
- *pErrorCode=U_ILLEGAL_CHAR_FOUND;
- cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, (const char *)bytes, byteIndex, UCNV_ILLEGAL, pErrorCode);
-
- /* get the converter state from UConverter */
- prev=(int32_t)cnv->toUnicodeStatus;
- if(prev==0) {
- prev=BOCU1_ASCII_PREV;
- }
- diff=cnv->mode;
- count=diff&3;
- diff>>=2;
-
- byteIndex=cnv->toULength;
-
- /* update target and deal with offsets if necessary */
- offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
- target=pArgs->target;
-
- /* update the source pointer and index */
- sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
- source=(const uint8_t *)pArgs->source;
-
- /*
- * If the callback overflowed the target, then we need to
- * stop here with an overflow indication.
- */
- if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
- goto endloop;
- } else if(cnv->UCharErrorBufferLength>0) {
- /* target is full */
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
- goto endloop;
- } else if(U_FAILURE(*pErrorCode)) {
- /* reset and break on error */
- cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
- cnv->mode=0;
- cnv->toULength=0;
- goto finish;
- } else {
- goto loop;
- }
}
/*
@@ -1356,7 +1188,6 @@ U_ALIGN_CODE(16)
bytes=cnv->toUBytes;
/* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
-loop:
if(count>0 && byteIndex>0 && target