[apple/icu.git] / icuSources / i18n / unesctrn.cpp

/*
 **********************************************************************
 *   Copyright (c) 2001-2008, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   11/19/2001  aliu        Creation.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uchar.h"
#include "unesctrn.h"
#include "util.h"

#include "cmemory.h"

U_NAMESPACE_BEGIN

/**
 * Special character marking the end of the spec[] array.
 */
static const UChar END = 0xFFFF;

// Layout shared by every SPEC_* table below.  A table is a sequence of
// records followed by END.  Each record, as decoded by
// handleTransliterate(), is:
//
//     prefixLen, suffixLen, radix, minDigits, maxDigits,
//     <prefixLen prefix code units>, <suffixLen suffix code units>
//
// The prefix must appear literally, followed by minDigits..maxDigits
// digits in the given radix, followed by the literal suffix.  The numeric
// literals carrying a trailing /*x*/ comment are the code unit values of
// the character named in that comment.

// Unicode: "U+10FFFF" hex, min=4, max=6
static const UChar SPEC_Unicode[] = {
    2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
    END
};

// Java: "\\uFFFF" hex, min=4, max=4
static const UChar SPEC_Java[] = {
    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
    END
};

// C: "\\uFFFF" hex, min=4, max=4; "\\U0010FFFF" hex, min=8, max=8
static const UChar SPEC_C[] = {
    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
    2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
    END
};

// XML: "&#x10FFFF;" hex, min=1, max=6
static const UChar SPEC_XML[] = {
    3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
    END
};

// XML10: "&#1114111;" dec, min=1, max=7 (not really "Hex-Any")
static const UChar SPEC_XML10[] = {
    2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
    END
};

// Perl: "\\x{263A}" hex, min=1, max=6
static const UChar SPEC_Perl[] = {
    3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
    END
};

// All: Unicode, Java, C, XML, XML10, Perl.  The records are tried in the
// order listed here and the first one that matches is used.
static const UChar SPEC_Any[] = {
    2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java (same 4-digit form as C)
    2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (8-digit form)
    3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
    2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
    3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
    END
};

// ICU class-ID boilerplate for UnescapeTransliterator; the macro itself is
// defined outside this file (see unicode/uobject.h).
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)

/**
 * Duplicates an END-terminated spec array.
 *
 * @param spec array of code units terminated by END
 * @return a buffer obtained from uprv_malloc() holding every element of
 *         spec up to and including the END terminator, or NULL if the
 *         allocation failed
 */
static UChar* copySpec(const UChar* spec) {
    // Start at 1 so that the END terminator itself is counted.
    int32_t count = 1;
    for (const UChar *p = spec; *p != END; ++p) {
        ++count;
    }

    UChar *copy = (UChar *)uprv_malloc(count*sizeof(UChar));
    if (copy == NULL) {
        // Out of memory: hand the failure back to the caller.
        return NULL;
    }
    uprv_memcpy(copy, spec, count*sizeof(copy[0]));
    return copy;
}

/**
 * Factory methods.  Ignore the context.
 * Each one returns a newly allocated UnescapeTransliterator carrying the
 * given ID and built from one of the SPEC_* tables.
 */
// Registered under "Hex-Any/Unicode" by registerIDs(); uses SPEC_Unicode.
static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
    return new UnescapeTransliterator(ID, SPEC_Unicode);
}
// Registered under "Hex-Any/Java" by registerIDs(); uses SPEC_Java.
static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
    return new UnescapeTransliterator(ID, SPEC_Java);
}
// Registered under "Hex-Any/C" by registerIDs(); uses SPEC_C.
static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
    return new UnescapeTransliterator(ID, SPEC_C);
}
// Registered under "Hex-Any/XML" by registerIDs(); uses SPEC_XML.
static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
    return new UnescapeTransliterator(ID, SPEC_XML);
}
// Registered under "Hex-Any/XML10" by registerIDs(); uses SPEC_XML10
// (decimal digits rather than hex).
static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
    return new UnescapeTransliterator(ID, SPEC_XML10);
}
// Registered under "Hex-Any/Perl" by registerIDs(); uses SPEC_Perl.
static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
    return new UnescapeTransliterator(ID, SPEC_Perl);
}
// Registered under plain "Hex-Any" by registerIDs(); uses SPEC_Any, which
// combines the forms of all the other tables.
static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
    return new UnescapeTransliterator(ID, SPEC_Any);
}

/**
 * Registers standard variants with the system.  Called by
 * Transliterator during initialization.
 */
void UnescapeTransliterator::registerIDs() {
    Token t = integerToken(0);

    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);

    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);

    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);

    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);

    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);

    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);

    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
}

/**
 * Constructor.  Takes the encoded spec array.
 *
 * The END-terminated array is duplicated with copySpec(), so the
 * caller's array is not retained.  If that allocation fails, spec is
 * left NULL.
 *
 * @param newID   ID handed to the Transliterator base class
 * @param newSpec END-terminated table of escape forms to recognise
 */
UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
                                               const UChar *newSpec) :
    Transliterator(newID, NULL),
    spec(copySpec(newSpec))
{
}

/**
 * Copy constructor.
 *
 * The new object receives its own duplicate of the other object's spec
 * array (made with copySpec()), so the two never share a buffer.
 */
UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
    Transliterator(o),
    spec(copySpec(o.spec))
{
}

/**
 * Destructor.  Releases the spec array that the constructors obtained
 * from copySpec().
 */
UnescapeTransliterator::~UnescapeTransliterator() {
    uprv_free(spec);
}

/**
 * Transliterator API.
 *
 * @return a new UnescapeTransliterator allocated with new and
 *         initialised from this object by the copy constructor
 */
Transliterator* UnescapeTransliterator::clone() const {
    return new UnescapeTransliterator(*this);
}

/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Scans text from pos.start up to pos.limit.  At each position the
 * records in spec[] are tried in order; the first one whose prefix,
 * digit run (minDigits..maxDigits digits in the record's radix) and
 * suffix all match is replaced by the code point the digits encode.
 * Positions where no record matches are stepped over one code point
 * at a time.
 *
 * @param text          the text being modified in place
 * @param pos           on entry, the range to process; on return
 *                      pos.start is the position reached, and
 *                      pos.limit and pos.contextLimit have been
 *                      adjusted by the net change in text length
 * @param isIncremental if true, scanning stops as soon as the range
 *                      ends in the middle of a possible escape, with
 *                      pos.start left at the beginning of that
 *                      partial escape
 */
void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
                                                 UBool isIncremental) const {
    if (spec == NULL) {
        // copySpec() returns NULL when its allocation fails.  With no
        // forms available nothing can match, so consume the range
        // unchanged rather than dereference a null pointer.
        if (pos.start < pos.limit) {
            pos.start = pos.limit;
        }
        return;
    }

    int32_t start = pos.start;
    int32_t limit = pos.limit;
    int32_t i, ipat;

    while (start < limit) {
        // Loop over the forms in spec[].  Exit this loop when we
        // match one of the specs.  Exit the outer loop if a
        // partial match is detected and isIncremental is true.
        for (ipat=0; spec[ipat] != END; ) {

            // Read the header
            int32_t prefixLen = spec[ipat++];
            int32_t suffixLen = spec[ipat++];
            int8_t  radix     = (int8_t) spec[ipat++];
            int32_t minDigits = spec[ipat++];
            int32_t maxDigits = spec[ipat++];

            // ipat now indexes the first prefix code unit; the suffix
            // code units follow at ipat + prefixLen.

            // s is a copy of start that is advanced over the
            // characters as we parse them.
            int32_t s = start;
            UBool match = TRUE;

            for (i=0; i<prefixLen; ++i) {
                if (s >= limit) {
                    if (i > 0) {
                        // We've already matched a character.  This is
                        // a partial match, so we return if in
                        // incremental mode.  In non-incremental mode,
                        // go to the next spec.
                        if (isIncremental) {
                            goto exit;
                        }
                        match = FALSE;
                        break;
                    }
                }
                UChar c = text.charAt(s++);
                if (c != spec[ipat + i]) {
                    match = FALSE;
                    break;
                }
            }

            if (match) {
                // Accumulate in an unsigned type: an 8-digit hex form
                // can encode values above INT32_MAX, and overflowing a
                // signed UChar32 would be undefined behaviour.
                uint32_t u = 0;
                int32_t digitCount = 0;
                for (;;) {
                    if (s >= limit) {
                        // Check for partial match in incremental mode.
                        if (s > start && isIncremental) {
                            goto exit;
                        }
                        break;
                    }
                    UChar32 ch = text.char32At(s);
                    int32_t digit = u_digit(ch, radix);
                    if (digit < 0) {
                        break;
                    }
                    s += UTF_CHAR_LENGTH(ch);
                    u = (u * (uint32_t)radix) + (uint32_t)digit;
                    if (++digitCount == maxDigits) {
                        break;
                    }
                }

                match = (digitCount >= minDigits);

                if (match) {
                    for (i=0; i<suffixLen; ++i) {
                        if (s >= limit) {
                            // Check for partial match in incremental mode.
                            if (s > start && isIncremental) {
                                goto exit;
                            }
                            match = FALSE;
                            break;
                        }
                        UChar c = text.charAt(s++);
                        if (c != spec[ipat + prefixLen + i]) {
                            match = FALSE;
                            break;
                        }
                    }

                    if (match) {
                        // At this point, we have a match.  Replace the
                        // whole escape [start, s) with the decoded
                        // code point and shrink limit by the number
                        // of code units removed.
                        UChar32 codePoint = (UChar32)u;
                        UnicodeString str(codePoint);
                        text.handleReplaceBetween(start, s, str);
                        limit -= s - start - str.length();
                        // NOTE(review): if str can come out empty
                        // (presumably when codePoint is not a valid
                        // code point -- confirm UnicodeString(UChar32)
                        // behaviour), the advance after this loop
                        // steps over one character that was never
                        // scanned.
                        // The following break statement leaves the
                        // loop that is traversing the forms in
                        // spec[].  We then parse the next input
                        // character.
                        break;
                    }
                }
            }

            // No match for this record: skip its prefix and suffix
            // code units to reach the next record's header.
            ipat += prefixLen + suffixLen;
        }

        // Step past either the replacement just written or the
        // unmatched character at start.
        if (start < limit) {
            start += UTF_CHAR_LENGTH(text.char32At(start));
        }
    }

  exit:
    pos.contextLimit += limit - pos.limit;
    pos.limit = limit;
    pos.start = start;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

//eof
Commit	Line	Data
b75a7d8f	1	/*
374ca955	2	**********************************************************************
46f4442e	3	* Copyright (c) 2001-2008, International Business Machines
374ca955 A	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	* Date Name Description
	7	* 11/19/2001 aliu Creation.
	8	**********************************************************************
	9	*/
b75a7d8f A	10
	11	#include "unicode/utypes.h"
	12
	13	#if !UCONFIG_NO_TRANSLITERATION
	14
	15	#include "unicode/uchar.h"
	16	#include "unesctrn.h"
	17	#include "util.h"
	18
	19	#include "cmemory.h"
	20
	21	U_NAMESPACE_BEGIN
	22
	23	/**
	24	* Special character marking the end of the spec[] array.
	25	*/
	26	static const UChar END = 0xFFFF;
	27
	28	// Unicode: "U+10FFFF" hex, min=4, max=6
	29	static const UChar SPEC_Unicode[] = {
	30	2, 0, 16, 4, 6, 85/U/, 43/+/,
	31	END
	32	};
	33
	34	// Java: "\\uFFFF" hex, min=4, max=4
	35	static const UChar SPEC_Java[] = {
	36	2, 0, 16, 4, 4, 92/\/, 117/u/,
	37	END
	38	};
	39
	40	// C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
	41	static const UChar SPEC_C[] = {
	42	2, 0, 16, 4, 4, 92/\/, 117/u/,
	43	2, 0, 16, 8, 8, 92/\/, 85/U/,
	44	END
	45	};
	46
	47	// XML: "􏿿" hex, min=1, max=6
	48	static const UChar SPEC_XML[] = {
	49	3, 1, 16, 1, 6, 38/&/, 35/#/, 120/x/, 59/;/,
	50	END
	51	};
	52
	53	// XML10: "􏿿" dec, min=1, max=7 (not really "Hex-Any")
	54	static const UChar SPEC_XML10[] = {
	55	2, 1, 10, 1, 7, 38/&/, 35/#/, 59/;/,
	56	END
	57	};
	58
	59	// Perl: "\\x{263A}" hex, min=1, max=6
	60	static const UChar SPEC_Perl[] = {
	61	3, 1, 16, 1, 6, 92/\/, 120/x/, 123/{/, 125/}/,
	62	END
	63	};
	64
	65	// All: Java, C, Perl, XML, XML10, Unicode
	66	static const UChar SPEC_Any[] = {
	67	2, 0, 16, 4, 6, 85/U/, 43/+/, // Unicode
	68	2, 0, 16, 4, 4, 92/\/, 117/u/, // Java
	69	2, 0, 16, 8, 8, 92/\/, 85/U/, // C (surrogates)
	70	3, 1, 16, 1, 6, 38/&/, 35/#/, 120/x/, 59/;/, // XML
	71	2, 1, 10, 1, 7, 38/&/, 35/#/, 59/;/, // XML10
	72	3, 1, 16, 1, 6, 92/\/, 120/x/, 123/{/, 125/}/, // Perl
	73	END
74	};
75
374ca955	76	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
b75a7d8f	77
73c04bcf A	78	static UChar* copySpec(const UChar* spec) {
	79	int32_t len = 0;
	80	while (spec[len] != END) {
	81	++len;
	82	}
	83	++len;
	84	UChar result = (UChar )uprv_malloc(len*sizeof(UChar));
46f4442e A	85	// Check for memory allocation error.
	86	if (result != NULL) {
	87	uprv_memcpy(result, spec, len*sizeof(result[0]));
	88	}
73c04bcf A	89	return result;
	90	}
	91
b75a7d8f A	92	/**
	93	* Factory methods. Ignore the context.
	94	*/
73c04bcf	95	static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /context/) {
b75a7d8f A	96	return new UnescapeTransliterator(ID, SPEC_Unicode);
b75a7d8f A	97	}
73c04bcf	98	static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /context/) {
b75a7d8f A	99	return new UnescapeTransliterator(ID, SPEC_Java);
b75a7d8f A	100	}
73c04bcf	101	static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /context/) {
b75a7d8f A	102	return new UnescapeTransliterator(ID, SPEC_C);
b75a7d8f A	103	}
73c04bcf	104	static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /context/) {
b75a7d8f A	105	return new UnescapeTransliterator(ID, SPEC_XML);
b75a7d8f A	106	}
73c04bcf	107	static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /context/) {
b75a7d8f A	108	return new UnescapeTransliterator(ID, SPEC_XML10);
b75a7d8f A	109	}
73c04bcf	110	static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /context/) {
b75a7d8f A	111	return new UnescapeTransliterator(ID, SPEC_Perl);
b75a7d8f A	112	}
73c04bcf	113	static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /context/) {
b75a7d8f A	114	return new UnescapeTransliterator(ID, SPEC_Any);
	115	}
	116
	117	/**
	118	* Registers standard variants with the system. Called by
	119	* Transliterator during initialization.
	120	*/
	121	void UnescapeTransliterator::registerIDs() {
	122	Token t = integerToken(0);
	123
374ca955	124	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
b75a7d8f	125
374ca955	126	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
b75a7d8f	127
374ca955	128	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
b75a7d8f	129
374ca955	130	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
b75a7d8f	131
374ca955	132	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
b75a7d8f	133
374ca955	134	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
b75a7d8f	135
374ca955	136	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
b75a7d8f A	137	}
	138
	139	/**
	140	* Constructor. Takes the encoded spec array.
	141	*/
	142	UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
	143	const UChar *newSpec) :
	144	Transliterator(newID, NULL)
	145	{
	146	this->spec = copySpec(newSpec);
	147	}
	148
	149	/**
	150	* Copy constructor.
	151	*/
	152	UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
	153	Transliterator(o) {
	154	this->spec = copySpec(o.spec);
	155	}
	156
	157	UnescapeTransliterator::~UnescapeTransliterator() {
	158	uprv_free(spec);
	159	}
	160
	161	/**
	162	* Transliterator API.
	163	*/
	164	Transliterator* UnescapeTransliterator::clone() const {
	165	return new UnescapeTransliterator(*this);
	166	}
	167
b75a7d8f A	168	/**
	169	* Implements {@link Transliterator#handleTransliterate}.
	170	*/
	171	void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
	172	UBool isIncremental) const {
	173	int32_t start = pos.start;
	174	int32_t limit = pos.limit;
	175	int32_t i, j, ipat;
	176
	177	while (start < limit) {
	178	// Loop over the forms in spec[]. Exit this loop when we
	179	// match one of the specs. Exit the outer loop if a
	180	// partial match is detected and isIncremental is true.
	181	for (j=0, ipat=0; spec[ipat] != END; ++j) {
	182
	183	// Read the header
	184	int32_t prefixLen = spec[ipat++];
	185	int32_t suffixLen = spec[ipat++];
	186	int8_t radix = (int8_t) spec[ipat++];
	187	int32_t minDigits = spec[ipat++];
	188	int32_t maxDigits = spec[ipat++];
	189
	190	// s is a copy of start that is advanced over the
	191	// characters as we parse them.
	192	int32_t s = start;
	193	UBool match = TRUE;
	194
	195	for (i=0; i<prefixLen; ++i) {
	196	if (s >= limit) {
	197	if (i > 0) {
	198	// We've already matched a character. This is
	199	// a partial match, so we return if in
	200	// incremental mode. In non-incremental mode,
	201	// go to the next spec.
	202	if (isIncremental) {
	203	goto exit;
	204	}
	205	match = FALSE;
	206	break;
	207	}
	208	}
	209	UChar c = text.charAt(s++);
	210	if (c != spec[ipat + i]) {
	211	match = FALSE;
	212	break;
	213	}
	214	}
	215
	216	if (match) {
	217	UChar32 u = 0;
	218	int32_t digitCount = 0;
	219	for (;;) {
	220	if (s >= limit) {
	221	// Check for partial match in incremental mode.
	222	if (s > start && isIncremental) {
	223	goto exit;
	224	}
	225	break;
	226	}
	227	UChar32 ch = text.char32At(s);
	228	int32_t digit = u_digit(ch, radix);
	229	if (digit < 0) {
	230	break;
	231	}
232	s += UTF_CHAR_LENGTH(ch);
233	u = (u * radix) + digit;
234	if (++digitCount == maxDigits) {
235	break;
236	}
237	}
238
239	match = (digitCount >= minDigits);
240
241	if (match) {
242	for (i=0; i<suffixLen; ++i) {
243	if (s >= limit) {
244	// Check for partial match in incremental mode.
245	if (s > start && isIncremental) {
246	goto exit;
247	}
248	match = FALSE;
249	break;
250	}
251	UChar c = text.charAt(s++);
252	if (c != spec[ipat + prefixLen + i]) {
253	match = FALSE;
254	break;
255	}
256	}
257
258	if (match) {
259	// At this point, we have a match
260	UnicodeString str(u);
261	text.handleReplaceBetween(start, s, str);
262	limit -= s - start - str.length();
263	// The following break statement leaves the
264	// loop that is traversing the forms in
265	// spec[]. We then parse the next input
266	// character.
267	break;
268	}
269	}
270	}
271
272	ipat += prefixLen + suffixLen;
273	}
274
275	if (start < limit) {
276	start += UTF_CHAR_LENGTH(text.char32At(start));
277	}
278	}
279
280	exit:
281	pos.contextLimit += limit - pos.limit;
282	pos.limit = limit;
283	pos.start = start;
284	}
285
286	U_NAMESPACE_END
287
288	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
289
290	//eof