git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	**********************************************************************
	3	* Copyright (c) 2001-2004, International Business Machines
	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	* Date Name Description
	7	* 11/19/2001 aliu Creation.
	8	**********************************************************************
	9	*/
	10
	11	#include "unicode/utypes.h"
	12
	13	#if !UCONFIG_NO_TRANSLITERATION
	14
	15	#include "unicode/uchar.h"
	16	#include "unesctrn.h"
	17	#include "util.h"
	18
	19	#include "cmemory.h"
	20
	21	U_NAMESPACE_BEGIN
	22
	23	/**
	24	* Special character marking the end of the spec[] array.
	25	*/
	26	static const UChar END = 0xFFFF;
	27
	28	// Unicode: "U+10FFFF" hex, min=4, max=6
	29	static const UChar SPEC_Unicode[] = {
	30	2, 0, 16, 4, 6, 85/U/, 43/+/,
	31	END
	32	};
	33
	34	// Java: "\\uFFFF" hex, min=4, max=4
	35	static const UChar SPEC_Java[] = {
	36	2, 0, 16, 4, 4, 92/\/, 117/u/,
	37	END
	38	};
	39
	40	// C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
	41	static const UChar SPEC_C[] = {
	42	2, 0, 16, 4, 4, 92/\/, 117/u/,
	43	2, 0, 16, 8, 8, 92/\/, 85/U/,
	44	END
	45	};
	46
	47	// XML: "􏿿" hex, min=1, max=6
	48	static const UChar SPEC_XML[] = {
	49	3, 1, 16, 1, 6, 38/&/, 35/#/, 120/x/, 59/;/,
	50	END
	51	};
	52
	53	// XML10: "􏿿" dec, min=1, max=7 (not really "Hex-Any")
	54	static const UChar SPEC_XML10[] = {
	55	2, 1, 10, 1, 7, 38/&/, 35/#/, 59/;/,
	56	END
	57	};
	58
	59	// Perl: "\\x{263A}" hex, min=1, max=6
	60	static const UChar SPEC_Perl[] = {
	61	3, 1, 16, 1, 6, 92/\/, 120/x/, 123/{/, 125/}/,
	62	END
	63	};
	64
	65	// All: Java, C, Perl, XML, XML10, Unicode
	66	static const UChar SPEC_Any[] = {
	67	2, 0, 16, 4, 6, 85/U/, 43/+/, // Unicode
	68	2, 0, 16, 4, 4, 92/\/, 117/u/, // Java
	69	2, 0, 16, 8, 8, 92/\/, 85/U/, // C (surrogates)
	70	3, 1, 16, 1, 6, 38/&/, 35/#/, 120/x/, 59/;/, // XML
	71	2, 1, 10, 1, 7, 38/&/, 35/#/, 59/;/, // XML10
	72	3, 1, 16, 1, 6, 92/\/, 120/x/, 123/{/, 125/}/, // Perl
	73	END
	74	};
	75
	76	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
	77
	78	/**
	79	* Factory methods. Ignore the context.
	80	*/
	81	Transliterator* UnescapeTransliterator::_createUnicode(const UnicodeString& ID, Token /context/) {
	82	return new UnescapeTransliterator(ID, SPEC_Unicode);
	83	}
	84	Transliterator* UnescapeTransliterator::_createJava(const UnicodeString& ID, Token /context/) {
	85	return new UnescapeTransliterator(ID, SPEC_Java);
	86	}
	87	Transliterator* UnescapeTransliterator::_createC(const UnicodeString& ID, Token /context/) {
	88	return new UnescapeTransliterator(ID, SPEC_C);
	89	}
	90	Transliterator* UnescapeTransliterator::_createXML(const UnicodeString& ID, Token /context/) {
	91	return new UnescapeTransliterator(ID, SPEC_XML);
	92	}
	93	Transliterator* UnescapeTransliterator::_createXML10(const UnicodeString& ID, Token /context/) {
	94	return new UnescapeTransliterator(ID, SPEC_XML10);
	95	}
	96	Transliterator* UnescapeTransliterator::_createPerl(const UnicodeString& ID, Token /context/) {
	97	return new UnescapeTransliterator(ID, SPEC_Perl);
	98	}
	99	Transliterator* UnescapeTransliterator::_createAny(const UnicodeString& ID, Token /context/) {
	100	return new UnescapeTransliterator(ID, SPEC_Any);
	101	}
	102
	103	/**
	104	* Registers standard variants with the system. Called by
	105	* Transliterator during initialization.
	106	*/
	107	void UnescapeTransliterator::registerIDs() {
	108	Token t = integerToken(0);
	109
	110	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
	111
	112	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
	113
	114	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
	115
	116	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
	117
	118	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
	119
	120	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
	121
	122	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
	123	}
	124
	125	/**
	126	* Constructor. Takes the encoded spec array.
	127	*/
	128	UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
	129	const UChar *newSpec) :
	130	Transliterator(newID, NULL)
	131	{
	132	this->spec = copySpec(newSpec);
	133	}
	134
	135	/**
	136	* Copy constructor.
	137	*/
	138	UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
	139	Transliterator(o) {
	140	this->spec = copySpec(o.spec);
	141	}
	142
	143	UnescapeTransliterator::~UnescapeTransliterator() {
	144	uprv_free(spec);
	145	}
	146
	147	/**
	148	* Transliterator API.
	149	*/
	150	Transliterator* UnescapeTransliterator::clone() const {
	151	return new UnescapeTransliterator(*this);
	152	}
	153
	154	UChar* UnescapeTransliterator::copySpec(const UChar* spec) {
	155	int32_t len = 0;
	156	while (spec[len] != END) {
	157	++len;
	158	}
	159	++len;
	160	UChar result = (UChar )uprv_malloc(len*sizeof(UChar));
	161	uprv_memcpy(result, spec, len*sizeof(result[0]));
	162	return result;
	163	}
	164
	165	/**
	166	* Implements {@link Transliterator#handleTransliterate}.
	167	*/
	168	void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
	169	UBool isIncremental) const {
	170	int32_t start = pos.start;
	171	int32_t limit = pos.limit;
	172	int32_t i, j, ipat;
	173
	174	while (start < limit) {
	175	// Loop over the forms in spec[]. Exit this loop when we
	176	// match one of the specs. Exit the outer loop if a
	177	// partial match is detected and isIncremental is true.
	178	for (j=0, ipat=0; spec[ipat] != END; ++j) {
	179
	180	// Read the header
	181	int32_t prefixLen = spec[ipat++];
	182	int32_t suffixLen = spec[ipat++];
	183	int8_t radix = (int8_t) spec[ipat++];
	184	int32_t minDigits = spec[ipat++];
	185	int32_t maxDigits = spec[ipat++];
	186
	187	// s is a copy of start that is advanced over the
	188	// characters as we parse them.
	189	int32_t s = start;
	190	UBool match = TRUE;
	191
	192	for (i=0; i<prefixLen; ++i) {
	193	if (s >= limit) {
	194	if (i > 0) {
	195	// We've already matched a character. This is
	196	// a partial match, so we return if in
	197	// incremental mode. In non-incremental mode,
	198	// go to the next spec.
	199	if (isIncremental) {
	200	goto exit;
	201	}
	202	match = FALSE;
	203	break;
	204	}
	205	}
	206	UChar c = text.charAt(s++);
	207	if (c != spec[ipat + i]) {
	208	match = FALSE;
	209	break;
	210	}
	211	}
	212
	213	if (match) {
	214	UChar32 u = 0;
	215	int32_t digitCount = 0;
	216	for (;;) {
	217	if (s >= limit) {
	218	// Check for partial match in incremental mode.
	219	if (s > start && isIncremental) {
	220	goto exit;
	221	}
	222	break;
	223	}
	224	UChar32 ch = text.char32At(s);
	225	int32_t digit = u_digit(ch, radix);
	226	if (digit < 0) {
	227	break;
	228	}
	229	s += UTF_CHAR_LENGTH(ch);
	230	u = (u * radix) + digit;
	231	if (++digitCount == maxDigits) {
	232	break;
	233	}
	234	}
	235
	236	match = (digitCount >= minDigits);
	237
	238	if (match) {
	239	for (i=0; i<suffixLen; ++i) {
	240	if (s >= limit) {
	241	// Check for partial match in incremental mode.
	242	if (s > start && isIncremental) {
	243	goto exit;
	244	}
	245	match = FALSE;
	246	break;
	247	}
	248	UChar c = text.charAt(s++);
	249	if (c != spec[ipat + prefixLen + i]) {
	250	match = FALSE;
	251	break;
	252	}
	253	}
	254
	255	if (match) {
	256	// At this point, we have a match
	257	UnicodeString str(u);
	258	text.handleReplaceBetween(start, s, str);
	259	limit -= s - start - str.length();
	260	// The following break statement leaves the
	261	// loop that is traversing the forms in
	262	// spec[]. We then parse the next input
	263	// character.
	264	break;
	265	}
	266	}
	267	}
	268
	269	ipat += prefixLen + suffixLen;
	270	}
	271
	272	if (start < limit) {
	273	start += UTF_CHAR_LENGTH(text.char32At(start));
	274	}
	275	}
	276
	277	exit:
	278	pos.contextLimit += limit - pos.limit;
	279	pos.limit = limit;
	280	pos.start = start;
	281	}
	282
	283	U_NAMESPACE_END
	284
	285	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
	286
	287	//eof