git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	**********************************************************************
	3	* Copyright (C) 1999-2004, International Business Machines
	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	* Date Name Description
	7	* 11/17/99 aliu Creation.
	8	**********************************************************************
	9	*/
	10
	11	#include "unicode/utypes.h"
	12
	13	#if !UCONFIG_NO_TRANSLITERATION
	14
	15	#include "unicode/unistr.h"
	16	#include "unicode/uniset.h"
	17	#include "rbt_set.h"
	18	#include "rbt_rule.h"
	19	#include "cmemory.h"
	20	#include "putilimp.h"
	21
	22	U_CDECL_BEGIN
	23	static void U_EXPORT2 U_CALLCONV _deleteRule(void *rule) {
	24	delete (U_NAMESPACE_QUALIFIER TransliterationRule *)rule;
	25	}
	26	U_CDECL_END
	27
	28	//----------------------------------------------------------------------
	29	// BEGIN Debugging support
	30	//----------------------------------------------------------------------
	31
	32	// #define DEBUG_RBT
	33
	34	#ifdef DEBUG_RBT
	35	#include <stdio.h>
	36	#include "charstr.h"
	37
	38	/**
	39	* @param appendTo result is appended to this param.
	40	* @param input the string being transliterated
	41	* @param pos the index struct
	42	*/
	43	static UnicodeString& _formatInput(UnicodeString &appendTo,
	44	const UnicodeString& input,
	45	const UTransPosition& pos) {
	46	// Output a string of the form aaa{bbb\|ccc\|ddd}eee, where
	47	// the {} indicate the context start and limit, and the \|\|
	48	// indicate the start and limit.
	49	if (0 <= pos.contextStart &&
	50	pos.contextStart <= pos.start &&
	51	pos.start <= pos.limit &&
	52	pos.limit <= pos.contextLimit &&
	53	pos.contextLimit <= input.length()) {
	54
	55	UnicodeString a, b, c, d, e;
	56	input.extractBetween(0, pos.contextStart, a);
	57	input.extractBetween(pos.contextStart, pos.start, b);
	58	input.extractBetween(pos.start, pos.limit, c);
	59	input.extractBetween(pos.limit, pos.contextLimit, d);
	60	input.extractBetween(pos.contextLimit, input.length(), e);
	61	appendTo.append(a).append((UChar)123/{/).append(b).
	62	append((UChar)124/\|/).append(c).append((UChar)124/\|/).append(d).
	63	append((UChar)125/}/).append(e);
	64	} else {
	65	appendTo.append("INVALID UTransPosition");
	66	//appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +
	67	// pos.contextStart + ", s=" + pos.start + ", l=" +
	68	// pos.limit + ", cl=" + pos.contextLimit + "} on " +
	69	// input);
	70	}
	71	return appendTo;
	72	}
	73
	74	// Append a hex string to the target
	75	UnicodeString& _appendHex(uint32_t number,
	76	int32_t digits,
	77	UnicodeString& target) {
	78	static const UChar digitString[] = {
	79	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
	80	0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0
	81	};
	82	while (digits--) {
	83	target += digitString[(number >> (digits*4)) & 0xF];
	84	}
	85	return target;
	86	}
	87
	88	// Replace nonprintable characters with unicode escapes
	89	UnicodeString& _escape(const UnicodeString &source,
	90	UnicodeString &target) {
	91	for (int32_t i = 0; i < source.length(); ) {
	92	UChar32 ch = source.char32At(i);
	93	i += UTF_CHAR_LENGTH(ch);
	94	if (ch < 0x09 \|\| (ch > 0x0A && ch < 0x20)\|\| ch > 0x7E) {
	95	if (ch <= 0xFFFF) {
	96	target += "\\u";
	97	_appendHex(ch, 4, target);
	98	} else {
	99	target += "\\U";
	100	_appendHex(ch, 8, target);
	101	}
	102	} else {
	103	target += ch;
	104	}
	105	}
	106	return target;
	107	}
	108
	109	inline void _debugOut(const char* msg, TransliterationRule* rule,
	110	const Replaceable& theText, UTransPosition& pos) {
	111	UnicodeString buf(msg, "");
	112	if (rule) {
	113	UnicodeString r;
	114	rule->toRule(r, TRUE);
	115	buf.append((UChar)32).append(r);
	116	}
	117	buf.append(UnicodeString(" => ", ""));
	118	UnicodeString* text = (UnicodeString*)&theText;
	119	_formatInput(buf, *text, pos);
	120	UnicodeString esc;
	121	_escape(buf, esc);
	122	CharString cbuf(esc);
	123	printf("%s\n", (char*) cbuf);
	124	}
	125
	126	#else
	127	#define _debugOut(msg, rule, theText, pos)
	128	#endif
	129
	130	//----------------------------------------------------------------------
	131	// END Debugging support
	132	//----------------------------------------------------------------------
	133
	134	// Fill the precontext and postcontext with the patterns of the rules
	135	// that are masking one another.
	136	static void maskingError(const U_NAMESPACE_QUALIFIER TransliterationRule& rule1,
	137	const U_NAMESPACE_QUALIFIER TransliterationRule& rule2,
	138	UParseError& parseError) {
	139	U_NAMESPACE_QUALIFIER UnicodeString r;
	140	int32_t len;
	141
	142	parseError.line = parseError.offset = -1;
	143
	144	// for pre-context
	145	rule1.toRule(r, FALSE);
	146	len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1);
	147	r.extract(0, len, parseError.preContext);
	148	parseError.preContext[len] = 0;
	149
	150	//for post-context
	151	r.truncate(0);
	152	rule2.toRule(r, FALSE);
	153	len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1);
	154	r.extract(0, len, parseError.postContext);
	155	parseError.postContext[len] = 0;
	156	}
	157
	158	U_NAMESPACE_BEGIN
	159
	160	/**
	161	* Construct a new empty rule set.
	162	*/
	163	TransliterationRuleSet::TransliterationRuleSet(UErrorCode& status) : UMemory() {
	164	ruleVector = new UVector(&_deleteRule, NULL, status);
	165	rules = NULL;
	166	maxContextLength = 0;
	167	if (ruleVector == NULL) {
	168	status = U_MEMORY_ALLOCATION_ERROR;
	169	}
	170	}
	171
	172	/**
	173	* Copy constructor.
	174	*/
	175	TransliterationRuleSet::TransliterationRuleSet(const TransliterationRuleSet& other) :
	176	UMemory(other),
	177	ruleVector(0),
	178	rules(0),
	179	maxContextLength(other.maxContextLength) {
	180
	181	int32_t i, len;
	182	uprv_memcpy(index, other.index, sizeof(index));
	183	UErrorCode status = U_ZERO_ERROR;
	184	ruleVector = new UVector(&_deleteRule, NULL, status);
	185	if (other.ruleVector != 0 && ruleVector != 0 && U_SUCCESS(status)) {
	186	len = other.ruleVector->size();
	187	for (i=0; i<len && U_SUCCESS(status); ++i) {
	188	ruleVector->addElement(new TransliterationRule(
	189	(TransliterationRule)other.ruleVector->elementAt(i)), status);
	190	}
	191	}
	192	if (other.rules != 0) {
	193	UParseError p;
	194	freeze(p, status);
	195	}
	196	}
	197
	198	/**
	199	* Destructor.
	200	*/
	201	TransliterationRuleSet::~TransliterationRuleSet() {
	202	delete ruleVector; // This deletes the contained rules
	203	uprv_free(rules);
	204	}
	205
	206	void TransliterationRuleSet::setData(const TransliterationRuleData* d) {
	207	/**
	208	* We assume that the ruleset has already been frozen.
	209	*/
	210	int32_t len = index[256]; // see freeze()
	211	for (int32_t i=0; i<len; ++i) {
	212	rules[i]->setData(d);
	213	}
	214	}
	215
	216	/**
	217	* Return the maximum context length.
	218	* @return the length of the longest preceding context.
	219	*/
	220	int32_t TransliterationRuleSet::getMaximumContextLength(void) const {
	221	return maxContextLength;
	222	}
	223
	224	/**
	225	* Add a rule to this set. Rules are added in order, and order is
	226	* significant. The last call to this method must be followed by
	227	* a call to <code>freeze()</code> before the rule set is used.
	228	*
	229	* <p>If freeze() has already been called, calling addRule()
	230	* unfreezes the rules, and freeze() must be called again.
	231	*
	232	* @param adoptedRule the rule to add
	233	*/
	234	void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
	235	UErrorCode& status) {
	236	if (U_FAILURE(status)) {
	237	delete adoptedRule;
	238	return;
	239	}
	240	ruleVector->addElement(adoptedRule, status);
	241
	242	int32_t len;
	243	if ((len = adoptedRule->getContextLength()) > maxContextLength) {
	244	maxContextLength = len;
	245	}
	246
	247	uprv_free(rules);
	248	rules = 0;
	249	}
	250
	251	/**
	252	* Check this for masked rules and index it to optimize performance.
	253	* The sequence of operations is: (1) add rules to a set using
	254	* <code>addRule()</code>; (2) freeze the set using
	255	* <code>freeze()</code>; (3) use the rule set. If
	256	* <code>addRule()</code> is called after calling this method, it
	257	* invalidates this object, and this method must be called again.
	258	* That is, <code>freeze()</code> may be called multiple times,
	259	* although for optimal performance it shouldn't be.
	260	*/
	261	void TransliterationRuleSet::freeze(UParseError& parseError,UErrorCode& status) {
	262	/* Construct the rule array and index table. We reorder the
	263	* rules by sorting them into 256 bins. Each bin contains all
	264	* rules matching the index value for that bin. A rule
	265	* matches an index value if string whose first key character
	266	* has a low byte equal to the index value can match the rule.
	267	*
	268	* Each bin contains zero or more rules, in the same order
	269	* they were found originally. However, the total rules in
	270	* the bins may exceed the number in the original vector,
	271	* since rules that have a variable as their first key
	272	* character will generally fall into more than one bin.
	273	*
	274	* That is, each bin contains all rules that either have that
	275	* first index value as their first key character, or have
	276	* a set containing the index value as their first character.
	277	*/
	278	int32_t n = ruleVector->size();
	279	int32_t j;
	280	int16_t x;
	281	UVector v(2*n, status); // heuristic; adjust as needed
	282
	283	if (U_FAILURE(status)) {
	284	return;
	285	}
	286
	287	/* Precompute the index values. This saves a LOT of time.
	288	* Be careful not to call malloc(0).
	289	*/
	290	int16_t* indexValue = (int16_t) uprv_malloc( sizeof(int16_t) (n > 0 ? n : 1) );
	291	/* test for NULL */
	292	if (indexValue == 0) {
	293	status = U_MEMORY_ALLOCATION_ERROR;
	294	return;
	295	}
	296	for (j=0; j<n; ++j) {
	297	TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
	298	indexValue[j] = r->getIndexValue();
	299	}
	300	for (x=0; x<256; ++x) {
	301	index[x] = v.size();
	302	for (j=0; j<n; ++j) {
	303	if (indexValue[j] >= 0) {
	304	if (indexValue[j] == x) {
	305	v.addElement(ruleVector->elementAt(j), status);
	306	}
	307	} else {
	308	// If the indexValue is < 0, then the first key character is
	309	// a set, and we must use the more time-consuming
	310	// matchesIndexValue check. In practice this happens
	311	// rarely, so we seldom tread this code path.
	312	TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
	313	if (r->matchesIndexValue((uint8_t)x)) {
	314	v.addElement(r, status);
	315	}
	316	}
	317	}
	318	}
	319	uprv_free(indexValue);
	320	index[256] = v.size();
	321
	322	/* Freeze things into an array.
	323	*/
	324	uprv_free(rules); // Contains alias pointers
	325
	326	/* You can't do malloc(0)! */
	327	if (v.size() == 0) {
	328	rules = NULL;
	329	return;
	330	}
	331	rules = (TransliterationRule *)uprv_malloc(v.size() sizeof(TransliterationRule *));
	332	/* test for NULL */
	333	if (rules == 0) {
	334	status = U_MEMORY_ALLOCATION_ERROR;
	335	return;
	336	}
	337	for (j=0; j<v.size(); ++j) {
	338	rules[j] = (TransliterationRule*) v.elementAt(j);
	339	}
	340
	341	// TODO Add error reporting that indicates the rules that
	342	// are being masked.
	343	//UnicodeString errors;
	344
	345	/* Check for masking. This is MUCH faster than our old check,
	346	* which was each rule against each following rule, since we
	347	* only have to check for masking within each bin now. It's
	348	* 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
	349	* count, and n2 is the per-bin rule count. But n2<<n1, so
	350	* it's a big win.
	351	*/
	352	for (x=0; x<256; ++x) {
	353	for (j=index[x]; j<index[x+1]-1; ++j) {
	354	TransliterationRule* r1 = rules[j];
	355	for (int32_t k=j+1; k<index[x+1]; ++k) {
	356	TransliterationRule* r2 = rules[k];
	357	if (r1->masks(*r2)) {
	358	//\| if (errors == null) {
	359	//\| errors = new StringBuffer();
	360	//\| } else {
	361	//\| errors.append("\n");
	362	//\| }
	363	//\| errors.append("Rule " + r1 + " masks " + r2);
	364	status = U_RULE_MASK_ERROR;
	365	maskingError(r1, r2, parseError);
	366	return;
	367	}
	368	}
	369	}
	370	}
	371
	372	//if (errors != null) {
	373	// throw new IllegalArgumentException(errors.toString());
	374	//}
	375	}
	376
	377	/**
	378	* Transliterate the given text with the given UTransPosition
	379	* indices. Return TRUE if the transliteration should continue
	380	* or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
	381	* Note that FALSE is only ever returned if isIncremental is TRUE.
	382	* @param text the text to be transliterated
	383	* @param pos the position indices, which will be updated
	384	* @param incremental if TRUE, assume new text may be inserted
	385	* at index.limit, and return FALSE if thre is a partial match.
	386	* @return TRUE unless a U_PARTIAL_MATCH has been obtained,
	387	* indicating that transliteration should stop until more text
	388	* arrives.
	389	*/
	390	UBool TransliterationRuleSet::transliterate(Replaceable& text,
	391	UTransPosition& pos,
	392	UBool incremental) {
	393	int16_t indexByte = (int16_t) (text.char32At(pos.start) & 0xFF);
	394	for (int32_t i=index[indexByte]; i<index[indexByte+1]; ++i) {
	395	UMatchDegree m = rules[i]->matchAndReplace(text, pos, incremental);
	396	switch (m) {
	397	case U_MATCH:
	398	_debugOut("match", rules[i], text, pos);
	399	return TRUE;
	400	case U_PARTIAL_MATCH:
	401	_debugOut("partial match", rules[i], text, pos);
	402	return FALSE;
	403	default: /* Ram: added default to make GCC happy */
	404	break;
	405	}
	406	}
	407	// No match or partial match from any rule
	408	pos.start += UTF_CHAR_LENGTH(text.char32At(pos.start));
	409	_debugOut("no match", NULL, text, pos);
	410	return TRUE;
	411	}
	412
	413	/**
	414	* Create rule strings that represents this rule set.
	415	*/
	416	UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,
	417	UBool escapeUnprintable) const {
	418	int32_t i;
	419	int32_t count = ruleVector->size();
	420	ruleSource.truncate(0);
	421	for (i=0; i<count; ++i) {
	422	if (i != 0) {
	423	ruleSource.append((UChar) 0x000A /\n/);
	424	}
	425	TransliterationRule *r =
	426	(TransliterationRule*) ruleVector->elementAt(i);
	427	r->toRule(ruleSource, escapeUnprintable);
	428	}
	429	return ruleSource;
	430	}
	431
	432	/**
	433	* Return the set of all characters that may be modified
	434	* (getTarget=false) or emitted (getTarget=true) by this set.
	435	*/
	436	UnicodeSet& TransliterationRuleSet::getSourceTargetSet(UnicodeSet& result,
	437	UBool getTarget) const {
	438	result.clear();
	439	int32_t count = ruleVector->size();
	440	for (int32_t i=0; i<count; ++i) {
	441	TransliterationRule* r =
	442	(TransliterationRule*) ruleVector->elementAt(i);
	443	if (getTarget) {
	444	r->addTargetSetTo(result);
	445	} else {
	446	r->addSourceSetTo(result);
	447	}
	448	}
	449	return result;
	450	}
	451
	452	U_NAMESPACE_END
	453
	454	#endif /* #if !UCONFIG_NO_TRANSLITERATION */