git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
	3	**********************************************************************
	4	* Date Name Description
	5	* 11/17/99 aliu Creation.
	6	**********************************************************************
	7	*/
	8	#ifndef RBT_RULE_H
	9	#define RBT_RULE_H
	10
	11	#include "unicode/utypes.h"
	12
	13	#if !UCONFIG_NO_TRANSLITERATION
	14
	15	#include "unicode/uobject.h"
	16	#include "unicode/unistr.h"
	17	#include "unicode/utrans.h"
	18	#include "unicode/unimatch.h"
	19
	20	U_NAMESPACE_BEGIN
	21
	22	class Replaceable;             // only used by reference (matchAndReplace), so a forward declaration suffices
	23	class TransliterationRuleData; // only used through the 'data' alias pointer
	24	class StringMatcher;           // type of the anteContext/key/postContext pointers; also a friend of the rule class
	25	class UnicodeFunctor;          // type of the output pointer and of the segments[] elements
	26
	27	/**
	28	* A transliteration rule used by
	29	* <code>RuleBasedTransliterator</code>.
	30	* <code>TransliterationRule</code> is an immutable object.
	31	*
	32	* <p>A rule consists of an input pattern and an output string. When
	33	* the input pattern is matched, the output string is emitted. The
	34	* input pattern consists of zero or more characters which are matched
	35	* exactly (the key) and optional context. Context must match if it
	36	* is specified. Context may be specified before the key, after the
	37	* key, or both. The key, preceding context, and following context
	38	* may contain variables. Variables represent a set of Unicode
	39	* characters, such as the letters <i>a</i> through <i>z</i>.
	40	* Variables are detected by looking up each character in a supplied
	41	* variable list to see if it has been so defined.
	42	*
	43	* <p>A rule may contain segments in its input string and segment
	44	* references in its output string. A segment is a substring of the
	45	* input pattern, indicated by an offset and limit. The segment may
	46	* be in the preceding or following context. It may not span a
	47	* context boundary. A segment reference is a special character in
	48	* the output string that causes a segment of the input string (not
	49	* the input pattern) to be copied to the output string. The range of
	50	* special characters that represent segment references is defined by
	51	* RuleBasedTransliterator.Data. NOTE(review): presumably this means TransliterationRuleData -- confirm.
	52	*
	53	* @author Alan Liu
	54	*/
	55	class TransliterationRule : public UMemory {
	56
	57	private:
	58
	59	// TODO Eliminate the pattern and keyLength data members. They
	60	// are used only by masks() and getIndexValue() which are called
	61	// only during build time, not during run-time. Perhaps these
	62	// methods and pattern/keyLength can be isolated into a separate
	63	// object.
	64
	65	/**
	66	* The match that must occur before the key, or null if there is no
	67	* preceding context.
	68	*/
	69	StringMatcher *anteContext;
	70
	71	/**
	72	* The matcher object for the key. If null, then the key is empty.
	73	*/
	74	StringMatcher *key;
	75
	76	/**
	77	* The match that must occur after the key, or null if there is no
	78	* following context.
	79	*/
	80	StringMatcher *postContext;
	81
	82	/**
	83	* The object that performs the replacement if the key,
	84	* anteContext, and postContext are matched. Never null.
	85	*/
	86	UnicodeFunctor* output;
	87
	88	/**
	89	* The string that must be matched, consisting of the anteContext, key,
	90	* and postContext, concatenated together, in that order. Some components
	91	* may be empty (zero length).
	92	* @see anteContextLength
	93	* @see keyLength
	94	*/
	95	UnicodeString pattern;
	96
	97	/**
	98	* An array of matcher objects corresponding to the input pattern
	99	* segments. If there are no segments this is null. N.B. This is
	100	* a UnicodeFunctor for generality, but in practice it is always a
	101	* StringMatcher. In the future we may generalize this, but for
	102	* now we sometimes cast down to StringMatcher.
	103	*
	104	* The array is owned, but the pointers within it are not.
	105	*/
	106	UnicodeFunctor** segments;
	107
	108	/**
	109	* The number of elements in segments[] or zero if segments is NULL.
	110	*/
	111	int32_t segmentsCount;
	112
	113	/**
	114	* The length of the string that must match before the key. If
	115	* zero, then there is no matching requirement before the key.
	116	* Substring [0,anteContextLength) of pattern is the anteContext.
	117	*/
	118	int32_t anteContextLength;
	119
	120	/**
	121	* The length of the key. Substring [anteContextLength,
	122	* anteContextLength + keyLength) of pattern is the key.
	123	*
	124	*/
	125	int32_t keyLength;
	126
	127	/**
	128	* Miscellaneous attributes. NOTE(review): presumably a bitwise OR of the ANCHOR_START / ANCHOR_END flag values -- confirm in the implementation.
	129	*/
	130	int8_t flags;
	131
	132	/**
	133	* Flag attributes. The values are distinct single bits (1 and 2).
	134	*/
	135	enum {
	136	ANCHOR_START = 1,
	137	ANCHOR_END = 2
	138	};
	139
	140	/**
	141	* An alias pointer to the data for this rule. The data provides
	142	* lookup services for matchers and segments.
	143	*/
	144	const TransliterationRuleData* data;
	145
	146	public:
	147
	148	/**
	149	* Construct a new rule with the given input, output text, and other
	150	* attributes. A cursor position may be specified for the output text.
	151	* @param input input string, including key and optional ante and
	152	* post context.
	153	* @param anteContextPos offset into input to end of ante context, or -1 if
	154	* none. Must be <= input.length() if not -1.
	155	* @param postContextPos offset into input to start of post context, or -1
	156	* if none. Must be <= input.length() if not -1, and must be >=
	157	* anteContextPos.
	158	* @param outputStr output string.
	159	* @param cursorPosition offset into outputStr at which cursor is located, or -1 if
	160	* none. If less than zero, then the cursor is placed after the
	161	* output text; that is, -1 is equivalent to
	162	* <code>outputStr.length()</code>. If greater than
	163	* <code>outputStr.length()</code> then this is an error (NOTE(review): presumably reported via status, not a thrown exception -- confirm).
	164	* @param cursorOffset an offset to be added to cursorPosition to position the
	165	* cursor either in the ante context, if < 0, or in the post context, if >
	166	* 0. For example, the rule "abc{def} > \| @@@ xyz;" changes "def" to
	167	* "xyz" and moves the cursor to before "a". It would have a cursorOffset
	168	* of -3.
	169	* @param segs array of UnicodeFunctor corresponding to input pattern
	170	* segments, or null if there are none. The array itself is adopted,
	171	* but the pointers within it are not.
	172	* @param segsCount number of elements in segs[].
	173	* @param anchorStart TRUE if the rule is anchored on the left to
	174	* the context start.
	175	* @param anchorEnd TRUE if the rule is anchored on the right to the
	176	* context limit.
	177	* @param data the rule data.
	178	* @param status Output parameter filled in with success or failure status.
	179	*/
	180	TransliterationRule(const UnicodeString& input,
	181	int32_t anteContextPos, int32_t postContextPos,
	182	const UnicodeString& outputStr,
	183	int32_t cursorPosition, int32_t cursorOffset,
	184	UnicodeFunctor** segs,
	185	int32_t segsCount,
	186	UBool anchorStart, UBool anchorEnd,
	187	const TransliterationRuleData* data,
	188	UErrorCode& status);
	189
	190	/**
	191	* Copy constructor. Note that the argument is a non-const reference.
	192	* @param other the object to be copied.
	193	*/
	194	TransliterationRule(TransliterationRule& other);
	195
	196	/**
	197	* Destructor.
	198	*/
	199	virtual ~TransliterationRule();
	200
	201	/**
	202	* Change the data object that this rule belongs to. Used
	203	* internally by the TransliterationRuleData copy constructor.
	204	* @param data the new data value to be set.
	205	*/
	206	void setData(const TransliterationRuleData* data);
	207
	208	/**
	209	* Return the preceding context length. This method is needed to
	210	* support the <code>Transliterator</code> method
	211	* <code>getMaximumContextLength()</code>. Internally, this is
	212	* implemented as the anteContextLength, optionally plus one if
	213	* there is a start anchor. The one character anchor gap is
	214	* needed to make repeated incremental transliteration with
	215	* anchors work.
	216	* @return the preceding context length.
	217	*/
	218	virtual int32_t getContextLength(void) const;
	219
	220	/**
	221	* Internal method. Returns 8-bit index value for this rule.
	222	* This is the low byte of the first character of the key,
	223	* unless the first character of the key is a set. If it's a
	224	* set, or otherwise can match multiple keys, the index value is -1.
	225	* @return 8-bit index value for this rule, or -1 as described above.
	226	*/
	227	int16_t getIndexValue() const;
	228
	229	/**
	230	* Internal method. Returns true if this rule matches the given
	231	* index value. The index value is an 8-bit integer, 0..255,
	232	* representing the low byte of the first character of the key.
	233	* It matches this rule if it matches the first character of the
	234	* key, or if the first character of the key is a set, and the set
	235	* contains any character with a low byte equal to the index
	236	* value. If the rule contains only ante context, as in foo)>bar,
	237	* then it will match any key.
	238	* @param v the given index value.
	239	* @return true if this rule matches the given index value.
	240	*/
	241	UBool matchesIndexValue(uint8_t v) const;
	242
	243	/**
	244	* Return true if this rule masks another rule. If r1 masks r2 then
	245	* r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
	246	* r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
	247	* "[c]a>x" masks "[dc]a>y".
	248	* @param r2 the given rule to be compared with.
	249	* @return true if this rule masks 'r2'
	250	*/
	251	virtual UBool masks(const TransliterationRule& r2) const;
	252
	253	/**
	254	* Attempt a match and replacement at the given position. Return
	255	* the degree of match between this rule and the given text. The
	256	* degree of match may be mismatch, a partial match, or a full
	257	* match. A mismatch means at least one character of the text
	258	* does not match the context or key. A partial match means some
	259	* context and key characters match, but the text is not long
	260	* enough to match all of them. A full match means all context
	261	* and key characters match.
	262	*
	263	* If a full match is obtained, perform a replacement, update pos,
	264	* and return U_MATCH. Otherwise both text and pos are unchanged.
	265	*
	266	* @param text the text
	267	* @param pos the position indices
	268	* @param incremental if TRUE, test for partial matches that may
	269	* be completed by additional text inserted at pos.limit.
	270	* @return one of <code>U_MISMATCH</code>,
	271	* <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If
	272	* incremental is FALSE then U_PARTIAL_MATCH will not be returned.
	273	*/
	274	UMatchDegree matchAndReplace(Replaceable& text,
	275	UTransPosition& pos,
	276	UBool incremental) const;
	277
	278	/**
	279	* Create a rule string that represents this rule object. Append
	280	* it to the given string, pat. NOTE(review): escapeUnprintable and the returned reference are not described here; see the implementation.
	281	*/
	282	virtual UnicodeString& toRule(UnicodeString& pat,
	283	UBool escapeUnprintable) const;
	284
	285	/**
	286	* Union the set of all characters that may be modified by this rule
	287	* into the given set, toUnionTo.
	288	*/
	289	void addSourceSetTo(UnicodeSet& toUnionTo) const;
	290
	291	/**
	292	* Union the set of all characters that may be emitted by this rule
	293	* into the given set, toUnionTo.
	294	*/
	295	void addTargetSetTo(UnicodeSet& toUnionTo) const;
	296
	297	private:
	298
	299	friend class StringMatcher;
	300
	301	TransliterationRule &operator=(const TransliterationRule &other); // forbid assignment: declared private (the copy constructor above remains public)
	302	};
	303
	304	U_NAMESPACE_END
	305
	306	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
	307
	308	#endif