[apple/icu.git] / icuSources / i18n / rbt_rule.h

/*
* Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
#ifndef RBT_RULE_H
#define RBT_RULE_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "unicode/utrans.h"
#include "unicode/unimatch.h"

U_NAMESPACE_BEGIN

class Replaceable;
class TransliterationRuleData;
class StringMatcher;
class UnicodeFunctor;

/**
 * A transliteration rule used by
 * <code>RuleBasedTransliterator</code>.
 * <code>TransliterationRule</code> is an immutable object.
 *
 * <p>A rule consists of an input pattern and an output string.  When
 * the input pattern is matched, the output string is emitted.  The
 * input pattern consists of zero or more characters which are matched
 * exactly (the key) and optional context.  Context must match if it
 * is specified.  Context may be specified before the key, after the
 * key, or both.  The key, preceding context, and following context
 * may contain variables.  Variables represent a set of Unicode
 * characters, such as the letters <i>a</i> through <i>z</i>.
 * Variables are detected by looking up each character in a supplied
 * variable list to see if it has been so defined.
 *
 * <p>A rule may contain segments in its input string and segment
 * references in its output string.  A segment is a substring of the
 * input pattern, indicated by an offset and limit.  The segment may
 * be in the preceding or following context.  It may not span a
 * context boundary.  A segment reference is a special character in
 * the output string that causes a segment of the input string (not
 * the input pattern) to be copied to the output string.  The range of
 * special characters that represent segment references is defined by
 * RuleBasedTransliterator.Data.
 *
 * @author Alan Liu
 */
class TransliterationRule : public UMemory {

private:

    // TODO: Eliminate the pattern and keyLength data members.  They
    // are used only by masks() and getIndexValue(), which are called
    // only during build time, not during run-time.  Perhaps these
    // methods and pattern/keyLength can be isolated into a separate
    // object.

    /**
     * The match that must occur before the key, or null if there is no
     * preceding context.
     */
    StringMatcher *anteContext;

    /**
     * The matcher object for the key.  If null, then the key is empty.
     */
    StringMatcher *key;

    /**
     * The match that must occur after the key, or null if there is no
     * following context.
     */
    StringMatcher *postContext;

    /**
     * The object that performs the replacement if the key,
     * anteContext, and postContext are matched.  Never null.
     */
    UnicodeFunctor* output;

    /**
     * The string that must be matched, consisting of the anteContext, key,
     * and postContext, concatenated together, in that order.  Some components
     * may be empty (zero length).
     * @see anteContextLength
     * @see keyLength
     */
    UnicodeString pattern;

    /**
     * An array of matcher objects corresponding to the input pattern
     * segments.  If there are no segments this is null.  N.B. The
     * elements are declared with the general UnicodeFunctor type, but
     * in practice each one is always a StringMatcher.  In the future
     * we may generalize this, but for now we sometimes cast down to
     * StringMatcher.
     *
     * The array is owned, but the pointers within it are not.
     */
    UnicodeFunctor** segments;

    /**
     * The number of elements in segments[] or zero if segments is NULL.
     */
    int32_t segmentsCount;

    /**
     * The length of the string that must match before the key.  If
     * zero, then there is no matching requirement before the key.
     * Substring [0,anteContextLength) of pattern is the anteContext.
     */
    int32_t anteContextLength;

    /**
     * The length of the key.  Substring [anteContextLength,
     * anteContextLength + keyLength) of pattern is the key.
     */
    int32_t keyLength;

    /**
     * Miscellaneous attributes.  Presumably a bitwise combination of
     * the flag attributes ANCHOR_START and ANCHOR_END declared below
     * (TODO confirm against the implementation).
     */
    int8_t flags;

    /**
     * Flag attributes.  The values are distinct single bits.
     * NOTE(review): the names suggest they correspond to the
     * anchorStart and anchorEnd constructor arguments -- confirm
     * against the implementation.
     */
    enum {
        ANCHOR_START = 1,
        ANCHOR_END   = 2
    };

    /**
     * An alias pointer to the data for this rule.  The data provides
     * lookup services for matchers and segments.  Being an alias, the
     * pointee is not owned by this rule.
     */
    const TransliterationRuleData* data;

public:

    /**
     * Construct a new rule with the given input, output text, and other
     * attributes.  A cursor position may be specified for the output text.
     * @param input          input string, including key and optional ante and
     *                       post context.
     * @param anteContextPos offset into input to end of ante context, or -1 if
     *                       none.  Must be <= input.length() if not -1.
     * @param postContextPos offset into input to start of post context, or -1
     *                       if none.  Must be <= input.length() if not -1, and must be >=
     *                       anteContextPos.
     * @param outputStr      output string.
     * @param cursorPosition offset into output at which cursor is located, or -1 if
     *                       none.  If less than zero, then the cursor is placed after the
     *                       <code>output</code>; that is, -1 is equivalent to
     *                       <code>output.length()</code>.  A value greater than
     *                       <code>output.length()</code> is an error.
     *                       NOTE(review): this text used to say "an exception is
     *                       thrown"; given the <code>status</code> parameter the error is
     *                       presumably reported there instead -- confirm against the
     *                       implementation.
     * @param cursorOffset   an offset to be added to cursorPosition to position the
     *                       cursor either in the ante context, if < 0, or in the post context, if >
     *                       0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
     *                       "xyz" and moves the cursor to before "a".  It would have a cursorOffset
     *                       of -3.
     * @param segs           array of matcher objects (declared as UnicodeFunctor*)
     *                       corresponding to input pattern segments, or null if there are
     *                       none.  The array itself is adopted, but the pointers within it
     *                       are not.
     * @param segsCount      number of elements in segs[].
     * @param anchorStart    TRUE if the rule is anchored on the left to
     *                       the context start.
     * @param anchorEnd      TRUE if the rule is anchored on the right to the
     *                       context limit.
     * @param data           the rule data.
     * @param status         Output parameter filled in with success or failure status.
     */
    TransliterationRule(const UnicodeString& input,
                        int32_t anteContextPos, int32_t postContextPos,
                        const UnicodeString& outputStr,
                        int32_t cursorPosition, int32_t cursorOffset,
                        UnicodeFunctor** segs,
                        int32_t segsCount,
                        UBool anchorStart, UBool anchorEnd,
                        const TransliterationRuleData* data,
                        UErrorCode& status);

    /**
     * Copy constructor.  Note that the parameter is a non-const
     * reference, so const objects and temporaries cannot be passed.
     * @param other    the object to be copied.
     */
    TransliterationRule(TransliterationRule& other);

    /**
     * Destructor.
     */
    virtual ~TransliterationRule();

    /**
     * Change the data object that this rule belongs to.  Used
     * internally by the TransliterationRuleData copy constructor.
     * @param data    the new data value to be set.
     */
    void setData(const TransliterationRuleData* data);

    /**
     * Return the preceding context length.  This method is needed to
     * support the <code>Transliterator</code> method
     * <code>getMaximumContextLength()</code>.  Internally, this is
     * implemented as the anteContextLength, optionally plus one if
     * there is a start anchor.  The one character anchor gap is
     * needed to make repeated incremental transliteration with
     * anchors work.
     * @return    the preceding context length.
     */
    virtual int32_t getContextLength(void) const;

    /**
     * Internal method.  Returns 8-bit index value for this rule.
     * This is the low byte of the first character of the key,
     * unless the first character of the key is a set.  If it's a
     * set, or otherwise can match multiple keys, the index value is -1.
     * The int16_t return type can hold both the byte values 0..255
     * and the -1 sentinel.
     * @return    8-bit index value for this rule, or -1.
     */
    int16_t getIndexValue() const;

    /**
     * Internal method.  Returns true if this rule matches the given
     * index value.  The index value is an 8-bit integer, 0..255,
     * representing the low byte of the first character of the key.
     * It matches this rule if it matches the first character of the
     * key, or if the first character of the key is a set, and the set
     * contains any character with a low byte equal to the index
     * value.  If the rule contains only ante context, as in foo)>bar,
     * then it will match any key.
     * @param v    the given index value.
     * @return     true if this rule matches the given index value.
     */
    UBool matchesIndexValue(uint8_t v) const;

    /**
     * Return true if this rule masks another rule.  If r1 masks r2 then
     * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
     * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
     * "[c]a>x" masks "[dc]a>y".
     * @param r2  the given rule to be compared with.
     * @return    true if this rule masks 'r2'
     */
    virtual UBool masks(const TransliterationRule& r2) const;

    /**
     * Attempt a match and replacement at the given position.  Return
     * the degree of match between this rule and the given text.  The
     * degree of match may be mismatch, a partial match, or a full
     * match.  A mismatch means at least one character of the text
     * does not match the context or key.  A partial match means some
     * context and key characters match, but the text is not long
     * enough to match all of them.  A full match means all context
     * and key characters match.
     *
     * If a full match is obtained, perform a replacement, update pos,
     * and return U_MATCH.  Otherwise both text and pos are unchanged.
     *
     * @param text the text
     * @param pos the position indices
     * @param incremental if TRUE, test for partial matches that may
     * be completed by additional text inserted at pos.limit.
     * @return one of <code>U_MISMATCH</code>,
     * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If
     * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
     */
    UMatchDegree matchAndReplace(Replaceable& text,
                                 UTransPosition& pos,
                                 UBool incremental) const;

    /**
     * Create a rule string that represents this rule object.  Append
     * it to the given string.
     * @param pat               the string to which the rule text is appended.
     * @param escapeUnprintable presumably requests that unprintable
     *                          characters be written in escaped form -- TODO confirm
     *                          against the implementation.
     * @return                  a UnicodeString reference; presumably <code>pat</code>
     *                          itself -- TODO confirm against the implementation.
     */
    virtual UnicodeString& toRule(UnicodeString& pat,
                                  UBool escapeUnprintable) const;

    /**
     * Union the set of all characters that may be modified by this rule
     * into the given set.
     * @param toUnionTo    the set into which the characters are unioned.
     */
    void addSourceSetTo(UnicodeSet& toUnionTo) const;

    /**
     * Union the set of all characters that may be emitted by this rule
     * into the given set.
     * @param toUnionTo    the set into which the characters are unioned.
     */
    void addTargetSetTo(UnicodeSet& toUnionTo) const;

 private:

    // StringMatcher is granted access to the private members of this class.
    friend class StringMatcher;

    /**
     * Assignment operator, declared private so that code outside this
     * class (and its friend) cannot assign one rule to another.  Copy
     * construction remains available through the public copy
     * constructor above.
     */
    TransliterationRule &operator=(const TransliterationRule &other); // forbid assignment of this class
};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif
Commit	Line	Data
b75a7d8f A	1	/*
	2	* Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
	3	**********************************************************************
	4	* Date Name Description
	5	* 11/17/99 aliu Creation.
	6	**********************************************************************
	7	*/
	8	#ifndef RBT_RULE_H
	9	#define RBT_RULE_H
	10
	11	#include "unicode/utypes.h"
	12
	13	#if !UCONFIG_NO_TRANSLITERATION
	14
	15	#include "unicode/uobject.h"
	16	#include "unicode/unistr.h"
	17	#include "unicode/utrans.h"
	18	#include "unicode/unimatch.h"
	19
	20	U_NAMESPACE_BEGIN
	21
	22	class Replaceable;
	23	class TransliterationRuleData;
	24	class StringMatcher;
	25	class UnicodeFunctor;
	26
	27	/**
	28	* A transliteration rule used by
	29	* <code>RuleBasedTransliterator</code>.
	30	* <code>TransliterationRule</code> is an immutable object.
	31	*
	32	* <p>A rule consists of an input pattern and an output string. When
	33	* the input pattern is matched, the output string is emitted. The
	34	* input pattern consists of zero or more characters which are matched
	35	* exactly (the key) and optional context. Context must match if it
	36	* is specified. Context may be specified before the key, after the
	37	* key, or both. The key, preceding context, and following context
	38	* may contain variables. Variables represent a set of Unicode
	39	* characters, such as the letters <i>a</i> through <i>z</i>.
	40	* Variables are detected by looking up each character in a supplied
	41	* variable list to see if it has been so defined.
	42	*
	43	* <p>A rule may contain segments in its input string and segment
	44	* references in its output string. A segment is a substring of the
	45	* input pattern, indicated by an offset and limit. The segment may
	46	* be in the preceding or following context. It may not span a
	47	* context boundary. A segment reference is a special character in
	48	* the output string that causes a segment of the input string (not
	49	* the input pattern) to be copied to the output string. The range of
	50	* special characters that represent segment references is defined by
	51	* RuleBasedTransliterator.Data.
	52	*
	53	* @author Alan Liu
	54	*/
	55	class TransliterationRule : public UMemory {
	56
	57	private:
	58
	59	// TODO Eliminate the pattern and keyLength data members. They
	60	// are used only by masks() and getIndexValue() which are called
	61	// only during build time, not during run-time. Perhaps these
	62	// methods and pattern/keyLength can be isolated into a separate
	63	// object.
	64
65	/**
66	* The match that must occur before the key, or null if there is no
67	* preceding context.
68	*/
69	StringMatcher *anteContext;
70
71	/**
72	* The matcher object for the key. If null, then the key is empty.
73	*/
74	StringMatcher *key;
75
76	/**
77	* The match that must occur after the key, or null if there is no
78	* following context.
79	*/
80	StringMatcher *postContext;
81
82	/**
83	* The object that performs the replacement if the key,
84	* anteContext, and postContext are matched. Never null.
85	*/
86	UnicodeFunctor* output;
87
88	/**
89	* The string that must be matched, consisting of the anteContext, key,
90	* and postContext, concatenated together, in that order. Some components
91	* may be empty (zero length).
92	* @see anteContextLength
93	* @see keyLength
94	*/
95	UnicodeString pattern;
96
97	/**
98	* An array of matcher objects corresponding to the input pattern
99	* segments. If there are no segments this is null. N.B. This is
100	* a UnicodeMatcher for generality, but in practice it is always a
101	* StringMatcher. In the future we may generalize this, but for
102	* now we sometimes cast down to StringMatcher.
103	*
104	* The array is owned, but the pointers within it are not.
105	*/
106	UnicodeFunctor** segments;
107
108	/**
109	* The number of elements in segments[] or zero if segments is NULL.
110	*/
111	int32_t segmentsCount;
112
113	/**
114	* The length of the string that must match before the key. If
115	* zero, then there is no matching requirement before the key.
116	* Substring [0,anteContextLength) of pattern is the anteContext.
117	*/
118	int32_t anteContextLength;
119
120	/**
121	* The length of the key. Substring [anteContextLength,
122	* anteContextLength + keyLength) is the key.
123
124	*/
125	int32_t keyLength;
126
127	/**
128	* Miscellaneous attributes.
129	*/
130	int8_t flags;
131
132	/**
133	* Flag attributes.
134	*/
135	enum {
136	ANCHOR_START = 1,
137	ANCHOR_END = 2
138	};
139
140	/**
141	* An alias pointer to the data for this rule. The data provides
142	* lookup services for matchers and segments.
143	*/
144	const TransliterationRuleData* data;
145
146	public:
147
148	/**
149	* Construct a new rule with the given input, output text, and other
150	* attributes. A cursor position may be specified for the output text.
151	* @param input input string, including key and optional ante and
152	* post context.
153	* @param anteContextPos offset into input to end of ante context, or -1 if
154	* none. Must be <= input.length() if not -1.
155	* @param postContextPos offset into input to start of post context, or -1
156	* if none. Must be <= input.length() if not -1, and must be >=
157	* anteContextPos.
158	* @param outputStr output string.
159	* @param cursorPosition offset into output at which cursor is located, or -1 if
160	* none. If less than zero, then the cursor is placed after the
161	* <code>output</code>; that is, -1 is equivalent to
162	* <code>output.length()</code>. If greater than
163	* <code>output.length()</code> then an exception is thrown.
164	* @param cursorOffset an offset to be added to cursorPos to position the
165	* cursor either in the ante context, if < 0, or in the post context, if >
	166	* 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
167	* "xyz" and moves the cursor to before "a". It would have a cursorOffset
168	* of -3.
169	* @param segs array of UnicodeMatcher corresponding to input pattern
170	* segments, or null if there are none. The array itself is adopted,
171	* but the pointers within it are not.
172	* @param segsCount number of elements in segs[].
173	* @param anchorStart TRUE if the the rule is anchored on the left to
174	* the context start.
175	* @param anchorEnd TRUE if the rule is anchored on the right to the
176	* context limit.
177	* @param data the rule data.
178	* @param status Output parameter filled in with success or failure status.
179	*/
180	TransliterationRule(const UnicodeString& input,
181	int32_t anteContextPos, int32_t postContextPos,
182	const UnicodeString& outputStr,
183	int32_t cursorPosition, int32_t cursorOffset,
184	UnicodeFunctor** segs,
185	int32_t segsCount,
186	UBool anchorStart, UBool anchorEnd,
187	const TransliterationRuleData* data,
188	UErrorCode& status);
189
190	/**
191	* Copy constructor.
192	* @param other the object to be copied.
193	*/
194	TransliterationRule(TransliterationRule& other);
195
196	/**
197	* Destructor.
198	*/
199	virtual ~TransliterationRule();
200
201	/**
202	* Change the data object that this rule belongs to. Used
203	* internally by the TransliterationRuleData copy constructor.
204	* @param data the new data value to be set.
205	*/
206	void setData(const TransliterationRuleData* data);
207
208	/**
209	* Return the preceding context length. This method is needed to
210	* support the <code>Transliterator</code> method
211	* <code>getMaximumContextLength()</code>. Internally, this is
212	* implemented as the anteContextLength, optionally plus one if
213	* there is a start anchor. The one character anchor gap is
214	* needed to make repeated incremental transliteration with
215	* anchors work.
216	* @return the preceding context length.
217	*/
218	virtual int32_t getContextLength(void) const;
219
220	/**
221	* Internal method. Returns 8-bit index value for this rule.
222	* This is the low byte of the first character of the key,
223	* unless the first character of the key is a set. If it's a
224	* set, or otherwise can match multiple keys, the index value is -1.
225	* @return 8-bit index value for this rule.
226	*/
227	int16_t getIndexValue() const;
228
229	/**
230	* Internal method. Returns true if this rule matches the given
231	* index value. The index value is an 8-bit integer, 0..255,
232	* representing the low byte of the first character of the key.
233	* It matches this rule if it matches the first character of the
234	* key, or if the first character of the key is a set, and the set
235	* contains any character with a low byte equal to the index
236	* value. If the rule contains only ante context, as in foo)>bar,
237	* then it will match any key.
238	* @param v the given index value.
239	* @return true if this rule matches the given index value.
240	*/
241	UBool matchesIndexValue(uint8_t v) const;
242
243	/**
244	* Return true if this rule masks another rule. If r1 masks r2 then
245	* r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
246	* r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
247	* "[c]a>x" masks "[dc]a>y".
248	* @param r2 the given rule to be compared with.
249	* @return true if this rule masks 'r2'
250	*/
251	virtual UBool masks(const TransliterationRule& r2) const;
252
253	/**
254	* Attempt a match and replacement at the given position. Return
255	* the degree of match between this rule and the given text. The
256	* degree of match may be mismatch, a partial match, or a full
257	* match. A mismatch means at least one character of the text
258	* does not match the context or key. A partial match means some
259	* context and key characters match, but the text is not long
260	* enough to match all of them. A full match means all context
261	* and key characters match.
262	*
263	* If a full match is obtained, perform a replacement, update pos,
264	* and return U_MATCH. Otherwise both text and pos are unchanged.
265	*
266	* @param text the text
267	* @param pos the position indices
268	* @param incremental if TRUE, test for partial matches that may
269	* be completed by additional text inserted at pos.limit.
270	* @return one of <code>U_MISMATCH</code>,
271	* <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If
272	* incremental is FALSE then U_PARTIAL_MATCH will not be returned.
273	*/
274	UMatchDegree matchAndReplace(Replaceable& text,
275	UTransPosition& pos,
276	UBool incremental) const;
277
278	/**
279	* Create a rule string that represents this rule object. Append
280	* it to the given string.
281	*/
282	virtual UnicodeString& toRule(UnicodeString& pat,
283	UBool escapeUnprintable) const;
284
285	/**
286	* Union the set of all characters that may be modified by this rule
287	* into the given set.
288	*/
289	void addSourceSetTo(UnicodeSet& toUnionTo) const;
290
291	/**
292	* Union the set of all characters that may be emitted by this rule
293	* into the given set.
294	*/
295	void addTargetSetTo(UnicodeSet& toUnionTo) const;
296
297	private:
298
299	friend class StringMatcher;
300
301	TransliterationRule &operator=(const TransliterationRule &other); // forbid copying of this class
302	};
303
304	U_NAMESPACE_END
305
306	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
307
308	#endif