[apple/icu.git] / icuSources / i18n / rbt_pars.h

/*
* Copyright (C) {1999-2003}, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
#ifndef RBT_PARS_H
#define RBT_PARS_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uobject.h"
#include "unicode/parseerr.h"
#include "unicode/unorm.h"
#include "rbt.h"

U_NAMESPACE_BEGIN

// Forward declarations (alphabetical).  Each of these types is used in
// this header only through a pointer, a reference, or a friend
// declaration, so a declaration is sufficient and the full definitions
// need not be included here.
class ParseData;
class ParsePosition;
class RuleHalf;
class StringMatcher;
class TransliterationRuleData;
class UnicodeFunctor;
class UVector;

/**
 * Parser for transliterator rule strings.
 *
 * Usage, as described by the member documentation below: construct a
 * parser, call parse() once, then read the results from the public data
 * members (data, idBlock, idSplitPoint, compoundFilter) or take them
 * with orphanData() / orphanCompoundFilter().
 *
 * Copying is disallowed: the copy constructor and assignment operator
 * are declared private and, per the comment on them, have no
 * implementation.
 */
class TransliteratorParser : public UMemory {

 public:

    /**
     * PUBLIC data member containing the parsed data object, or null if
     * there were no rules.
     */
    TransliterationRuleData* data;

    /**
     * PUBLIC data member.
     * The block of ::IDs, both at the top and at the bottom.
     * Additional rules may be inserted into this block at
     * idSplitPoint.
     */
    UnicodeString idBlock;

    /**
     * PUBLIC data member.
     * In a compound RBT, the index at which the RBT rules are
     * inserted into the ID block.  Index 0 means before any IDs
     * in the block.  Index idBlock.length() means after all IDs
     * in the block.  Index is a string index.
     */
    int32_t idSplitPoint;

    /**
     * PUBLIC data member containing the parsed compound filter, if any.
     */
    UnicodeSet* compoundFilter;

 private:

    // The number of rules parsed.  This tells us if there were
    // any actual transliterator rules, or if there were just ::ID
    // block IDs.
    int32_t ruleCount;

    // Direction of the rules being parsed.
    // NOTE(review): presumably set from the 'direction' argument of
    // parse()/parseRules() -- confirm against the implementation.
    UTransDirection direction;

    /**
     * We use a single error code during parsing.  Rather than pass it
     * through each API, we keep it here.
     */
    UErrorCode status;

    /**
     * Parse error information.
     */
    UParseError parseError;

    /**
     * Temporary symbol table used during parsing.
     */
    ParseData* parseData;

    /**
     * Temporary vector of matcher variables.  When parsing is complete, this
     * is copied into the array data.variables.  As with data.variables,
     * element 0 corresponds to character data.variablesBase.
     */
    UVector* variablesVector;

    /**
     * String of standins for segments.  Used during the parsing of a single
     * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
     * to StringMatcher object segmentObjects.elementAt(0), etc.
     */
    UnicodeString segmentStandins;

    /**
     * Vector of StringMatcher objects for segments.  Used during the
     * parsing of a single rule.
     * segmentObjects.elementAt(0) is the StringMatcher for "$1" and
     * corresponds to the standin segmentStandins.charAt(0), etc.
     */
    UVector* segmentObjects;

    /**
     * The next available stand-in for variables.  This starts at some point in
     * the private use area (discovered dynamically) and increments up toward
     * <code>variableLimit</code>.  At any point during parsing, available
     * variables are <code>variableNext..variableLimit-1</code>.
     */
    UChar variableNext;

    /**
     * The exclusive upper limit of the available stand-ins for variables
     * (one past the last available stand-in).  This is discovered
     * dynamically.  At any point during parsing, available variables are
     * <code>variableNext..variableLimit-1</code>.
     */
    UChar variableLimit;

    /**
     * When we encounter an undefined variable, we do not immediately signal
     * an error, in case we are defining this variable, e.g., "$a = [a-z];".
     * Instead, we save the name of the undefined variable, and substitute
     * in the placeholder char variableLimit - 1, and decrement
     * variableLimit.
     */
    UnicodeString undefinedVariableName;

    /**
     * The stand-in character for the 'dot' set, represented by '.' in
     * patterns.  This is allocated the first time it is needed, and
     * reused thereafter.
     */
    UChar dotStandIn;

public:

    /**
     * Constructor.
     */
    TransliteratorParser();

    /**
     * Destructor.
     */
    ~TransliteratorParser();

    /**
     * Parse the given rules, in the given direction.  Any previously
     * parsed rules are discarded.  Typically this method is called
     * exactly once after construction.
     *
     * After this call returns, query the public data members for
     * results.  The caller owns the 'data' and 'compoundFilter' data
     * members after this call returns.
     *
     * NOTE(review): an earlier version of this comment described the
     * rules as separated by newline characters ('\n'), while the
     * parameter description below and parsePragma() refer to ';'.
     * Confirm the accepted separators against the implementation.
     *
     * @param rules      rules, separated by ';'
     * @param direction  either FORWARD or REVERSE.
     * @param pe         Struct to receive information on position
     *                   of error if an error is encountered
     * @param ec         Output param set to success/failure code.
     */
    void parse(const UnicodeString& rules,
               UTransDirection direction,
               UParseError& pe,
               UErrorCode& ec);

    /**
     * Return the compound filter parsed by parse().  Caller owns result.
     * @return the compound filter parsed by parse().
     */
    UnicodeSet* orphanCompoundFilter();

    /**
     * Return the data object parsed by parse().  Caller owns result.
     * @return the data object parsed by parse().
     */
    TransliterationRuleData* orphanData();

private:

    /**
     * Parse the given rule string in the given direction.
     * This declaration takes the rules by const reference and returns
     * void, so 'rules' is an input, not an output.
     * NOTE(review): presumably the internal worker behind parse(), which
     * reports through the 'status' and 'parseError' members -- confirm
     * against the implementation.
     * @param rules      the rules to parse.
     * @param direction  either FORWARD or REVERSE.
     */
    void parseRules(const UnicodeString& rules,
                    UTransDirection direction);

    /**
     * MAIN PARSER.  Parse the next rule in the given rule string, starting
     * at pos.  Return the index after the last character parsed.  Do not
     * parse characters at or after limit.
     *
     * Important:  The character at pos must be a non-whitespace character
     * that is not the comment character.
     *
     * This method handles quoting, escaping, and whitespace removal.  It
     * parses the end-of-rule character.  It recognizes context and cursor
     * indicators.  Once it does a lexical breakdown of the rule at pos, it
     * creates a rule object and adds it to our rule list.
     * @param rule       the rule string to parse from (input only).
     * @param pos        the starting position.
     * @param limit      pointer past the last character of the rule.
     * @return           the index after the last character parsed.
     */
    int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit);

    /**
     * Set the variable range to [start, end] (inclusive).
     * @param start    the start value of the range.
     * @param end      the end value of the range.
     */
    void setVariableRange(int32_t start, int32_t end);

    /**
     * Assert that the given character is NOT within the variable range.
     * If it is, return FALSE.  This is necessary to ensure that the
     * variable range does not overlap characters used in a rule.
     * @param ch     the given character.
     * @return       True, if the given character is NOT within the variable range.
     */
    UBool checkVariableRange(UChar32 ch) const;

    /**
     * Set the maximum backup to 'backup', in response to a pragma
     * statement.
     * @param backup    the new value to be set.
     */
    void pragmaMaximumBackup(int32_t backup);

    /**
     * Begin normalizing all rules using the given mode, in response
     * to a pragma statement.
     * @param mode    the given mode.
     */
    void pragmaNormalizeRules(UNormalizationMode mode);

    /**
     * Return true if the given rule looks like a pragma.
     * @param rule  the rule string to examine.
     * @param pos offset to the first non-whitespace character
     * of the rule.
     * @param limit pointer past the last character of the rule.
     * @return true if the given rule looks like a pragma.
     */
    static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);

    /**
     * Parse a pragma.  This method assumes resemblesPragma() has
     * already returned true.
     * @param rule  the rule string containing the pragma.
     * @param pos offset to the first non-whitespace character
     * of the rule.
     * @param limit pointer past the last character of the rule.
     * @return the position index after the final ';' of the pragma,
     * or -1 on failure.
     */
    int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit);

    /**
     * Called by main parser upon syntax error.  Search the rule string
     * for the probable end of the rule.  Of course, if the error is that
     * the end of rule marker is missing, then the rule end will not be found.
     * In any case the rule start will be correctly reported.
     *
     * The second parameter is unnamed in this declaration; the
     * descriptions below are positional.
     * @param parseErrorCode error code.
     * @param (unnamed, 2nd) a UnicodeString; the original comment calls it
     *        "msg", an error description.
     * @param start position of first character of current rule.
     * @return start position of first character of current rule.
     */
    int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start);

    /**
     * Parse a UnicodeSet out, store it, and return the stand-in character
     * used to represent it.
     *
     * @param rule    the rule for UnicodeSet.
     * @param pos     the position in pattern at which to start parsing.
     * @return        the stand-in character used to represent it.
     */
    UChar parseSet(const UnicodeString& rule,
                   ParsePosition& pos);

    /**
     * Generate and return a stand-in for a new UnicodeFunctor.  Store
     * the matcher (adopt it).
     * @param adopted the UnicodeFunctor to be adopted.
     * @return        a stand-in for a new UnicodeFunctor.
     */
    UChar generateStandInFor(UnicodeFunctor* adopted);

    /**
     * Return the standin for segment seg (1-based).
     * @param seg    the given segment.
     * @return       the standIn character for the given segment.
     */
    UChar getSegmentStandin(int32_t seg);

    /**
     * Set the object for segment seg (1-based).
     * @param seg      the given segment.
     * @param adopted  the StringMatcher to be adopted.
     */
    void setSegmentObject(int32_t seg, StringMatcher* adopted);

    /**
     * Return the stand-in for the dot set.  It is allocated the first
     * time and reused thereafter.
     * @return    the stand-in for the dot set.
     */
    UChar getDotStandIn();

    /**
     * Append the value of the given variable name to the given
     * UnicodeString.
     * @param name    the name of the variable whose value is appended.
     * @param buf     the given UnicodeString to append to.
     */
    void appendVariableDef(const UnicodeString& name,
                           UnicodeString& buf);

    /**
     * Glue method to get around access restrictions in C++.
     * NOTE(review): the parameter meanings below are inferred from their
     * names only; confirm against the implementation.
     * @param id       presumably the ID of the transliterator to create.
     * @param canonID  presumably the canonical form of the ID; passed
     *                 by pointer.
     * @return         a Transliterator pointer; ownership is not stated
     *                 in this header.
     */
    static Transliterator* createBasicInstance(const UnicodeString& id,
                                               const UnicodeString* canonID);

    // RuleHalf is granted access to the private members of this class.
    friend class RuleHalf;

    // Disallowed methods; no impl.
    /**
     * Copy constructor (private and unimplemented: copying is disallowed).
     */
    TransliteratorParser(const TransliteratorParser&);
    
    /**
     * Assignment operator (private and unimplemented: assignment is
     * disallowed).
     */
    TransliteratorParser& operator=(const TransliteratorParser&);
};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif
Commit	Line	Data
b75a7d8f A	1	/*
	2	* Copyright (C) {1999-2003}, International Business Machines Corporation and others. All Rights Reserved.
	3	**********************************************************************
	4	* Date Name Description
	5	* 11/17/99 aliu Creation.
	6	**********************************************************************
	7	*/
	8	#ifndef RBT_PARS_H
	9	#define RBT_PARS_H
	10
	11	#include "unicode/utypes.h"
	12
	13	#if !UCONFIG_NO_TRANSLITERATION
	14
	15	#include "unicode/uobject.h"
	16	#include "unicode/parseerr.h"
	17	#include "unicode/unorm.h"
	18	#include "rbt.h"
	19
	20	U_NAMESPACE_BEGIN
	21
	22	class TransliterationRuleData;
	23	class UnicodeFunctor;
	24	class ParseData;
	25	class RuleHalf;
	26	class ParsePosition;
	27	class UVector;
	28	class StringMatcher;
	29
	30	class TransliteratorParser : public UMemory {
	31
	32	public:
	33
	34	/**
	35	* PUBLIC data member containing the parsed data object, or null if
	36	* there were no rules.
	37	*/
	38	TransliterationRuleData* data;
	39
	40	/**
	41	* PUBLIC data member.
	42	* The block of ::IDs, both at the top and at the bottom.
	43	* Inserted into these may be additional rules at the
	44	* idSplitPoint.
	45	*/
	46	UnicodeString idBlock;
	47
	48	/**
	49	* PUBLIC data member.
	50	* In a compound RBT, the index at which the RBT rules are
	51	* inserted into the ID block. Index 0 means before any IDs
	52	* in the block. Index idBlock.length() means after all IDs
	53	* in the block. Index is a string index.
	54	*/
	55	int32_t idSplitPoint;
	56
	57	/**
	58	* PUBLIC data member containing the parsed compound filter, if any.
	59	*/
	60	UnicodeSet* compoundFilter;
	61
	62	private:
	63
	64	// The number of rules parsed. This tells us if there were
65	// any actual transliterator rules, or if there were just ::ID
66	// block IDs.
67	int32_t ruleCount;
68
69	UTransDirection direction;
70
71	/**
72	* We use a single error code during parsing. Rather than pass it
73	* through each API, we keep it here.
74	*/
75	UErrorCode status;
76
77	/**
78	* Parse error information.
79	*/
80	UParseError parseError;
81
82	/**
83	* Temporary symbol table used during parsing.
84	*/
85	ParseData* parseData;
86
87	/**
88	* Temporary vector of matcher variables. When parsing is complete, this
89	* is copied into the array data.variables. As with data.variables,
90	* element 0 corresponds to character data.variablesBase.
91	*/
92	UVector* variablesVector;
93
94	/**
95	* String of standins for segments. Used during the parsing of a single
96	* rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
97	* to StringMatcher object segmentObjects.elementAt(0), etc.
98	*/
99	UnicodeString segmentStandins;
100
101	/**
102	* Vector of StringMatcher objects for segments. Used during the
103	* parsing of a single rule.
104	* segmentStandins.charAt(0) is the standin for "$1" and corresponds
105	* to StringMatcher object segmentObjects.elementAt(0), etc.
106	*/
107	UVector* segmentObjects;
108
109	/**
110	* The next available stand-in for variables. This starts at some point in
111	* the private use area (discovered dynamically) and increments up toward
112	* <code>variableLimit</code>. At any point during parsing, available
113	* variables are <code>variableNext..variableLimit-1</code>.
114	*/
115	UChar variableNext;
116
117	/**
118	* The last available stand-in for variables. This is discovered
119	* dynamically. At any point during parsing, available variables are
120	* <code>variableNext..variableLimit-1</code>.
121	*/
122	UChar variableLimit;
123
124	/**
125	* When we encounter an undefined variable, we do not immediately signal
126	* an error, in case we are defining this variable, e.g., "$a = [a-z];".
127	* Instead, we save the name of the undefined variable, and substitute
128	* in the placeholder char variableLimit - 1, and decrement
129	* variableLimit.
130	*/
131	UnicodeString undefinedVariableName;
132
133	/**
134	* The stand-in character for the 'dot' set, represented by '.' in
135	* patterns. This is allocated the first time it is needed, and
136	* reused thereafter.
137	*/
138	UChar dotStandIn;
139
140	public:
141
142	/**
143	* Constructor.
144	*/
145	TransliteratorParser();
146
147	/**
148	* Destructor.
149	*/
150	~TransliteratorParser();
151
152	/**
153	* Parse the given string as a sequence of rules, separated by newline
154	* characters ('\n'), and cause this object to implement those rules. Any
155	* previous rules are discarded. Typically this method is called exactly
156	* once after construction.
157	*
158	* Parse the given rules, in the given direction. After this call
159	* returns, query the public data members for results. The caller
160	* owns the 'data' and 'compoundFilter' data members after this
161	* call returns.
162	* @param rules rules, separated by ';'
163	* @param direction either FORWARD or REVERSE.
164	* @param pe Struct to recieve information on position
165	* of error if an error is encountered
166	* @param ec Output param set to success/failure code.
167	*/
168	void parse(const UnicodeString& rules,
169	UTransDirection direction,
170	UParseError& pe,
171	UErrorCode& ec);
172
173	/**
174	* Return the compound filter parsed by parse(). Caller owns result.
175	* @return the compound filter parsed by parse().
176	*/
177	UnicodeSet* orphanCompoundFilter();
178
179	/**
180	* Return the data object parsed by parse(). Caller owns result.
181	* @return the data object parsed by parse().
182	*/
183	TransliterationRuleData* orphanData();
184
185	private:
186
187	/**
188	* Return a representation of this transliterator as source rules.
189	* @param rules Output param to receive the rules.
190	* @param direction either FORWARD or REVERSE.
191	*/
192	void parseRules(const UnicodeString& rules,
193	UTransDirection direction);
194
195	/**
196	* MAIN PARSER. Parse the next rule in the given rule string, starting
197	* at pos. Return the index after the last character parsed. Do not
198	* parse characters at or after limit.
199	*
200	* Important: The character at pos must be a non-whitespace character
201	* that is not the comment character.
202	*
203	* This method handles quoting, escaping, and whitespace removal. It
204	* parses the end-of-rule character. It recognizes context and cursor
205	* indicators. Once it does a lexical breakdown of the rule at pos, it
206	* creates a rule object and adds it to our rule list.
207	* @param rules Output param to receive the rules.
208	* @param pos the starting position.
209	* @param limit pointer past the last character of the rule.
210	* @return the index after the last character parsed.
211	*/
212	int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit);
213
214	/**
215	* Set the variable range to [start, end] (inclusive).
216	* @param start the start value of the range.
217	* @param end the end value of the range.
218	*/
219	void setVariableRange(int32_t start, int32_t end);
220
221	/**
222	* Assert that the given character is NOT within the variable range.
223	* If it is, return FALSE. This is neccesary to ensure that the
224	* variable range does not overlap characters used in a rule.
225	* @param ch the given character.
226	* @return True, if the given character is NOT within the variable range.
227	*/
228	UBool checkVariableRange(UChar32 ch) const;
229
230	/**
231	* Set the maximum backup to 'backup', in response to a pragma
232	* statement.
233	* @param backup the new value to be set.
234	*/
235	void pragmaMaximumBackup(int32_t backup);
236
237	/**
238	* Begin normalizing all rules using the given mode, in response
239	* to a pragma statement.
240	* @param mode the given mode.
241	*/
242	void pragmaNormalizeRules(UNormalizationMode mode);
243
244	/**
245	* Return true if the given rule looks like a pragma.
246	* @param pos offset to the first non-whitespace character
247	* of the rule.
248	* @param limit pointer past the last character of the rule.
249	* @return true if the given rule looks like a pragma.
250	*/
251	static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
252
253	/**
254	* Parse a pragma. This method assumes resemblesPragma() has
255	* already returned true.
256	* @param pos offset to the first non-whitespace character
257	* of the rule.
258	* @param limit pointer past the last character of the rule.
259	* @return the position index after the final ';' of the pragma,
260	* or -1 on failure.
261	*/
262	int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit);
263
264	/**
265	* Called by main parser upon syntax error. Search the rule string
266	* for the probable end of the rule. Of course, if the error is that
267	* the end of rule marker is missing, then the rule end will not be found.
268	* In any case the rule start will be correctly reported.
269	* @param parseErrorCode error code.
270	* @param msg error description.
271	* @param start position of first character of current rule.
272	* @return start position of first character of current rule.
273	*/
274	int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start);
275
276	/**
277	* Parse a UnicodeSet out, store it, and return the stand-in character
278	* used to represent it.
279	*
280	* @param rule the rule for UnicodeSet.
281	* @param pos the position in pattern at which to start parsing.
282	* @return the stand-in character used to represent it.
283	*/
284	UChar parseSet(const UnicodeString& rule,
285	ParsePosition& pos);
286
287	/**
288	* Generate and return a stand-in for a new UnicodeFunctor. Store
289	* the matcher (adopt it).
290	* @param adopted the UnicodeFunctor to be adopted.
291	* @return a stand-in for a new UnicodeFunctor.
292	*/
293	UChar generateStandInFor(UnicodeFunctor* adopted);
294
295	/**
296	* Return the standin for segment seg (1-based).
297	* @param seg the given segment.
298	* @return the standIn character for the given segment.
299	*/
300	UChar getSegmentStandin(int32_t seg);
301
302	/**
303	* Set the object for segment seg (1-based).
304	* @param seg the given segment.
305	* @param adopted the StringMatcher to be adopted.
306	*/
307	void setSegmentObject(int32_t seg, StringMatcher* adopted);
308
309	/**
310	* Return the stand-in for the dot set. It is allocated the first
311	* time and reused thereafter.
312	* @return the stand-in for the dot set.
313	*/
314	UChar getDotStandIn();
315
316	/**
317	* Append the value of the given variable name to the given
318	* UnicodeString.
319	* @param name the variable name to be appended.
320	* @param buf the given UnicodeString to append to.
321	*/
322	void appendVariableDef(const UnicodeString& name,
323	UnicodeString& buf);
324
325	/**
326	* Glue method to get around access restrictions in C++.
327	*/
328	static Transliterator* createBasicInstance(const UnicodeString& id,
329	const UnicodeString* canonID);
330
331	friend class RuleHalf;
332
333	// Disallowed methods; no impl.
334	/**
335	* Copy constructor
336	*/
337	TransliteratorParser(const TransliteratorParser&);
338
339	/**
340	* Assignment operator
341	*/
342	TransliteratorParser& operator=(const TransliteratorParser&);
343	};
344
345	U_NAMESPACE_END
346
347	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
348
349	#endif