]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/rbt_pars.h
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / i18n / rbt_pars.h
1 /*
2 * Copyright (C) {1999-2003}, International Business Machines Corporation and others. All Rights Reserved.
3 **********************************************************************
4 * Date Name Description
5 * 11/17/99 aliu Creation.
6 **********************************************************************
7 */
8 #ifndef RBT_PARS_H
9 #define RBT_PARS_H
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "unicode/uobject.h"
16 #include "unicode/parseerr.h"
17 #include "unicode/unorm.h"
18 #include "rbt.h"
19
20 U_NAMESPACE_BEGIN
21
22 class TransliterationRuleData;
23 class UnicodeFunctor;
24 class ParseData;
25 class RuleHalf;
26 class ParsePosition;
27 class UVector;
28 class StringMatcher;
29
30 class TransliteratorParser : public UMemory {
31 // Parser for transliterator rule strings: call parse(), then read the results from the public data members below.
32 public:
33
34 /**
35 * PUBLIC data member containing the parsed data object, or null if
36 * there were no rules.
37 */
38 TransliterationRuleData* data;
39
40 /**
41 * PUBLIC data member.
42 * The block of ::IDs, both at the top and at the bottom.
43 * Inserted into these may be additional rules at the
44 * idSplitPoint.
45 */
46 UnicodeString idBlock;
47
48 /**
49 * PUBLIC data member.
50 * In a compound RBT, the index at which the RBT rules are
51 * inserted into the ID block. Index 0 means before any IDs
52 * in the block. Index idBlock.length() means after all IDs
53 * in the block. Index is a string index.
54 */
55 int32_t idSplitPoint;
56
57 /**
58 * PUBLIC data member containing the parsed compound filter, if any.
59 */
60 UnicodeSet* compoundFilter;
61
62 private:
63
64 // The number of rules parsed. This tells us if there were
65 // any actual transliterator rules, or if there were just ::ID
66 // block IDs.
67 int32_t ruleCount;
68 // NOTE(review): presumably the direction most recently passed to parse() -- confirm in the implementation.
69 UTransDirection direction;
70
71 /**
72 * We use a single error code during parsing. Rather than pass it
73 * through each API, we keep it here.
74 */
75 UErrorCode status;
76
77 /**
78 * Parse error information.
79 */
80 UParseError parseError;
81
82 /**
83 * Temporary symbol table used during parsing.
84 */
85 ParseData* parseData;
86
87 /**
88 * Temporary vector of matcher variables. When parsing is complete, this
89 * is copied into the array data.variables. As with data.variables,
90 * element 0 corresponds to character data.variablesBase.
91 */
92 UVector* variablesVector;
93
94 /**
95 * String of standins for segments. Used during the parsing of a single
96 * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
97 * to StringMatcher object segmentObjects.elementAt(0), etc.
98 */
99 UnicodeString segmentStandins;
100
101 /**
102 * Vector of StringMatcher objects for segments. Used during the
103 * parsing of a single rule.
104 * segmentStandins.charAt(0) is the standin for "$1" and corresponds
105 * to StringMatcher object segmentObjects.elementAt(0), etc.
106 */
107 UVector* segmentObjects;
108
109 /**
110 * The next available stand-in for variables. This starts at some point in
111 * the private use area (discovered dynamically) and increments up toward
112 * <code>variableLimit</code>. At any point during parsing, available
113 * variables are <code>variableNext..variableLimit-1</code>.
114 */
115 UChar variableNext;
116
117 /**
118 * The exclusive upper limit of the available stand-ins for variables
119 * (discovered dynamically). At any point during parsing, available variables are
120 * <code>variableNext..variableLimit-1</code>.
121 */
122 UChar variableLimit;
123
124 /**
125 * When we encounter an undefined variable, we do not immediately signal
126 * an error, in case we are defining this variable, e.g., "$a = [a-z];".
127 * Instead, we save the name of the undefined variable, and substitute
128 * in the placeholder char variableLimit - 1, and decrement
129 * variableLimit.
130 */
131 UnicodeString undefinedVariableName;
132
133 /**
134 * The stand-in character for the 'dot' set, represented by '.' in
135 * patterns. This is allocated the first time it is needed, and
136 * reused thereafter.
137 */
138 UChar dotStandIn;
139
140 public:
141
142 /**
143 * Constructor.
144 */
145 TransliteratorParser();
146
147 /**
148 * Destructor.
149 */
150 ~TransliteratorParser();
151
152 /**
153 * Parse the given string as a sequence of rules, separated by newline
154 * characters ('\n'), and cause this object to implement those rules. Any
155 * previous rules are discarded. Typically this method is called exactly
156 * once after construction.
157 * NOTE(review): the text above says newline-separated, but @param rules below says ';' -- confirm which holds.
158 * Parse the given rules, in the given direction. After this call
159 * returns, query the public data members for results. The caller
160 * owns the 'data' and 'compoundFilter' data members after this
161 * call returns.
162 * @param rules rules, separated by ';'
163 * @param direction either FORWARD or REVERSE.
164 * @param pe Struct to receive information on position
165 * of error if an error is encountered
166 * @param ec Output param set to success/failure code.
167 */
168 void parse(const UnicodeString& rules,
169 UTransDirection direction,
170 UParseError& pe,
171 UErrorCode& ec);
172
173 /**
174 * Return the compound filter parsed by parse(). Caller owns result.
175 * @return the compound filter parsed by parse().
176 */
177 UnicodeSet* orphanCompoundFilter();
178
179 /**
180 * Return the data object parsed by parse(). Caller owns result.
181 * @return the data object parsed by parse().
182 */
183 TransliterationRuleData* orphanData();
184
185 private:
186
187 /**
188 * Parse the given rules in the given direction. Returns nothing; NOTE(review): presumably results are stored in this object's members -- confirm.
189 * @param rules the rules to parse (input only; taken by const reference).
190 * @param direction either FORWARD or REVERSE.
191 */
192 void parseRules(const UnicodeString& rules,
193 UTransDirection direction);
194
195 /**
196 * MAIN PARSER. Parse the next rule in the given rule string, starting
197 * at pos. Return the index after the last character parsed. Do not
198 * parse characters at or after limit.
199 *
200 * Important: The character at pos must be a non-whitespace character
201 * that is not the comment character.
202 *
203 * This method handles quoting, escaping, and whitespace removal. It
204 * parses the end-of-rule character. It recognizes context and cursor
205 * indicators. Once it does a lexical breakdown of the rule at pos, it
206 * creates a rule object and adds it to our rule list.
207 * @param rule the rule string to parse (input only).
208 * @param pos the starting position.
209 * @param limit index past the last character of the rule.
210 * @return the index after the last character parsed.
211 */
212 int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit);
213
214 /**
215 * Set the variable range to [start, end] (inclusive).
216 * @param start the start value of the range.
217 * @param end the end value of the range.
218 */
219 void setVariableRange(int32_t start, int32_t end);
220
221 /**
222 * Assert that the given character is NOT within the variable range.
223 * If it is, return FALSE. This is necessary to ensure that the
224 * variable range does not overlap characters used in a rule.
225 * @param ch the given character.
226 * @return True, if the given character is NOT within the variable range.
227 */
228 UBool checkVariableRange(UChar32 ch) const;
229
230 /**
231 * Set the maximum backup to 'backup', in response to a pragma
232 * statement.
233 * @param backup the new value to be set.
234 */
235 void pragmaMaximumBackup(int32_t backup);
236
237 /**
238 * Begin normalizing all rules using the given mode, in response
239 * to a pragma statement.
240 * @param mode the given mode.
241 */
242 void pragmaNormalizeRules(UNormalizationMode mode);
243
244 /**
245 * Return true if the given rule looks like a pragma.
246 * @param rule the rule string to examine.
247 * @param pos offset to the first non-whitespace character of the rule.
248 * @param limit index past the last character of the rule.
249 * @return true if the given rule looks like a pragma.
250 */
251 static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
252
253 /**
254 * Parse a pragma. This method assumes resemblesPragma() has
255 * already returned true.
256 * @param rule the rule string containing the pragma.
257 * @param pos offset to the first non-whitespace character of the rule.
258 * @param limit index past the last character of the rule.
259 * @return the position index after the final ';' of the pragma,
260 * or -1 on failure.
261 */
262 int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit);
263
264 /**
265 * Called by main parser upon syntax error. Search the rule string
266 * for the probable end of the rule. Of course, if the error is that
267 * the end of rule marker is missing, then the rule end will not be found.
268 * In any case the rule start will be correctly reported.
269 * @param parseErrorCode error code.
270 * @param msg error description (this parameter is left unnamed in the declaration below).
271 * @param start position of first character of current rule.
272 * @return start position of first character of current rule. NOTE(review): wording duplicates @param start -- confirm against the implementation.
273 */
274 int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start);
275
276 /**
277 * Parse a UnicodeSet out, store it, and return the stand-in character
278 * used to represent it.
279 *
280 * @param rule the rule for UnicodeSet.
281 * @param pos the position in pattern at which to start parsing.
282 * @return the stand-in character used to represent it.
283 */
284 UChar parseSet(const UnicodeString& rule,
285 ParsePosition& pos);
286
287 /**
288 * Generate and return a stand-in for a new UnicodeFunctor. Store
289 * the matcher (adopt it).
290 * @param adopted the UnicodeFunctor to be adopted.
291 * @return a stand-in for a new UnicodeFunctor.
292 */
293 UChar generateStandInFor(UnicodeFunctor* adopted);
294
295 /**
296 * Return the standin for segment seg (1-based).
297 * @param seg the given segment.
298 * @return the standIn character for the given segment.
299 */
300 UChar getSegmentStandin(int32_t seg);
301
302 /**
303 * Set the object for segment seg (1-based).
304 * @param seg the given segment.
305 * @param adopted the StringMatcher to be adopted.
306 */
307 void setSegmentObject(int32_t seg, StringMatcher* adopted);
308
309 /**
310 * Return the stand-in for the dot set. It is allocated the first
311 * time and reused thereafter.
312 * @return the stand-in for the dot set.
313 */
314 UChar getDotStandIn();
315
316 /**
317 * Append the value of the given variable name to the given
318 * UnicodeString.
319 * @param name the variable name to be appended.
320 * @param buf the given UnicodeString to append to.
321 */
322 void appendVariableDef(const UnicodeString& name,
323 UnicodeString& buf);
324
325 /**
326 * Glue method to get around access restrictions in C++.
327 */
328 static Transliterator* createBasicInstance(const UnicodeString& id,
329 const UnicodeString* canonID);
330
331 friend class RuleHalf;
332
333 // Disallowed methods; no impl.
334 /**
335 * Copy constructor
336 */
337 TransliteratorParser(const TransliteratorParser&);
338
339 /**
340 * Assignment operator
341 */
342 TransliteratorParser& operator=(const TransliteratorParser&);
343 };
344
345 U_NAMESPACE_END
346
347 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
348
349 #endif