]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/rbt_pars.h
ICU-400.39.tar.gz
[apple/icu.git] / icuSources / i18n / rbt_pars.h
1 /*
2 **********************************************************************
3 * Copyright (C) 1999-2007, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 11/17/99 aliu Creation.
8 **********************************************************************
9 */
10 #ifndef RBT_PARS_H
11 #define RBT_PARS_H
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_TRANSLITERATION
16 #ifdef XP_CPLUSPLUS
17
18 #include "unicode/uobject.h"
19 #include "unicode/parseerr.h"
20 #include "unicode/unorm.h"
21 #include "rbt.h"
22 #include "hash.h"
23 #include "uvector.h"
24
25 U_NAMESPACE_BEGIN
26
27 class TransliterationRuleData;
28 class UnicodeFunctor;
29 class ParseData;
30 class RuleHalf;
31 class ParsePosition;
32 class StringMatcher;
33
34 class TransliteratorParser : public UMemory {
35 // Parses transliterator rule strings; after parse() returns, results are read from the public data members below.
36 public:
37
38 /**
39 * PUBLIC data member.
40 * A Vector of TransliterationRuleData objects, one for each discrete group of rules in the rule set.
41 */
42 UVector dataVector;
43
44 /**
45 * PUBLIC data member.
46 * A Vector of UnicodeStrings containing all of the ID blocks in the rule set.
47 */
48 UVector idBlockVector;
49
50 /**
51 * PUBLIC data member containing the parsed compound filter, if any (see orphanCompoundFilter()).
52 */
53 UnicodeSet* compoundFilter;
54
55 private:
56
57 /**
58 * The current data object for which we are parsing rules.
59 */
60 TransliterationRuleData* curData;
61
62 UTransDirection direction;
63
64 /**
65 * Parse error information.
66 */
67 UParseError parseError;
68
69 /**
70 * Temporary symbol table used during parsing.
71 */
72 ParseData* parseData;
73
74 /**
75 * Temporary vector of matcher variables. When parsing is complete, this
76 * is copied into the array data.variables. As with data.variables,
77 * element 0 corresponds to character data.variablesBase.
78 */
79 UVector variablesVector;
80
81 /**
82 * Temporary table of variable names. When parsing is complete, this is
83 * copied into data.variableNames.
84 */
85 Hashtable variableNames;
86
87 /**
88 * String of standins for segments. Used during the parsing of a single
89 * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
90 * to StringMatcher object segmentObjects.elementAt(0), etc.
91 */
92 UnicodeString segmentStandins;
93
94 /**
95 * Vector of StringMatcher objects for segments. Used during the
96 * parsing of a single rule.
97 * segmentStandins.charAt(0) is the standin for "$1" and corresponds
98 * to StringMatcher object segmentObjects.elementAt(0), etc.
99 */
100 UVector segmentObjects;
101
102 /**
103 * The next available stand-in for variables. This starts at some point in
104 * the private use area (discovered dynamically) and increments up toward
105 * <code>variableLimit</code>. At any point during parsing, available
106 * variables are <code>variableNext..variableLimit-1</code>.
107 */
108 UChar variableNext;
109
110 /**
111 * The last available stand-in for variables. This is discovered
112 * dynamically. At any point during parsing, available variables are
113 * <code>variableNext..variableLimit-1</code>.
114 */
115 UChar variableLimit;
116
117 /**
118 * When we encounter an undefined variable, we do not immediately signal
119 * an error, in case we are defining this variable, e.g., "$a = [a-z];".
120 * Instead, we save the name of the undefined variable, and substitute
121 * in the placeholder char variableLimit - 1, and decrement
122 * variableLimit.
123 */
124 UnicodeString undefinedVariableName;
125
126 /**
127 * The stand-in character for the 'dot' set, represented by '.' in
128 * patterns. This is allocated the first time it is needed, and
129 * reused thereafter.
130 */
131 UChar dotStandIn;
132
133 public:
134
135 /**
136 * Constructor. @param statusReturn UErrorCode reference; NOTE(review): presumably reports construction failure -- confirm.
137 */
138 TransliteratorParser(UErrorCode &statusReturn);
139
140 /**
141 * Destructor.
142 */
143 ~TransliteratorParser();
144
145 /**
146 * Parse the given string as a sequence of rules, separated by newline
147 * characters ('\n'), and cause this object to implement those rules. Any
148 * previous rules are discarded. Typically this method is called exactly
149 * once after construction.
150 * NOTE(review): the separator is given as '\n' above but as ';' under @param rules -- confirm which is accurate.
151 * Parse the given rules, in the given direction. After this call
152 * returns, query the public data members for results. The caller
153 * owns the 'data' and 'compoundFilter' data members after this
154 * call returns. NOTE(review): no member named 'data' is declared in this class; presumably dataVector is meant -- confirm.
155 * @param rules rules, separated by ';'
156 * @param direction either FORWARD or REVERSE.
157 * @param pe Struct to receive information on position
158 * of error if an error is encountered
159 * @param ec Output param set to success/failure code.
160 */
161 void parse(const UnicodeString& rules,
162 UTransDirection direction,
163 UParseError& pe,
164 UErrorCode& ec);
165
166 /**
167 * Return the compound filter parsed by parse(). Caller owns result.
168 * @return the compound filter parsed by parse().
169 */
170 UnicodeSet* orphanCompoundFilter();
171
172 private:
173
174 /**
175 * Parse the given rules in the given direction. NOTE(review): summary inferred from the signature; the earlier text described returning source rules, which this void method cannot do -- confirm.
176 * @param rules the rules to process (passed by const reference, so input only).
177 * @param direction either FORWARD or REVERSE.
178 */
179 void parseRules(const UnicodeString& rules,
180 UTransDirection direction,
181 UErrorCode& status);
182
183 /**
184 * MAIN PARSER. Parse the next rule in the given rule string, starting
185 * at pos. Return the index after the last character parsed. Do not
186 * parse characters at or after limit.
187 *
188 * Important: The character at pos must be a non-whitespace character
189 * that is not the comment character.
190 *
191 * This method handles quoting, escaping, and whitespace removal. It
192 * parses the end-of-rule character. It recognizes context and cursor
193 * indicators. Once it does a lexical breakdown of the rule at pos, it
194 * creates a rule object and adds it to our rule list.
195 * @param rule the rule string being parsed (passed by const reference, so input only).
196 * @param pos the starting position.
197 * @param limit index just past the last character of the rule.
198 * @return the index after the last character parsed.
199 */
200 int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
201
202 /**
203 * Set the variable range to [start, end] (inclusive).
204 * @param start the start value of the range.
205 * @param end the end value of the range.
206 */
207 void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
208
209 /**
210 * Assert that the given character is NOT within the variable range.
211 * If it is, return FALSE. This is necessary to ensure that the
212 * variable range does not overlap characters used in a rule.
213 * @param ch the given character.
214 * @return True, if the given character is NOT within the variable range.
215 */
216 UBool checkVariableRange(UChar32 ch) const;
217
218 /**
219 * Set the maximum backup to 'backup', in response to a pragma
220 * statement.
221 * @param backup the new value to be set.
222 */
223 void pragmaMaximumBackup(int32_t backup);
224
225 /**
226 * Begin normalizing all rules using the given mode, in response
227 * to a pragma statement.
228 * @param mode the given mode.
229 */
230 void pragmaNormalizeRules(UNormalizationMode mode);
231
232 /**
233 * Return true if the given rule looks like a pragma.
234 * @param rule the rule string to examine.
235 * @param pos offset to the first non-whitespace character of the rule.
236 * @param limit index just past the last character of the rule.
237 * @return true if the given rule looks like a pragma.
238 */
239 static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
240
241 /**
242 * Parse a pragma. This method assumes resemblesPragma() has
243 * already returned true.
244 * @param rule the rule string containing the pragma.
245 * @param pos offset to the first non-whitespace character of the rule.
246 * @param limit index just past the last character of the rule.
247 * @return the position index after the final ';' of the pragma,
248 * or -1 on failure.
249 */
250 int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
251
252 /**
253 * Called by main parser upon syntax error. Search the rule string
254 * for the probable end of the rule. Of course, if the error is that
255 * the end of rule marker is missing, then the rule end will not be found.
256 * In any case the rule start will be correctly reported.
257 * @param parseErrorCode error code.
258 * @param msg error description (this parameter is left unnamed in the declaration below).
259 * @param start position of first character of current rule.
260 * @return start position of first character of current rule.
261 */
262 int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
263 UErrorCode& status);
264
265 /**
266 * Parse a UnicodeSet out, store it, and return the stand-in character
267 * used to represent it.
268 *
269 * @param rule the rule for UnicodeSet.
270 * @param pos the position in pattern at which to start parsing.
271 * @return the stand-in character used to represent it.
272 */
273 UChar parseSet(const UnicodeString& rule,
274 ParsePosition& pos,
275 UErrorCode& status);
276
277 /**
278 * Generate and return a stand-in for a new UnicodeFunctor. Store
279 * the matcher (adopt it).
280 * @param adopted the UnicodeFunctor to be adopted.
281 * @return a stand-in for a new UnicodeFunctor.
282 */
283 UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
284
285 /**
286 * Return the standin for segment seg (1-based).
287 * @param seg the given segment.
288 * @return the standIn character for the given segment.
289 */
290 UChar getSegmentStandin(int32_t seg, UErrorCode& status);
291
292 /**
293 * Set the object for segment seg (1-based).
294 * @param seg the given segment.
295 * @param adopted the StringMatcher to be adopted.
296 */
297 void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
298
299 /**
300 * Return the stand-in for the dot set. It is allocated the first
301 * time and reused thereafter.
302 * @return the stand-in for the dot set.
303 */
304 UChar getDotStandIn(UErrorCode& status);
305
306 /**
307 * Append the value of the given variable name to the given
308 * UnicodeString.
309 * @param name the variable name to be appended.
310 * @param buf the given UnicodeString to append to.
311 */
312 void appendVariableDef(const UnicodeString& name,
313 UnicodeString& buf,
314 UErrorCode& status);
315
316 /**
317 * Glue method to get around access restrictions in C++. (The declaration that follows is commented out.)
318 */
319 /*static Transliterator* createBasicInstance(const UnicodeString& id,
320 const UnicodeString* canonID);*/
321
322 friend class RuleHalf;
323
324 // Disallowed methods; no impl.
325 /**
326 * Copy constructor (declared in the private section; per the note above, not implemented).
327 */
328 TransliteratorParser(const TransliteratorParser&);
329
330 /**
331 * Assignment operator (declared in the private section; per the note above, not implemented).
332 */
333 TransliteratorParser& operator=(const TransliteratorParser&);
334 };
335
336 U_NAMESPACE_END
337
338 #endif /* #ifdef XP_CPLUSPLUS */
339
340 /**
341 * Strip/convert the following from the transliterator rules:
342 * comments
343 * newlines
344 * white space at the beginning and end of a line
345 * unescape \u notation
346 * NOTE(review): the meaning of the int32_t return value is not documented here -- presumably the length written to target; confirm.
347 * The target must be the same size as the source; no separate target capacity is passed to this function.
348 * @internal
349 */
350 U_CAPI int32_t
351 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
352
353 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
354
355 #endif