]>
Commit | Line | Data |
---|---|---|
b75a7d8f | 1 | /* |
73c04bcf | 2 | ********************************************************************** |
4388f060 | 3 | * Copyright (C) 1999-2011, International Business Machines Corporation |
73c04bcf | 4 | * and others. All Rights Reserved. |
b75a7d8f A |
5 | ********************************************************************** |
6 | * Date Name Description | |
7 | * 11/17/99 aliu Creation. | |
8 | ********************************************************************** | |
9 | */ | |
10 | #ifndef RBT_PARS_H | |
11 | #define RBT_PARS_H | |
12 | ||
13 | #include "unicode/utypes.h" | |
14 | ||
15 | #if !UCONFIG_NO_TRANSLITERATION | |
4388f060 | 16 | #ifdef __cplusplus |
b75a7d8f A |
17 | |
18 | #include "unicode/uobject.h" | |
19 | #include "unicode/parseerr.h" | |
20 | #include "unicode/unorm.h" | |
21 | #include "rbt.h" | |
73c04bcf A |
22 | #include "hash.h" |
23 | #include "uvector.h" | |
b75a7d8f A |
24 | |
25 | U_NAMESPACE_BEGIN | |
26 | ||
27 | class TransliterationRuleData; | |
28 | class UnicodeFunctor; | |
29 | class ParseData; | |
30 | class RuleHalf; | |
31 | class ParsePosition; | |
b75a7d8f A |
32 | class StringMatcher; |
33 | ||
34 | class TransliteratorParser : public UMemory { | |
35 | ||
36 | public: | |
37 | ||
38 | /** | |
73c04bcf A |
39 | * A Vector of TransliterationRuleData objects, one for each discrete group |
40 | * of rules in the rule set | |
b75a7d8f | 41 | */ |
73c04bcf | 42 | UVector dataVector; |
b75a7d8f A |
43 | |
44 | /** | |
45 | * PUBLIC data member. | |
73c04bcf | 46 | * A Vector of UnicodeStrings containing all of the ID blocks in the rule set |
b75a7d8f | 47 | */ |
73c04bcf | 48 | UVector idBlockVector; |
b75a7d8f A |
49 | |
50 | /** | |
51 | * PUBLIC data member containing the parsed compound filter, if any. | |
52 | */ | |
53 | UnicodeSet* compoundFilter; | |
54 | ||
55 | private: | |
56 | ||
b75a7d8f | 57 | /** |
73c04bcf | 58 | * The current data object for which we are parsing rules |
b75a7d8f | 59 | */ |
73c04bcf A |
60 | TransliterationRuleData* curData; |
61 | ||
62 | UTransDirection direction; | |
b75a7d8f A |
63 | |
64 | /** | |
65 | * Parse error information. | |
66 | */ | |
67 | UParseError parseError; | |
68 | ||
69 | /** | |
70 | * Temporary symbol table used during parsing. | |
71 | */ | |
72 | ParseData* parseData; | |
73 | ||
74 | /** | |
75 | * Temporary vector of matcher variables. When parsing is complete, this | |
76 | * is copied into the array data.variables. As with data.variables, | |
77 | * element 0 corresponds to character data.variablesBase. | |
78 | */ | |
73c04bcf | 79 | UVector variablesVector; |
b75a7d8f | 80 | |
73c04bcf A |
81 | /** |
82 | * Temporary table of variable names. When parsing is complete, this is | |
83 | * copied into data.variableNames. | |
84 | */ | |
85 | Hashtable variableNames; | |
86 | ||
b75a7d8f A |
87 | /** |
88 | * String of standins for segments. Used during the parsing of a single | |
89 | * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds | |
90 | * to StringMatcher object segmentObjects.elementAt(0), etc. | |
91 | */ | |
92 | UnicodeString segmentStandins; | |
93 | ||
94 | /** | |
95 | * Vector of StringMatcher objects for segments. Used during the | |
96 | * parsing of a single rule. | |
97 | * segmentStandins.charAt(0) is the standin for "$1" and corresponds | |
98 | * to StringMatcher object segmentObjects.elementAt(0), etc. | |
99 | */ | |
73c04bcf | 100 | UVector segmentObjects; |
b75a7d8f A |
101 | |
102 | /** | |
103 | * The next available stand-in for variables. This starts at some point in | |
104 | * the private use area (discovered dynamically) and increments up toward | |
105 | * <code>variableLimit</code>. At any point during parsing, available | |
106 | * variables are <code>variableNext..variableLimit-1</code>. | |
107 | */ | |
108 | UChar variableNext; | |
109 | ||
110 | /** | |
111 | * The exclusive upper limit of the stand-ins for variables. This is discovered | |
112 | * dynamically. At any point during parsing, available variables are | |
113 | * <code>variableNext..variableLimit-1</code>. | |
114 | */ | |
115 | UChar variableLimit; | |
116 | ||
117 | /** | |
118 | * When we encounter an undefined variable, we do not immediately signal | |
119 | * an error, in case we are defining this variable, e.g., "$a = [a-z];". | |
120 | * Instead, we save the name of the undefined variable, and substitute | |
121 | * in the placeholder char variableLimit - 1, and decrement | |
122 | * variableLimit. | |
123 | */ | |
124 | UnicodeString undefinedVariableName; | |
125 | ||
126 | /** | |
127 | * The stand-in character for the 'dot' set, represented by '.' in | |
128 | * patterns. This is allocated the first time it is needed, and | |
129 | * reused thereafter. | |
130 | */ | |
131 | UChar dotStandIn; | |
132 | ||
133 | public: | |
134 | ||
135 | /** | |
136 | * Constructor. | |
137 | */ | |
73c04bcf | 138 | TransliteratorParser(UErrorCode &statusReturn); |
b75a7d8f A |
139 | |
140 | /** | |
141 | * Destructor. | |
142 | */ | |
143 | ~TransliteratorParser(); | |
144 | ||
145 | /** | |
146 | * Parse the given string as a sequence of rules, separated by newline | |
147 | * characters ('\n'), and cause this object to implement those rules. Any | |
148 | * previous rules are discarded. Typically this method is called exactly | |
149 | * once after construction. | |
150 | * | |
151 | * Parse the given rules, in the given direction. After this call | |
152 | * returns, query the public data members for results. The caller | |
153 | * owns the 'data' and 'compoundFilter' data members after this | |
154 | * call returns. NOTE(review): no member named 'data' is declared in this class; presumably 'dataVector' is meant -- confirm. | |
155 | * @param rules rules, separated by ';' | |
156 | * @param direction either FORWARD or REVERSE. | |
157 | * @param pe Struct to receive information on position | |
158 | * of error if an error is encountered | |
159 | * @param ec Output param set to success/failure code. | |
160 | */ | |
161 | void parse(const UnicodeString& rules, | |
162 | UTransDirection direction, | |
163 | UParseError& pe, | |
164 | UErrorCode& ec); | |
165 | ||
166 | /** | |
167 | * Return the compound filter parsed by parse(). Caller owns result. | |
168 | * @return the compound filter parsed by parse(). | |
169 | */ | |
170 | UnicodeSet* orphanCompoundFilter(); | |
171 | ||
b75a7d8f A |
172 | private: |
173 | ||
174 | /** | |
175 | * Parse the given rules, in the given direction. | |
176 | * @param rules the rules to parse (input; passed by const reference). | |
177 | * @param direction either FORWARD or REVERSE. | |
178 | */ | |
179 | void parseRules(const UnicodeString& rules, | |
73c04bcf A |
180 | UTransDirection direction, |
181 | UErrorCode& status); | |
b75a7d8f A |
182 | |
183 | /** | |
184 | * MAIN PARSER. Parse the next rule in the given rule string, starting | |
185 | * at pos. Return the index after the last character parsed. Do not | |
186 | * parse characters at or after limit. | |
187 | * | |
188 | * Important: The character at pos must be a non-whitespace character | |
189 | * that is not the comment character. | |
190 | * | |
191 | * This method handles quoting, escaping, and whitespace removal. It | |
192 | * parses the end-of-rule character. It recognizes context and cursor | |
193 | * indicators. Once it does a lexical breakdown of the rule at pos, it | |
194 | * creates a rule object and adds it to our rule list. | |
195 | * @param rule the rule string to parse (input; passed by const reference). | |
196 | * @param pos the starting position. | |
197 | * @param limit index just past the last character of the rule. | |
198 | * @return the index after the last character parsed. | |
199 | */ | |
73c04bcf | 200 | int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); |
b75a7d8f A |
201 | |
202 | /** | |
203 | * Set the variable range to [start, end] (inclusive). | |
204 | * @param start the start value of the range. | |
205 | * @param end the end value of the range. | |
206 | */ | |
73c04bcf | 207 | void setVariableRange(int32_t start, int32_t end, UErrorCode& status); |
b75a7d8f A |
208 | |
209 | /** | |
210 | * Assert that the given character is NOT within the variable range. | |
211 | * If it is, return FALSE. This is necessary to ensure that the | |
212 | * variable range does not overlap characters used in a rule. | |
213 | * @param ch the given character. | |
214 | * @return True, if the given character is NOT within the variable range. | |
215 | */ | |
216 | UBool checkVariableRange(UChar32 ch) const; | |
217 | ||
218 | /** | |
219 | * Set the maximum backup to 'backup', in response to a pragma | |
220 | * statement. | |
221 | * @param backup the new value to be set. | |
222 | */ | |
223 | void pragmaMaximumBackup(int32_t backup); | |
224 | ||
225 | /** | |
226 | * Begin normalizing all rules using the given mode, in response | |
227 | * to a pragma statement. | |
228 | * @param mode the given mode. | |
229 | */ | |
230 | void pragmaNormalizeRules(UNormalizationMode mode); | |
231 | ||
232 | /** | |
233 | * Return true if the given rule looks like a pragma. | |
234 | * @param pos offset to the first non-whitespace character | |
235 | * of the rule. | |
236 | * @param limit index just past the last character of the rule. | |
237 | * @return true if the given rule looks like a pragma. | |
238 | */ | |
239 | static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit); | |
240 | ||
241 | /** | |
242 | * Parse a pragma. This method assumes resemblesPragma() has | |
243 | * already returned true. | |
244 | * @param pos offset to the first non-whitespace character | |
245 | * of the rule. | |
246 | * @param limit index just past the last character of the rule. | |
247 | * @return the position index after the final ';' of the pragma, | |
248 | * or -1 on failure. | |
249 | */ | |
73c04bcf | 250 | int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); |
b75a7d8f A |
251 | |
252 | /** | |
253 | * Called by main parser upon syntax error. Search the rule string | |
254 | * for the probable end of the rule. Of course, if the error is that | |
255 | * the end of rule marker is missing, then the rule end will not be found. | |
256 | * In any case the rule start will be correctly reported. | |
257 | * @param parseErrorCode error code. | |
258 | * @param msg error description (the parameter is unnamed in this declaration). | |
259 | * @param start position of first character of current rule. | |
260 | * @return start position of first character of current rule. | |
261 | */ | |
73c04bcf A |
262 | int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start, |
263 | UErrorCode& status); | |
b75a7d8f A |
264 | |
265 | /** | |
266 | * Parse a UnicodeSet out, store it, and return the stand-in character | |
267 | * used to represent it. | |
268 | * | |
269 | * @param rule the rule for UnicodeSet. | |
270 | * @param pos the position in pattern at which to start parsing. | |
271 | * @return the stand-in character used to represent it. | |
272 | */ | |
273 | UChar parseSet(const UnicodeString& rule, | |
73c04bcf A |
274 | ParsePosition& pos, |
275 | UErrorCode& status); | |
b75a7d8f A |
276 | |
277 | /** | |
278 | * Generate and return a stand-in for a new UnicodeFunctor. Store | |
279 | * the matcher (adopt it). | |
280 | * @param adopted the UnicodeFunctor to be adopted. | |
281 | * @return a stand-in for a new UnicodeFunctor. | |
282 | */ | |
73c04bcf | 283 | UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status); |
b75a7d8f A |
284 | |
285 | /** | |
286 | * Return the standin for segment seg (1-based). | |
287 | * @param seg the given segment. | |
288 | * @return the standIn character for the given segment. | |
289 | */ | |
73c04bcf | 290 | UChar getSegmentStandin(int32_t seg, UErrorCode& status); |
b75a7d8f A |
291 | |
292 | /** | |
293 | * Set the object for segment seg (1-based). | |
294 | * @param seg the given segment. | |
295 | * @param adopted the StringMatcher to be adopted. | |
296 | */ | |
73c04bcf | 297 | void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status); |
b75a7d8f A |
298 | |
299 | /** | |
300 | * Return the stand-in for the dot set. It is allocated the first | |
301 | * time and reused thereafter. | |
302 | * @return the stand-in for the dot set. | |
303 | */ | |
73c04bcf | 304 | UChar getDotStandIn(UErrorCode& status); |
b75a7d8f A |
305 | |
306 | /** | |
307 | * Append the value of the given variable name to the given | |
308 | * UnicodeString. | |
309 | * @param name the variable name to be appended. | |
310 | * @param buf the given UnicodeString to append to. | |
311 | */ | |
312 | void appendVariableDef(const UnicodeString& name, | |
73c04bcf A |
313 | UnicodeString& buf, |
314 | UErrorCode& status); | |
b75a7d8f A |
315 | |
316 | /** | |
317 | * Glue method to get around access restrictions in C++. | |
318 | */ | |
46f4442e A |
319 | /*static Transliterator* createBasicInstance(const UnicodeString& id, |
320 | const UnicodeString* canonID);*/ | |
b75a7d8f A |
321 | |
322 | friend class RuleHalf; | |
323 | ||
324 | // Disallowed methods; no impl. | |
325 | /** | |
326 | * Copy constructor | |
327 | */ | |
328 | TransliteratorParser(const TransliteratorParser&); | |
329 | ||
330 | /** | |
331 | * Assignment operator | |
332 | */ | |
333 | TransliteratorParser& operator=(const TransliteratorParser&); | |
334 | }; | |
335 | ||
336 | U_NAMESPACE_END | |
337 | ||
4388f060 | 338 | #endif /* #ifdef __cplusplus */ |
73c04bcf A |
339 | |
340 | /** | |
341 | * Strip/convert the following from the transliterator rules: | |
342 | * comments | |
343 | * newlines | |
344 | * white space at the beginning and end of a line | |
345 | * unescape \u notation | |
346 | * | |
347 | * The target buffer must be the same size as the source (presumably sourceLen UChars -- confirm). | |
348 | * @internal | |
349 | */ | |
350 | U_CAPI int32_t | |
351 | utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status); | |
352 | ||
b75a7d8f A |
353 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
354 | ||
355 | #endif |