]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/tridpars.h
ICU-6.2.15.tar.gz
[apple/icu.git] / icuSources / i18n / tridpars.h
1 /*
2 **************************************************************************
3 * Copyright (c) 2002-2004, International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 **************************************************************************
6 * Date Name Description *
7 * 01/28/2002 aliu Creation. *
8 **************************************************************************
9 */
10 #ifndef TRIDPARS_H
11 #define TRIDPARS_H
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_TRANSLITERATION
16
17 #include "unicode/uobject.h"
18 #include "unicode/unistr.h"
19
20 U_NAMESPACE_BEGIN
21
22 class Transliterator; // forward declaration: only used via pointer in this header
23 class UnicodeSet; // forward declaration: only used via pointer in this header
24 class UVector; // forward declaration: only used via reference in this header
25
26 /**
27 * Parsing component for transliterator IDs. This class contains only
28 * static members; it cannot be instantiated. Methods in this class
29 * parse various ID formats, including the following:
30 *
31 * A basic ID, which contains source, target, and variant, but no
32 * filter and no explicit inverse. Examples include
33 * "Latin-Greek/UNGEGN" and "Null".
34 *
35 * A single ID, which is a basic ID plus optional filter and optional
36 * explicit inverse. Examples include "[a-zA-Z] Latin-Greek" and
37 * "Lower (Upper)".
38 *
39 * A compound ID, which is a sequence of one or more single IDs,
40 * separated by semicolons, with optional forward and reverse global
41 * filters. The global filters are UnicodeSet patterns prepended or
42 * appended to the IDs, separated by semicolons. An appended filter
43 * must be enclosed in parentheses and applies in the reverse
44 * direction.
45 *
46 * @author Alan Liu
47 */
48 class TransliteratorIDParser /* not : public UObject because all methods are static */ {
49
50 public:
51
52 /**
53 * A structure containing the parsed data of a filtered ID, that
54 * is, a basic ID optionally with a filter.
55 *
56 * 'source' and 'target' will always be non-null. The 'variant'
57 * will be non-null only if a non-empty variant was parsed.
58 *
59 * 'sawSource' is true if there was an explicit source in the
60 * parsed id. If there was no explicit source, then an implied
61 * source of ANY is returned and 'sawSource' is set to false.
62 *
63 * 'filter' is the parsed filter pattern, or null if there was no
64 * filter.
65 */
66 class Specs : public UMemory {
67 public:
68 UnicodeString source; // always present (held by value, so never a null pointer)
69 UnicodeString target; // always present (held by value, so never a null pointer)
70 UnicodeString variant; // may be absent; NOTE(review): held by value, so "null" presumably means empty or bogus -- confirm
71 UnicodeString filter; // may be absent; NOTE(review): held by value, so "null" presumably means empty or bogus -- confirm
72 UBool sawSource; // true if the parsed id had an explicit source
73 Specs(const UnicodeString& s, const UnicodeString& t,
74 const UnicodeString& v, UBool sawS,
75 const UnicodeString& f); // NOTE(review): presumably initializes source, target, variant, sawSource, filter -- confirm in the implementation
76
77 private:
78
79 Specs(const Specs &other); // forbid copying of this class
80 Specs &operator=(const Specs &other); // forbid copying of this class
81 };
82
83 /**
84 * A structure containing the canonicalized data of a filtered ID,
85 * that is, a basic ID optionally with a filter.
86 *
87 * 'canonID' is always non-null. It may be the empty string "".
88 * It is the id that should be assigned to the created
89 * transliterator. It _cannot_ be instantiated directly.
90 *
91 * 'basicID' is always non-null and non-empty. It is always of
92 * the form S-T or S-T/V. It is designed to be fed to low-level
93 * instantiation code that only understands these two formats.
94 *
95 * 'filter' may be null, if there is none, or non-null and
96 * non-empty.
97 */
98 class SingleID : public UMemory {
99 public:
100 UnicodeString canonID; // id to assign to the created transliterator; may be ""
101 UnicodeString basicID; // always of the form S-T or S-T/V
102 UnicodeString filter; // may be absent; NOTE(review): held by value, so "null" presumably means empty or bogus -- confirm
103 SingleID(const UnicodeString& c, const UnicodeString& b,
104 const UnicodeString& f);
105 SingleID(const UnicodeString& c, const UnicodeString& b);
106 Transliterator* createInstance(); // NOTE(review): presumably returns a newly created Transliterator for this ID; ownership not visible here -- confirm
107
108 private:
109
110 SingleID(const SingleID &other); // forbid copying of this class
111 SingleID &operator=(const SingleID &other); // forbid copying of this class
112 };
113
114 /**
115 * Parse a filter ID, that is, an ID of the general form
116 * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
117 * @param id the id to be parsed
118 * @param pos INPUT-OUTPUT parameter. On input, the position of
119 * the first character to parse. On output, the position after
120 * the last character parsed.
121 * @return a SingleID object or null if the parse fails
122 */
123 static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos);
124
125 /**
126 * Parse a single ID, that is, an ID of the general form
127 * "[f1] s1-t1/v1 ([f2] s2-t2/v2)", with the parenthesized element
128 * optional, the filters optional, and the variants optional.
129 * @param id the id to be parsed
130 * @param pos INPUT-OUTPUT parameter. On input, the position of
131 * the first character to parse. On output, the position after
132 * the last character parsed.
133 * @param dir the direction; if REVERSE, the SingleID is constructed for the reverse direction.
134 * @param status ICU error code, passed by reference. NOTE(review): failure conditions are not visible in this header.
135 * @return a SingleID object or null
136 */
137 static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos,
138 int32_t dir, UErrorCode& status);
139
140 /**
141 * Parse a global filter of the form "[f]" or "([f])", depending
142 * on 'withParens'.
143 * @param id the pattern to parse
144 * @param pos INPUT-OUTPUT parameter. On input, the position of
145 * the first character to parse. On output, the position after
146 * the last character parsed.
147 * @param dir the direction.
148 * @param withParens INPUT-OUTPUT parameter. On entry, if
149 * withParens is 0, then parens are disallowed. If it is 1,
150 * then parens are required. If it is -1, then parens are
151 * optional, and on return withParens will be set to 0 or 1.
152 * @param canonID OUTPUT parameter. The pattern for the filter
153 * added to the canonID, either at the end, if dir is FORWARD, or
154 * at the start, if dir is REVERSE. The pattern will be enclosed
155 * in parentheses if appropriate, and will be suffixed with an
156 * ID_DELIM character. May be null.
157 * @return a UnicodeSet object or null. A non-null result
158 * indicates a successful parse, regardless of whether the filter
159 * applies to the given direction. The caller should discard it
160 * if withParens != (dir == REVERSE).
161 */
162 static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos,
163 int32_t dir,
164 int32_t& withParens,
165 UnicodeString* canonID);
166
167 /**
168 * Parse a compound ID, consisting of an optional forward global
169 * filter, a separator, one or more single IDs delimited by
170 * separators, and an optional reverse global filter. The
171 * separator is a semicolon. The global filters are UnicodeSet
172 * patterns. The reverse global filter must be enclosed in
173 * parentheses.
174 * @param id the pattern to parse
175 * @param dir the direction.
176 * @param canonID OUTPUT parameter that receives the canonical ID,
177 * consisting of canonical IDs for all elements, as returned by
178 * parseSingleID(), separated by semicolons. Previous contents
179 * are discarded.
180 * @param list OUTPUT parameter that receives a list of SingleID
181 * objects representing the parsed IDs. Previous contents are
182 * discarded.
183 * @param globalFilter OUTPUT parameter that receives a pointer to
184 * a newly created global filter for this ID in this direction, or
185 * null if there is none.
186 * @return true if the parse succeeds, that is, if the entire
187 * id is consumed without syntax error.
188 */
189 static UBool parseCompoundID(const UnicodeString& id, int32_t dir,
190 UnicodeString& canonID,
191 UVector& list,
192 UnicodeSet*& globalFilter);
193
194 /**
195 * Convert the elements of the 'list' vector, which are SingleID
196 * objects, into actual Transliterator objects. In the course of
197 * this, some (or all) entries may be removed. If all entries
198 * are removed, the Null transliterator will be added.
199 *
200 * Delete entries with empty basicIDs; these are generated by
201 * elements like "(A)" in the forward direction, or "A()" in
202 * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert
203 * SingleID entries to actual transliterators.
204 *
205 * Also, optionally, insert the given transliterator at the given
206 * position. This effectively happens before anything else.
207 *
208 * @param list vector of SingleID objects. On exit, vector
209 * of one or more Transliterators.
210 * @param insert Transliterator to insert, or null if none.
211 * @param insertIndex index from 0..list.size()-1, at which
212 * to place 'insert', or -1 if none.
213 * @param ec Output param to receive a success or an error code.
214 * @return new value of insertIndex. The index will shift if
215 * there are empty items, like "(Lower)", with indices less than
216 * insertIndex.
217 */
218 static int32_t instantiateList(UVector& list,
219 Transliterator* insert,
220 int32_t insertIndex,
221 UErrorCode& ec);
222
223 /**
224 * Parse an ID into pieces. Take IDs of the form T, T/V, S-T,
225 * S-T/V, or S/V-T. If the source is missing, return a source of
226 * ANY.
227 * @param id the id string, in any of several forms
228 * @param source OUTPUT parameter that receives the source.
229 * @param target OUTPUT parameter that receives the target.
230 * @param variant OUTPUT parameter that receives the variant.
231 * @param isSourcePresent OUTPUT parameter. Set to TRUE if the source
232 * is present. If the source is not present, ANY will be given as
233 * the source, and isSourcePresent will be FALSE.
234 * Results are returned through the output parameters; the function
235 * itself returns void. NOTE(review): this comment originally described
236 * an array return with a null/non-null isSourcePresent; the TRUE/FALSE
237 * reading above is assumed from the UBool type -- confirm. The target
238 * may be empty if the id is not well-formed. The variant may be empty.
239 */
240 static void IDtoSTV(const UnicodeString& id,
241 UnicodeString& source,
242 UnicodeString& target,
243 UnicodeString& variant,
244 UBool& isSourcePresent);
245
246 /**
247 * Given source, target, and variant strings, concatenate them into a
248 * full ID. If the source is empty, then "Any" will be used for the
249 * source, so the ID will always be of the form s-t/v or s-t.
250 */
251 static void STVtoID(const UnicodeString& source,
252 const UnicodeString& target,
253 const UnicodeString& variant,
254 UnicodeString& id); // id: output, receives the concatenated full ID
255
256 /**
257 * Register two targets as being inverses of one another. For
258 * example, calling registerSpecialInverse("NFC", "NFD", true, status) causes
259 * Transliterator to form the following inverse relationships:
260 *
261 * <pre>NFC => NFD
262 * Any-NFC => Any-NFD
263 * NFD => NFC
264 * Any-NFD => Any-NFC</pre>
265 *
266 * (Without the special inverse registration, the inverse of NFC
267 * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but
268 * that the presence or absence of "Any-" is preserved.
269 *
270 * <p>The relationship is symmetrical; registering (a, b) is
271 * equivalent to registering (b, a).
272 *
273 * <p>The relevant IDs must still be registered separately as
274 * factories or classes.
275 *
276 * <p>Only the targets are specified. Special inverses always
277 * have the form Any-Target1 <=> Any-Target2. The target should
278 * have canonical casing (the casing desired to be produced when
279 * an inverse is formed) and should contain no whitespace or other
280 * extraneous characters.
281 *
282 * @param target the target against which to register the inverse
283 * @param inverseTarget the inverse of target, that is Any-target.getInverse() => Any-inverseTarget
284 * @param bidirectional if true, register the reverse relation
285 * as well, that is, Any-inverseTarget.getInverse() => Any-target
286 * @param status ICU error code, passed by reference. NOTE(review): failure conditions are not visible in this header.
287 */
288 static void registerSpecialInverse(const UnicodeString& target,
289 const UnicodeString& inverseTarget,
290 UBool bidirectional,
291 UErrorCode &status);
292
293 /**
294 * Free static memory.
295 */
296 static void cleanup();
297
298 private:
299 //----------------------------------------------------------------
300 // Private implementation
301 //----------------------------------------------------------------
302
303 // forbid instantiation
304 TransliteratorIDParser();
305
306 /**
307 * Parse an ID into component pieces. Take IDs of the form T,
308 * T/V, S-T, S-T/V, or S/V-T. If the source is missing, return a
309 * source of ANY.
310 * @param id the id string, in any of several forms
311 * @param pos INPUT-OUTPUT parameter. On input, pos is the
312 * offset of the first character to parse in id. On output,
313 * pos is the offset after the last parsed character. If the
314 * parse failed, pos will be unchanged.
315 * @param allowFilter if true, a UnicodeSet pattern is allowed
316 * at any location between specs or delimiters, and is returned
317 * in the 'filter' field of the Specs object.
318 * @return a Specs object, or null if the parse failed. If
319 * neither source nor target was seen in the parsed id, then the
320 * parse fails. If allowFilter is true, then the parsed filter
321 * pattern is returned in the Specs object, otherwise the returned
322 * filter reference is null. If the parse fails for any reason
323 * null is returned.
324 */
325 static Specs* parseFilterID(const UnicodeString& id, int32_t& pos,
326 UBool allowFilter);
327
328 /**
329 * Given a Specs object, convert it to a SingleID object. The
330 * Specs object is a more unprocessed parse result. The SingleID
331 * object contains information about canonical and basic IDs.
332 * @param specs the given Specs object.
333 * @param dir either FORWARD or REVERSE.
334 * @return a SingleID; never returns null. Returned object always
335 * has 'filter' field of null.
336 */
337 static SingleID* specsToID(const Specs* specs, int32_t dir);
338
339 /**
340 * Given a Specs object, return a SingleID representing the
341 * special inverse of that ID. If there is no special inverse
342 * then return null.
343 * @param specs the given Specs.
344 * @param status ICU error code, passed by reference. NOTE(review): failure conditions are not visible in this header.
345 * @return a SingleID or null. Returned object always has 'filter' field of null.
346 */
347 static SingleID* specsToSpecialInverse(const Specs& specs, UErrorCode &status);
348
349 /**
350 * Glue method to get around access problems in C++.
351 * @param id the id string for the transliterator, in any of several forms
352 * @param canonID the given canonical ID
353 */
354 static Transliterator* createBasicInstance(const UnicodeString& id,
355 const UnicodeString* canonID);
356
357 /**
358 * Initialize static memory.
359 */
360 static void init(UErrorCode &status);
361
362 friend class SingleID;
363 };
364
365 U_NAMESPACE_END
366
367 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
368
369 #endif