]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/tridpars.h
ICU-511.27.tar.gz
[apple/icu.git] / icuSources / i18n / tridpars.h
CommitLineData
b75a7d8f 1/*
374ca955 2 **************************************************************************
729e4ab9 3 * Copyright (c) 2002-2010, International Business Machines Corporation *
374ca955
A
4 * and others. All Rights Reserved. *
5 **************************************************************************
6 * Date Name Description *
7 * 01/28/2002 aliu Creation. *
8 **************************************************************************
9 */
b75a7d8f
A
10#ifndef TRIDPARS_H
11#define TRIDPARS_H
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_TRANSLITERATION
16
17#include "unicode/uobject.h"
18#include "unicode/unistr.h"
19
20U_NAMESPACE_BEGIN
21
22class Transliterator;
23class UnicodeSet;
24class UVector;
25
26/**
27 * Parsing component for transliterator IDs. This class contains only
28 * static members; it cannot be instantiated. Methods in this class
29 * parse various ID formats, including the following:
30 *
31 * A basic ID, which contains source, target, and variant, but no
32 * filter and no explicit inverse. Examples include
33 * "Latin-Greek/UNGEGN" and "Null".
34 *
35 * A single ID, which is a basic ID plus optional filter and optional
36 * explicit inverse. Examples include "[a-zA-Z] Latin-Greek" and
37 * "Lower (Upper)".
38 *
39 * A compound ID, which is a sequence of one or more single IDs,
40 * separated by semicolons, with optional forward and reverse global
41 * filters. The global filters are UnicodeSet patterns prepended or
42 * appended to the IDs, separated by semicolons. An appended filter
43 * must be enclosed in parentheses and applies in the reverse
44 * direction.
45 *
46 * @author Alan Liu
47 */
48class TransliteratorIDParser /* not : public UObject because all methods are static */ {
49
50 public:
51
52 /**
53 * A structure containing the parsed data of a filtered ID, that
54 * is, a basic ID optionally with a filter.
55 *
56 * 'source' and 'target' will always be non-null. The 'variant'
57 * will be non-null only if a non-empty variant was parsed.
58 *
59 * 'sawSource' is true if there was an explicit source in the
60 * parsed id. If there was no explicit source, then an implied
61 * source of ANY is returned and 'sawSource' is set to false.
62 *
63 * 'filter' is the parsed filter pattern, or null if there was no
64 * filter.
65 */
66 class Specs : public UMemory {
67 public:
68 UnicodeString source; // explicit source, or the implied ANY source when none was given
69 UnicodeString target; // documented as always present
70 UnicodeString variant; // value member: "null" in the docs presumably means empty -- TODO confirm
71 UnicodeString filter; // parsed filter pattern; "null" presumably means empty -- TODO confirm
72 UBool sawSource; // true if the parsed id had an explicit source
73 Specs(const UnicodeString& s, const UnicodeString& t,
74 const UnicodeString& v, UBool sawS,
75 const UnicodeString& f); // presumably initializes source, target, variant, sawSource, filter in that order; defined elsewhere -- confirm
76
77 private:
78
79 Specs(const Specs &other); // private copy constructor: forbid copying of this class
80 Specs &operator=(const Specs &other); // private assignment operator: forbid copying of this class
81 };
82
83 /**
84 * A structure containing the canonicalized data of a filtered ID,
85 * that is, a basic ID optionally with a filter.
86 *
87 * 'canonID' is always non-null. It may be the empty string "".
88 * It is the id that should be assigned to the created
89 * transliterator. It _cannot_ be instantiated directly.
90 *
91 * 'basicID' is always non-null and non-empty. It is always of
92 * the form S-T or S-T/V. It is designed to be fed to low-level
93 * instantiation code that only understands these two formats.
94 *
95 * 'filter' may be null, if there is none, or non-null and
96 * non-empty.
97 */
98 class SingleID : public UMemory {
99 public:
100 UnicodeString canonID; // id to assign to the created transliterator; may be the empty string
101 UnicodeString basicID; // S-T or S-T/V form, intended for low-level instantiation code
102 UnicodeString filter; // filter pattern, if there is one
103 SingleID(const UnicodeString& c, const UnicodeString& b,
104 const UnicodeString& f); // c, b, f presumably map to canonID, basicID, filter; defined elsewhere -- confirm
105 SingleID(const UnicodeString& c, const UnicodeString& b); // overload without a filter argument
106 Transliterator* createInstance(); // returns a Transliterator pointer; ownership is not visible here -- presumably caller-owned, confirm
107
108 private:
109
110 SingleID(const SingleID &other); // private copy constructor: forbid copying of this class
111 SingleID &operator=(const SingleID &other); // private assignment operator: forbid copying of this class
112 };
113
114 /**
115 * Parse a filter ID, that is, an ID of the general form
116 * "[f1] s1-t1/v1", with the filter optional, and the variant optional.
117 * @param id the id to be parsed
118 * @param pos INPUT-OUTPUT parameter. On input, the position of
119 * the first character to parse. On output, the position after
120 * the last character parsed.
121 * @return a SingleID object or null if the parse fails
122 */
123 static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos);
124
125 /**
126 * Parse a single ID, that is, an ID of the general form
127 * "[f1] s1-t1/v1 ([f2] s2-t2/v2)", with the parenthesized element
128 * optional, the filters optional, and the variants optional.
129 * @param id the id to be parsed
130 * @param pos INPUT-OUTPUT parameter. On input, the position of
131 * the first character to parse. On output, the position after
132 * the last character parsed.
133 * @param dir the direction. If REVERSE, the SingleID is constructed for the reverse direction.
134 * @param status UErrorCode reference, presumably used to report errors -- confirm against the implementation.
135 * @return a SingleID object or null
136 */
137 static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos,
374ca955 138 int32_t dir, UErrorCode& status);
b75a7d8f
A
139
140 /**
141 * Parse a global filter of the form "[f]" or "([f])", depending
142 * on 'withParens'.
143 * @param id the pattern to parse
144 * @param pos INPUT-OUTPUT parameter. On input, the position of
145 * the first character to parse. On output, the position after
146 * the last character parsed.
147 * @param dir the direction.
148 * @param withParens INPUT-OUTPUT parameter. On entry, if
149 * withParens is 0, then parens are disallowed. If it is 1,
729e4ab9 150 * then parens are required. If it is -1, then parens are
b75a7d8f
A
151 * optional, and withParens will be set to 0 or 1 on return.
152 * @param canonID OUTPUT parameter. The pattern for the filter
153 * added to the canonID, either at the end, if dir is FORWARD, or
154 * at the start, if dir is REVERSE. The pattern will be enclosed
155 * in parentheses if appropriate, and will be suffixed with an
156 * ID_DELIM character. May be null.
157 * @return a UnicodeSet object or null. A non-null result
158 * indicates a successful parse, regardless of whether the filter
159 * applies to the given direction. The caller should discard it
160 * if withParens != (dir == REVERSE).
161 */
162 static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos,
163 int32_t dir,
164 int32_t& withParens,
165 UnicodeString* canonID);
166
167 /**
168 * Parse a compound ID, consisting of an optional forward global
169 * filter, a separator, one or more single IDs delimited by
170 * separators, and an optional reverse global filter. The
171 * separator is a semicolon. The global filters are UnicodeSet
172 * patterns. The reverse global filter must be enclosed in
173 * parentheses.
174 * @param id the pattern to parse
175 * @param dir the direction.
176 * @param canonID OUTPUT parameter that receives the canonical ID,
177 * consisting of canonical IDs for all elements, as returned by
178 * parseSingleID(), separated by semicolons. Previous contents
179 * are discarded.
180 * @param list OUTPUT parameter that receives a list of SingleID
181 * objects representing the parsed IDs. Previous contents are
182 * discarded.
183 * @param globalFilter OUTPUT parameter that receives a pointer to
184 * a newly created global filter for this ID in this direction, or
185 * null if there is none.
186 * @return true if the parse succeeds, that is, if the entire
187 * id is consumed without syntax error.
188 */
189 static UBool parseCompoundID(const UnicodeString& id, int32_t dir,
190 UnicodeString& canonID,
191 UVector& list,
192 UnicodeSet*& globalFilter);
193
194 /**
195 * Convert the elements of the 'list' vector, which are SingleID
196 * objects, into actual Transliterator objects. In the course of
197 * this, some (or all) entries may be removed. If all entries
198 * are removed, the Null transliterator will be added.
199 *
200 * Delete entries with empty basicIDs; these are generated by
201 * elements like "(A)" in the forward direction, or "A()" in
202 * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert
203 * SingleID entries to actual transliterators.
204 *
b75a7d8f
A
205 * @param list vector of SingleID objects. On exit, vector
206 * of one or more Transliterators.
b75a7d8f
A
207 * @param ec Output param to receive a success or an error code.
208 * This function is declared void: it has no return value and no
209 * insertIndex parameter. Results are delivered through 'list',
210 * and success or failure through 'ec'.
211 */
73c04bcf
A
212 static void instantiateList(UVector& list,
213 UErrorCode& ec);
b75a7d8f
A
214
215 /**
216 * Parse an ID into pieces. Take IDs of the form T, T/V, S-T,
217 * S-T/V, or S/V-T. If the source is missing, return a source of
218 * ANY.
219 * @param id the id string, in any of several forms
220 * @param source OUTPUT parameter that receives the source.
221 * @param target OUTPUT parameter that receives the target.
222 * @param variant OUTPUT parameter that receives the variant.
223 * @param isSourcePresent OUTPUT parameter. TRUE if the source is
224 * present in the id. If the source is not present, ANY is given
225 * as the source (and isSourcePresent is presumably FALSE; verify
226 * against the implementation).
227 * Nothing is returned directly; this function is declared void and
228 * delivers its results through the four reference parameters above.
229 * The target may be empty if the id is not well-formed. The
230 * variant may be empty.
231 */
232 static void IDtoSTV(const UnicodeString& id,
233 UnicodeString& source,
234 UnicodeString& target,
235 UnicodeString& variant,
236 UBool& isSourcePresent);
237
238 /**
239 * Given source, target, and variant strings, concatenate them into a full ID
240 * of the form s-t/v or s-t. If the source is empty, then "Any" will be used for the source.
241 * @param id non-const reference through which the full ID is delivered (presumably overwritten -- confirm).
242 */
243 static void STVtoID(const UnicodeString& source,
244 const UnicodeString& target,
245 const UnicodeString& variant,
246 UnicodeString& id);
247
248 /**
249 * Register two targets as being inverses of one another. For
250 * example, calling registerSpecialInverse("NFC", "NFD", true, status) causes
251 * Transliterator to form the following inverse relationships:
252 *
253 * <pre>NFC => NFD
254 * Any-NFC => Any-NFD
255 * NFD => NFC
256 * Any-NFD => Any-NFC</pre>
257 *
258 * (Without the special inverse registration, the inverse of NFC
259 * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but
260 * that the presence or absence of "Any-" is preserved.
261 *
262 * <p>The relationship is symmetrical; registering (a, b) is
263 * equivalent to registering (b, a).
264 *
265 * <p>The relevant IDs must still be registered separately as
266 * factories or classes.
267 *
268 * <p>Only the targets are specified. Special inverses always
269 * have the form Any-Target1 <=> Any-Target2. The target should
270 * have canonical casing (the casing desired to be produced when
271 * an inverse is formed) and should contain no whitespace or other
272 * extraneous characters.
273 *
274 * @param target the target against which to register the inverse
275 * @param inverseTarget the inverse of target, that is Any-target.getInverse() => Any-inverseTarget
276 * @param bidirectional if true, register the reverse relation
277 * as well, that is, Any-inverseTarget.getInverse() => Any-target
278 * @param status UErrorCode reference, presumably used to report errors -- confirm against the implementation.
279 */
280 static void registerSpecialInverse(const UnicodeString& target,
281 const UnicodeString& inverseTarget,
374ca955
A
282 UBool bidirectional,
283 UErrorCode &status);
b75a7d8f
A
284
285 /**
286 * Free static memory.
287 */
288 static void cleanup();
289
290 private:
291 //----------------------------------------------------------------
292 // Private implementation
293 //----------------------------------------------------------------
294
295 // forbid instantiation
296 TransliteratorIDParser();
297
298 /**
299 * Parse an ID into component pieces. Take IDs of the form T,
300 * T/V, S-T, S-T/V, or S/V-T. If the source is missing, return a
301 * source of ANY.
302 * @param id the id string, in any of several forms
303 * @param pos INPUT-OUTPUT parameter. On input, pos is the
304 * offset of the first character to parse in id. On output,
305 * pos is the offset after the last parsed character. If the
306 * parse failed, pos will be unchanged.
307 * @param allowFilter if true, a UnicodeSet pattern is allowed
308 * at any location between specs or delimiters, and is returned
309 * in the 'filter' field of the returned Specs object.
310 * @return a Specs object, or null if the parse failed. If
311 * neither source nor target was seen in the parsed id, then the
312 * parse fails. If allowFilter is true, then the parsed filter
313 * pattern is returned in the Specs object, otherwise the returned
314 * filter reference is null. If the parse fails for any reason
315 * null is returned.
316 */
317 static Specs* parseFilterID(const UnicodeString& id, int32_t& pos,
318 UBool allowFilter);
319
320 /**
321 * Given a Specs object, convert it to a SingleID object. The
322 * Specs object is a more unprocessed parse result. The SingleID
323 * object contains information about canonical and basic IDs.
324 * @param specs the given Specs object.
325 * @param dir either FORWARD or REVERSE.
326 * @return a SingleID; never returns null. Returned object always
327 * has 'filter' field of null.
328 */
329 static SingleID* specsToID(const Specs* specs, int32_t dir);
330
331 /**
332 * Given a Specs object, return a SingleID representing the special
333 * inverse of that ID. If there is no special inverse then return null.
334 * @param specs the given Specs.
335 * @param status UErrorCode reference, presumably used to report errors -- confirm against the implementation.
336 * @return a SingleID or null. Returned object always has
337 * 'filter' field of null.
338 */
374ca955 339 static SingleID* specsToSpecialInverse(const Specs& specs, UErrorCode &status);
b75a7d8f
A
340
341 /**
342 * Glue method to get around access problems in C++.
343 * @param id the id string for the transliterator, in any of several forms
344 * @param canonID the given canonical ID
345 */
346 static Transliterator* createBasicInstance(const UnicodeString& id,
347 const UnicodeString* canonID);
348
349 /**
350 * Initialize static memory.
351 */
374ca955 352 static void init(UErrorCode &status);
b75a7d8f
A
353
354 friend class SingleID;
355};
356
357U_NAMESPACE_END
358
359#endif /* #if !UCONFIG_NO_TRANSLITERATION */
360
361#endif