]>
git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/tridpars.h
2 **************************************************************************
3 * Copyright (c) 2002-2004, International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 **************************************************************************
6 * Date Name Description *
7 * 01/28/2002 aliu Creation. *
8 **************************************************************************
13 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_TRANSLITERATION
17 #include "unicode/uobject.h"
18 #include "unicode/unistr.h"
27 * Parsing component for transliterator IDs. This class contains only
28 * static members; it cannot be instantiated. Methods in this class
29 * parse various ID formats, including the following:
31 * A basic ID, which contains source, target, and variant, but no
32 * filter and no explicit inverse. Examples include
33 * "Latin-Greek/UNGEGN" and "Null".
35 * A single ID, which is a basic ID plus optional filter and optional
36 * explicit inverse. Examples include "[a-zA-Z] Latin-Greek" and
39 * A compound ID, which is a sequence of one or more single IDs,
40 * separated by semicolons, with optional forward and reverse global
41 * filters. The global filters are UnicodeSet patterns prepended or
42 * appended to the IDs, separated by semicolons. An appended filter
43 * must be enclosed in parentheses and applies in the reverse
48 class TransliteratorIDParser
/* not : public UObject because all methods are static */ {
53 * A structure containing the parsed data of a filtered ID, that
54 * is, a basic ID optionally with a filter.
56 * 'source' and 'target' will always be non-null. The 'variant'
57 * will be non-null only if a non-empty variant was parsed.
59 * 'sawSource' is true if there was an explicit source in the
60 * parsed id. If there was no explicit source, then an implied
61 * source of ANY is returned and 'sawSource' is set to false.
63 * 'filter' is the parsed filter pattern, or null if there was no
66 class Specs
: public UMemory
{
68 UnicodeString source
; // not null
69 UnicodeString target
; // not null
70 UnicodeString variant
; // may be null
71 UnicodeString filter
; // may be null
73 Specs(const UnicodeString
& s
, const UnicodeString
& t
,
74 const UnicodeString
& v
, UBool sawS
,
75 const UnicodeString
& f
);
79 Specs(const Specs
&other
); // forbid copying of this class
80 Specs
&operator=(const Specs
&other
); // forbid copying of this class
84 * A structure containing the canonicalized data of a filtered ID,
85 * that is, a basic ID optionally with a filter.
87 * 'canonID' is always non-null. It may be the empty string "".
88 * It is the id that should be assigned to the created
89 * transliterator. It _cannot_ be instantiated directly.
91 * 'basicID' is always non-null and non-empty. It is always of
92 * the form S-T or S-T/V. It is designed to be fed to low-level
93 * instantiation code that only understands these two formats.
95 * 'filter' may be null, if there is none, or non-null and
98 class SingleID
: public UMemory
{
100 UnicodeString canonID
;
101 UnicodeString basicID
;
102 UnicodeString filter
;
103 SingleID(const UnicodeString
& c
, const UnicodeString
& b
,
104 const UnicodeString
& f
);
105 SingleID(const UnicodeString
& c
, const UnicodeString
& b
);
106 Transliterator
* createInstance();
110 SingleID(const SingleID
&other
); // forbid copying of this class
111 SingleID
&operator=(const SingleID
&other
); // forbid copying of this class
115 * Parse a filter ID, that is, an ID of the general form
116 * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
117 * @param id the id to be parsed
118 * @param pos INPUT-OUTPUT parameter. On input, the position of
119 * the first character to parse. On output, the position after
120 * the last character parsed.
121 * @return a SingleID object or null if the parse fails
123 static SingleID
* parseFilterID(const UnicodeString
& id
, int32_t& pos
);
126 * Parse a single ID, that is, an ID of the general form
127 * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
128 * optional, the filters optional, and the variants optional.
129 * @param id the id to be parsed
130 * @param pos INPUT-OUTPUT parameter. On input, the position of
131 * the first character to parse. On output, the position after
132 * the last character parsed.
133 * @param dir the direction. If the direction is REVERSE then the
134 * SingleID is constructed for the reverse direction.
135 * @return a SingleID object or null
137 static SingleID
* parseSingleID(const UnicodeString
& id
, int32_t& pos
,
138 int32_t dir
, UErrorCode
& status
);
141 * Parse a global filter of the form "[f]" or "([f])", depending
143 * @param id the pattern to parse
144 * @param pos INPUT-OUTPUT parameter. On input, the position of
145 * the first character to parse. On output, the position after
146 * the last character parsed.
147 * @param dir the direction.
148 * @param withParens INPUT-OUTPUT parameter. On entry, if
149 * withParens[0] is 0, then parens are disallowed. If it is 1,
150 * then parens are required. If it is -1, then parens are
151 * optional, and the return result will be set to 0 or 1.
152 * @param canonID OUTPUT parameter. The pattern for the filter
153 * added to the canonID, either at the end, if dir is FORWARD, or
154 * at the start, if dir is REVERSE. The pattern will be enclosed
155 * in parentheses if appropriate, and will be suffixed with an
156 * ID_DELIM character. May be null.
157 * @return a UnicodeSet object or null. A non-null result
158 * indicates a successful parse, regardless of whether the filter
159 * applies to the given direction. The caller should discard it
160 * if withParens != (dir == REVERSE).
162 static UnicodeSet
* parseGlobalFilter(const UnicodeString
& id
, int32_t& pos
,
165 UnicodeString
* canonID
);
168 * Parse a compound ID, consisting of an optional forward global
169 * filter, a separator, one or more single IDs delimited by
170 * separators, and an optional reverse global filter. The
171 * separator is a semicolon. The global filters are UnicodeSet
172 * patterns. The reverse global filter must be enclosed in
174 * @param id the pattern to parse
175 * @param dir the direction.
176 * @param canonID OUTPUT parameter that receives the canonical ID,
177 * consisting of canonical IDs for all elements, as returned by
178 * parseSingleID(), separated by semicolons. Previous contents
180 * @param list OUTPUT parameter that receives a list of SingleID
181 * objects representing the parsed IDs. Previous contents are
183 * @param globalFilter OUTPUT parameter that receives a pointer to
184 * a newly created global filter for this ID in this direction, or
185 * null if there is none.
186 * @return true if the parse succeeds, that is, if the entire
187 * id is consumed without syntax error.
189 static UBool
parseCompoundID(const UnicodeString
& id
, int32_t dir
,
190 UnicodeString
& canonID
,
192 UnicodeSet
*& globalFilter
);
195 * Convert the elements of the 'list' vector, which are SingleID
196 * objects, into actual Transliterator objects. In the course of
197 * this, some (or all) entries may be removed. If all entries
198 * are removed, the Null transliterator will be added.
200 * Delete entries with empty basicIDs; these are generated by
201 * elements like "(A)" in the forward direction, or "A()" in
202 * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert
203 * SingleID entries to actual transliterators.
205 * Also, optionally, insert the given transliterator at the given
206 * position. This effectively happens before anything else.
208 * @param list vector of SingleID objects. On exit, vector
209 * of one or more Transliterators.
210 * @param insert Transliterator to insert, or null if none.
211 * @param insertIndex index from 0..list.size()-1, at which
212 * to place 'insert', or -1 if none.
213 * @param ec Output param to receive a success or an error code.
214 * @return new value of insertIndex. The index will shift if
215 * there are empty items, like "(Lower)", with indices less than
218 static int32_t instantiateList(UVector
& list
,
219 Transliterator
* insert
,
224 * Parse an ID into pieces. Take IDs of the form T, T/V, S-T,
225 * S-T/V, or S/V-T. If the source is missing, return a source of
227 * @param id the id string, in any of several forms
228 * @param source the given source.
229 * @param target the given target.
230 * @param variant the given variant
231 * @param isSourcePresent If TRUE then the source is present.
232 * If the source is not present, ANY will be
233 * given as the source, and isSourcePresent will be FALSE
234 * The results are returned through the output parameters source,
235 * target, variant, and isSourcePresent. If the source is not present,
236 * ANY will be given as the source, and isSourcePresent will be FALSE.
237 * Otherwise isSourcePresent will be TRUE. The target may be empty if the
238 * id is not well-formed. The variant may be empty.
240 static void IDtoSTV(const UnicodeString
& id
,
241 UnicodeString
& source
,
242 UnicodeString
& target
,
243 UnicodeString
& variant
,
244 UBool
& isSourcePresent
);
247 * Given source, target, and variant strings, concatenate them into a
248 * full ID. If the source is empty, then "Any" will be used for the
249 * source, so the ID will always be of the form s-t/v or s-t.
251 static void STVtoID(const UnicodeString
& source
,
252 const UnicodeString
& target
,
253 const UnicodeString
& variant
,
257 * Register two targets as being inverses of one another. For
258 * example, calling registerSpecialInverse("NFC", "NFD", true) causes
259 * Transliterator to form the following inverse relationships:
264 * Any-NFD => Any-NFC</pre>
266 * (Without the special inverse registration, the inverse of NFC
267 * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but
268 * that the presence or absence of "Any-" is preserved.
270 * <p>The relationship is symmetrical; registering (a, b) is
271 * equivalent to registering (b, a).
273 * <p>The relevant IDs must still be registered separately as
274 * factories or classes.
276 * <p>Only the targets are specified. Special inverses always
277 * have the form Any-Target1 <=> Any-Target2. The target should
278 * have canonical casing (the casing desired to be produced when
279 * an inverse is formed) and should contain no whitespace or other
280 * extraneous characters.
282 * @param target the target against which to register the inverse
283 * @param inverseTarget the inverse of target, that is
284 * Any-target.getInverse() => Any-inverseTarget
285 * @param bidirectional if true, register the reverse relation
286 * as well, that is, Any-inverseTarget.getInverse() => Any-target
288 static void registerSpecialInverse(const UnicodeString
& target
,
289 const UnicodeString
& inverseTarget
,
294 * Free static memory.
296 static void cleanup();
299 //----------------------------------------------------------------
300 // Private implementation
301 //----------------------------------------------------------------
303 // forbid instantiation
304 TransliteratorIDParser();
307 * Parse an ID into component pieces. Take IDs of the form T,
308 * T/V, S-T, S-T/V, or S/V-T. If the source is missing, return a
310 * @param id the id string, in any of several forms
311 * @param pos INPUT-OUTPUT parameter. On input, pos is the
312 * offset of the first character to parse in id. On output,
313 * pos is the offset after the last parsed character. If the
314 * parse failed, pos will be unchanged.
315 * @param allowFilter if true, a UnicodeSet pattern is allowed
316 * at any location between specs or delimiters, and is returned
317 * as the fifth string in the array.
318 * @return a Specs object, or null if the parse failed. If
319 * neither source nor target was seen in the parsed id, then the
320 * parse fails. If allowFilter is true, then the parsed filter
321 * pattern is returned in the Specs object, otherwise the returned
322 * filter reference is null. If the parse fails for any reason
325 static Specs
* parseFilterID(const UnicodeString
& id
, int32_t& pos
,
329 * Given a Specs object, convert it to a SingleID object. The
330 * Specs object is a more unprocessed parse result. The SingleID
331 * object contains information about canonical and basic IDs.
332 * @param specs the given Specs object.
333 * @param dir either FORWARD or REVERSE.
334 * @return a SingleID; never returns null. Returned object always
335 * has 'filter' field of null.
337 static SingleID
* specsToID(const Specs
* specs
, int32_t dir
);
340 * Given a Specs object, return a SingleID representing the
341 * special inverse of that ID. If there is no special inverse
343 * @param specs the given Specs.
344 * @return a SingleID or null. Returned object always has
345 * 'filter' field of null.
347 static SingleID
* specsToSpecialInverse(const Specs
& specs
, UErrorCode
&status
);
350 * Glue method to get around access problems in C++.
351 * @param id the id string for the transliterator, in any of several forms
352 * @param canonID the given canonical ID
354 static Transliterator
* createBasicInstance(const UnicodeString
& id
,
355 const UnicodeString
* canonID
);
358 * Initialize static memory.
360 static void init(UErrorCode
&status
);
362 friend class SingleID
;
367 #endif /* #if !UCONFIG_NO_TRANSLITERATION */