]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | /* |
4 | ****************************************************************************** | |
2ca993e8 | 5 | * Copyright (C) 1996-2016, International Business Machines Corporation and |
b75a7d8f A |
6 | * others. All Rights Reserved. |
7 | ****************************************************************************** | |
8 | */ | |
9 | ||
46f4442e | 10 | /** |
b331163b | 11 | * \file |
57a6839d | 12 | * \brief C++ API: The RuleBasedCollator class implements the Collator abstract base class. |
46f4442e A |
13 | */ |
14 | ||
b75a7d8f A |
15 | /** |
16 | * File tblcoll.h | |
17 | * | |
18 | * Created by: Helena Shih | |
19 | * | |
20 | * Modification History: | |
21 | * | |
22 | * Date Name Description | |
23 | * 2/5/97 aliu Added streamIn and streamOut methods. Added | |
24 | * constructor which reads RuleBasedCollator object from | |
25 | * a binary file. Added writeToFile method which streams | |
26 | * RuleBasedCollator out to a binary file. The streamIn | |
27 | * and streamOut methods use istream and ostream objects | |
28 | * in binary mode. | |
29 | * 2/12/97 aliu Modified to use TableCollationData sub-object to | |
30 | * hold invariant data. | |
31 | * 2/13/97 aliu Moved several methods into this class from Collation. | |
32 | * Added a private RuleBasedCollator(Locale&) constructor, | |
33 | * to be used by Collator::createDefault(). General | |
34 | * clean up. | |
35 | * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy | |
36 | * constructor and getDynamicClassID. | |
37 | * 3/5/97 aliu Modified constructFromFile() to add parameter | |
38 | * specifying whether or not binary loading is to be | |
39 | * attempted. This is required for dynamic rule loading. | |
40 | * 05/07/97 helena Added memory allocation error detection. | |
41 | * 6/17/97 helena Added IDENTICAL strength for compare, changed getRules to | |
42 | * use MergeCollation::getPattern. | |
43 | * 6/20/97 helena Java class name change. | |
44 | * 8/18/97 helena Added internal API documentation. | |
45 | * 09/03/97 helena Added createCollationKeyValues(). | |
46 | * 02/10/98 damiba Added compare with "length" parameter | |
47 | * 08/05/98 erm Synched with 1.2 version of RuleBasedCollator.java | |
48 | * 04/23/99 stephen Removed EDecompositionMode, merged with | |
49 | * Normalizer::EMode | |
50 | * 06/14/99 stephen Removed kResourceBundleSuffix | |
51 | * 11/02/99 helena Collator performance enhancements. Eliminates the | |
52 | * UnicodeString construction and special case for NO_OP. | |
53 | * 11/23/99 srl More performance enhancements. Updates to NormalizerIterator | |
54 | * internal state management. | |
55 | * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator | |
56 | * to implementation file. | |
57 | * 01/29/01 synwee Modified into a C++ wrapper which calls C API | |
58 | * (ucol.h) | |
57a6839d | 59 | * 2012-2014 markus Rewritten in C++ again. |
b75a7d8f A |
60 | */ |
61 | ||
62 | #ifndef TBLCOLL_H | |
63 | #define TBLCOLL_H | |
64 | ||
65 | #include "unicode/utypes.h" | |
66 | ||
67 | #if !UCONFIG_NO_COLLATION | |
68 | ||
69 | #include "unicode/coll.h" | |
57a6839d A |
70 | #include "unicode/locid.h" |
71 | #include "unicode/uiter.h" | |
b75a7d8f | 72 | #include "unicode/ucol.h" |
b75a7d8f | 73 | |
f3c0d7a5 | 74 | #if U_SHOW_CPLUSPLUS_API |
b75a7d8f A |
75 | U_NAMESPACE_BEGIN |
76 | ||
b331163b | 77 | struct CollationCacheEntry; |
57a6839d A |
78 | struct CollationData; |
79 | struct CollationSettings; | |
80 | struct CollationTailoring; | |
b75a7d8f A |
81 | /** |
82 | * @stable ICU 2.0 | |
83 | */ | |
84 | class StringSearch; | |
85 | /** | |
86 | * @stable ICU 2.0 | |
87 | */ | |
88 | class CollationElementIterator; | |
57a6839d A |
89 | class CollationKey; |
90 | class SortKeyByteSink; | |
91 | class UnicodeSet; | |
92 | class UnicodeString; | |
93 | class UVector64; | |
b75a7d8f A |
94 | |
95 | /** | |
57a6839d | 96 | * The RuleBasedCollator class provides the implementation of |
b75a7d8f A |
97 | * Collator, using data-driven tables. The user can create a customized |
98 | * table-based collation. | |
b75a7d8f | 99 | * <p> |
2ca993e8 | 100 | * For more information about the collation service see |
57a6839d | 101 | * <a href="http://userguide.icu-project.org/collation">the User Guide</a>. |
b75a7d8f | 102 | * <p> |
2ca993e8 | 103 | * Collation service provides correct sorting orders for most locales supported in ICU. |
b75a7d8f | 104 | * If specific data for a locale is not available, the order eventually falls back |
2ca993e8 | 105 | * to the <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>. |
b75a7d8f A |
106 | * <p> |
107 | * Sort ordering may be customized by providing your own set of rules. For more on | |
57a6839d A |
108 | * this subject see the <a href="http://userguide.icu-project.org/collation/customization"> |
109 | * Collation Customization</a> section of the User Guide. | |
b75a7d8f A |
110 | * <p> |
111 | * Note, RuleBasedCollator is not to be subclassed. | |
112 | * @see Collator | |
b75a7d8f | 113 | */ |
57a6839d | 114 | class U_I18N_API RuleBasedCollator : public Collator { |
b75a7d8f | 115 | public: |
374ca955 A |
116 | /** |
117 | * RuleBasedCollator constructor. This takes the table rules and builds a | |
118 | * collation table out of them. Please see RuleBasedCollator class | |
119 | * description for more details on the collation rule syntax. | |
120 | * @param rules the collation rules to build the collation table from. | |
121 | * @param status reporting a success or an error. | |
374ca955 A |
122 | * @stable ICU 2.0 |
123 | */ | |
b75a7d8f A |
124 | RuleBasedCollator(const UnicodeString& rules, UErrorCode& status); |
125 | ||
374ca955 A |
126 | /** |
127 | * RuleBasedCollator constructor. This takes the table rules and builds a | |
128 | * collation table out of them. Please see RuleBasedCollator class | |
129 | * description for more details on the collation rule syntax. | |
130 | * @param rules the collation rules to build the collation table from. | |
b331163b | 131 | * @param collationStrength strength for comparison |
374ca955 | 132 | * @param status reporting a success or an error. |
374ca955 A |
133 | * @stable ICU 2.0 |
134 | */ | |
135 | RuleBasedCollator(const UnicodeString& rules, | |
b75a7d8f A |
136 | ECollationStrength collationStrength, |
137 | UErrorCode& status); | |
138 | ||
374ca955 A |
139 | /** |
140 | * RuleBasedCollator constructor. This takes the table rules and builds a | |
141 | * collation table out of them. Please see RuleBasedCollator class | |
142 | * description for more details on the collation rule syntax. | |
143 | * @param rules the collation rules to build the collation table from. | |
144 | * @param decompositionMode the normalisation mode | |
145 | * @param status reporting a success or an error. | |
374ca955 A |
146 | * @stable ICU 2.0 |
147 | */ | |
148 | RuleBasedCollator(const UnicodeString& rules, | |
b75a7d8f A |
149 | UColAttributeValue decompositionMode, |
150 | UErrorCode& status); | |
151 | ||
374ca955 A |
152 | /** |
153 | * RuleBasedCollator constructor. This takes the table rules and builds a | |
154 | * collation table out of them. Please see RuleBasedCollator class | |
155 | * description for more details on the collation rule syntax. | |
156 | * @param rules the collation rules to build the collation table from. | |
b331163b | 157 | * @param collationStrength strength for comparison |
374ca955 A |
158 | * @param decompositionMode the normalisation mode |
159 | * @param status reporting a success or an error. | |
374ca955 A |
160 | * @stable ICU 2.0 |
161 | */ | |
162 | RuleBasedCollator(const UnicodeString& rules, | |
b75a7d8f A |
163 | ECollationStrength collationStrength, |
164 | UColAttributeValue decompositionMode, | |
165 | UErrorCode& status); | |
166 | ||
2ca993e8 | 167 | #ifndef U_HIDE_INTERNAL_API |
57a6839d A |
168 | /** |
169 | * TODO: document & propose as public API | |
170 | * @internal | |
171 | */ | |
172 | RuleBasedCollator(const UnicodeString &rules, | |
173 | UParseError &parseError, UnicodeString &reason, | |
174 | UErrorCode &errorCode); | |
175 | #endif /* U_HIDE_INTERNAL_API */ | |
176 | ||
374ca955 A |
177 | /** |
178 | * Copy constructor. | |
179 | * @param other the RuleBasedCollator object to be copied | |
374ca955 A |
180 | * @stable ICU 2.0 |
181 | */ | |
b75a7d8f A |
182 | RuleBasedCollator(const RuleBasedCollator& other); |
183 | ||
73c04bcf A |
184 | |
185 | /** Opens a collator from a collator binary image created using | |
2ca993e8 A |
186 | * cloneBinary. Binary image used in instantiation of the |
187 | * collator remains owned by the user and should stay around for | |
73c04bcf | 188 | * the lifetime of the collator. The API also takes a base collator |
b331163b | 189 | * which must be the root collator. |
73c04bcf A |
190 | * @param bin binary image owned by the user and required through the |
191 | * lifetime of the collator | |
192 | * @param length size of the image. If negative, the API will try to | |
193 | * figure out the length of the image | |
b331163b A |
194 | * @param base Base collator, for lookup of untailored characters. |
195 | * Must be the root collator, must not be NULL. | |
196 | * The base is required to be present through the lifetime of the collator. | |
73c04bcf A |
197 | * @param status for catching errors |
198 | * @return newly created collator | |
199 | * @see cloneBinary | |
46f4442e | 200 | * @stable ICU 3.4 |
73c04bcf | 201 | */ |
2ca993e8 A |
202 | RuleBasedCollator(const uint8_t *bin, int32_t length, |
203 | const RuleBasedCollator *base, | |
73c04bcf | 204 | UErrorCode &status); |
b75a7d8f | 205 | |
374ca955 A |
206 | /** |
207 | * Destructor. | |
208 | * @stable ICU 2.0 | |
209 | */ | |
b75a7d8f A |
210 | virtual ~RuleBasedCollator(); |
211 | ||
374ca955 A |
212 | /** |
213 | * Assignment operator. | |
57a6839d | 214 | * @param other other RuleBasedCollator object to copy from. |
374ca955 A |
215 | * @stable ICU 2.0 |
216 | */ | |
b75a7d8f A |
217 | RuleBasedCollator& operator=(const RuleBasedCollator& other); |
218 | ||
374ca955 A |
219 | /** |
220 | * Returns true if argument is the same as this object. | |
221 | * @param other Collator object to be compared. | |
222 | * @return true if argument is the same as this object. | |
223 | * @stable ICU 2.0 | |
224 | */ | |
225 | virtual UBool operator==(const Collator& other) const; | |
226 | ||
227 | /** | |
51004dcb A |
228 | * Makes a copy of this object. |
229 | * @return a copy of this object, owned by the caller | |
374ca955 A |
230 | * @stable ICU 2.0 |
231 | */ | |
232 | virtual Collator* clone(void) const; | |
233 | ||
234 | /** | |
235 | * Creates a collation element iterator for the source string. The caller of | |
236 | * this method is responsible for the memory management of the return | |
237 | * pointer. | |
238 | * @param source the string over which the CollationElementIterator will | |
239 | * iterate. | |
240 | * @return the collation element iterator of the source string using this as | |
241 | * the base Collator. | |
242 | * @stable ICU 2.2 | |
243 | */ | |
b75a7d8f A |
244 | virtual CollationElementIterator* createCollationElementIterator( |
245 | const UnicodeString& source) const; | |
246 | ||
374ca955 A |
247 | /** |
248 | * Creates a collation element iterator for the source. The caller of this | |
249 | * method is responsible for the memory management of the returned pointer. | |
250 | * @param source the CharacterIterator which produces the characters over | |
251 | * which the CollationElementIterator will iterate. | |
252 | * @return the collation element iterator of the source using this as the | |
253 | * base Collator. | |
254 | * @stable ICU 2.2 | |
255 | */ | |
256 | virtual CollationElementIterator* createCollationElementIterator( | |
257 | const CharacterIterator& source) const; | |
258 | ||
51004dcb A |
259 | // Make deprecated versions of Collator::compare() visible. |
260 | using Collator::compare; | |
374ca955 A |
261 | |
262 | /** | |
263 | * The comparison function compares the character data stored in two | |
2ca993e8 | 264 | * different strings. Returns information about whether a string is less |
374ca955 A |
265 | * than, greater than or equal to another string. |
266 | * @param source the source string to be compared with. | |
267 | * @param target the string that is to be compared with the source string. | |
268 | * @param status possible error code | |
269 | * @return Returns an enum value. UCOL_GREATER if source is greater | |
270 | * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less | |
271 | * than target | |
272 | * @stable ICU 2.6 | |
273 | **/ | |
274 | virtual UCollationResult compare(const UnicodeString& source, | |
57a6839d A |
275 | const UnicodeString& target, |
276 | UErrorCode &status) const; | |
374ca955 | 277 | |
374ca955 | 278 | /** |
2ca993e8 | 279 | * Does the same thing as compare but limits the comparison to a specified |
374ca955 A |
280 | * length |
281 | * @param source the source string to be compared with. | |
282 | * @param target the string that is to be compared with the source string. | |
283 | * @param length the length the comparison is limited to | |
284 | * @param status possible error code | |
2ca993e8 A |
285 | * @return Returns an enum value. UCOL_GREATER if source (up to the specified |
286 | * length) is greater than target; UCOL_EQUAL if source (up to specified | |
287 | * length) is equal to target; UCOL_LESS if source (up to the specified | |
374ca955 A |
288 | * length) is less than target. |
289 | * @stable ICU 2.6 | |
290 | */ | |
291 | virtual UCollationResult compare(const UnicodeString& source, | |
57a6839d A |
292 | const UnicodeString& target, |
293 | int32_t length, | |
294 | UErrorCode &status) const; | |
374ca955 | 295 | |
374ca955 A |
296 | /** |
297 | * The comparison function compares the character data stored in two | |
2ca993e8 | 298 | * different string arrays. Returns information about whether a string array |
374ca955 A |
299 | * is less than, greater than or equal to another string array. |
300 | * @param source the source string array to be compared with. | |
301 | * @param sourceLength the length of the source string array. If this value | |
302 | * is equal to -1, the string array is null-terminated. | |
303 | * @param target the string that is to be compared with the source string. | |
304 | * @param targetLength the length of the target string array. If this value | |
305 | * is equal to -1, the string array is null-terminated. | |
306 | * @param status possible error code | |
307 | * @return Returns an enum value. UCOL_GREATER if source is greater | |
308 | * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less | |
309 | * than target | |
310 | * @stable ICU 2.6 | |
311 | */ | |
f3c0d7a5 A |
312 | virtual UCollationResult compare(const char16_t* source, int32_t sourceLength, |
313 | const char16_t* target, int32_t targetLength, | |
57a6839d | 314 | UErrorCode &status) const; |
374ca955 | 315 | |
729e4ab9 A |
316 | /** |
317 | * Compares two strings using the Collator. | |
318 | * Returns whether the first one compares less than/equal to/greater than | |
319 | * the second one. | |
320 | * This version takes UCharIterator input. | |
321 | * @param sIter the first ("source") string iterator | |
322 | * @param tIter the second ("target") string iterator | |
323 | * @param status ICU status | |
324 | * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER | |
325 | * @stable ICU 4.2 | |
326 | */ | |
327 | virtual UCollationResult compare(UCharIterator &sIter, | |
328 | UCharIterator &tIter, | |
329 | UErrorCode &status) const; | |
330 | ||
57a6839d A |
331 | /** |
332 | * Compares two UTF-8 strings using the Collator. | |
333 | * Returns whether the first one compares less than/equal to/greater than | |
334 | * the second one. | |
335 | * This version takes UTF-8 input. | |
336 | * Note that a StringPiece can be implicitly constructed | |
337 | * from a std::string or a NUL-terminated const char * string. | |
338 | * @param source the first UTF-8 string | |
339 | * @param target the second UTF-8 string | |
340 | * @param status ICU status | |
341 | * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER | |
342 | * @stable ICU 51 | |
343 | */ | |
344 | virtual UCollationResult compareUTF8(const StringPiece &source, | |
345 | const StringPiece &target, | |
346 | UErrorCode &status) const; | |
347 | ||
374ca955 | 348 | /** |
b331163b A |
349 | * Transforms the string into a series of characters |
350 | * that can be compared with CollationKey.compare(). | |
351 | * | |
2ca993e8 | 352 | * Note that sort keys are often less efficient than simply doing comparison. |
b331163b A |
353 | * For more details, see the ICU User Guide. |
354 | * | |
355 | * @param source the source string. | |
356 | * @param key the transformed key of the source string. | |
357 | * @param status the error code status. | |
358 | * @return the transformed key. | |
359 | * @see CollationKey | |
360 | * @stable ICU 2.0 | |
361 | */ | |
374ca955 A |
362 | virtual CollationKey& getCollationKey(const UnicodeString& source, |
363 | CollationKey& key, | |
364 | UErrorCode& status) const; | |
365 | ||
366 | /** | |
b331163b A |
367 | * Transforms a specified region of the string into a series of characters |
368 | * that can be compared with CollationKey.compare. | |
369 | * | |
2ca993e8 | 370 | * Note that sort keys are often less efficient than simply doing comparison. |
b331163b A |
371 | * For more details, see the ICU User Guide. |
372 | * | |
373 | * @param source the source string. | |
374 | * @param sourceLength the length of the source string. | |
375 | * @param key the transformed key of the source string. | |
376 | * @param status the error code status. | |
377 | * @return the transformed key. | |
378 | * @see CollationKey | |
379 | * @stable ICU 2.0 | |
380 | */ | |
f3c0d7a5 | 381 | virtual CollationKey& getCollationKey(const char16_t *source, |
374ca955 A |
382 | int32_t sourceLength, |
383 | CollationKey& key, | |
384 | UErrorCode& status) const; | |
385 | ||
386 | /** | |
387 | * Generates the hash code for the rule-based collation object. | |
388 | * @return the hash code. | |
389 | * @stable ICU 2.0 | |
390 | */ | |
57a6839d | 391 | virtual int32_t hashCode() const; |
374ca955 A |
392 | |
393 | /** | |
394 | * Gets the locale of the Collator | |
395 | * @param type can be either requested, valid or actual locale. For more | |
396 | * information see the definition of ULocDataLocaleType in | |
397 | * uloc.h | |
398 | * @param status the error code status. | |
399 | * @return locale where the collation data lives. If the collator | |
400 | * was instantiated from rules, locale is empty. | |
401 | * @deprecated ICU 2.8 likely to change in ICU 3.0, based on feedback | |
402 | */ | |
51004dcb | 403 | virtual Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const; |
374ca955 A |
404 | |
405 | /** | |
51004dcb A |
406 | * Gets the tailoring rules for this collator. |
407 | * @return the collation tailoring from which this collator was created | |
374ca955 A |
408 | * @stable ICU 2.0 |
409 | */ | |
57a6839d | 410 | const UnicodeString& getRules() const; |
374ca955 A |
411 | |
412 | /** | |
413 | * Gets the version information for a Collator. | |
414 | * @param info the version # information, the result will be filled in | |
415 | * @stable ICU 2.0 | |
416 | */ | |
417 | virtual void getVersion(UVersionInfo info) const; | |
418 | ||
2ca993e8 | 419 | #ifndef U_HIDE_DEPRECATED_API |
374ca955 | 420 | /** |
51004dcb | 421 | * Returns the maximum length of any expansion sequences that end with the |
374ca955 | 422 | * specified comparison order. |
51004dcb A |
423 | * |
424 | * This is specific to the kind of collation element values and sequences | |
425 | * returned by the CollationElementIterator. | |
426 | * Call CollationElementIterator::getMaxExpansion() instead. | |
427 | * | |
428 | * @param order a collation order returned by CollationElementIterator::previous | |
429 | * or CollationElementIterator::next. | |
374ca955 | 430 | * @return maximum size of the expansion sequences ending with the collation |
51004dcb | 431 | * element, or 1 if the collation element does not occur at the end of |
374ca955 A |
432 | * any expansion sequence |
433 | * @see CollationElementIterator#getMaxExpansion | |
51004dcb | 434 | * @deprecated ICU 51 Use CollationElementIterator::getMaxExpansion() instead. |
374ca955 A |
435 | */ |
436 | int32_t getMaxExpansion(int32_t order) const; | |
51004dcb | 437 | #endif /* U_HIDE_DEPRECATED_API */ |
374ca955 A |
438 | |
439 | /** | |
440 | * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. This | |
441 | * method is to implement a simple version of RTTI, since not all C++ | |
442 | * compilers support genuine RTTI. Polymorphic operator==() and clone() | |
443 | * methods call this method. | |
444 | * @return The class ID for this object. All objects of a given class have | |
445 | * the same class ID. Objects of other classes have different class | |
446 | * IDs. | |
447 | * @stable ICU 2.0 | |
448 | */ | |
449 | virtual UClassID getDynamicClassID(void) const; | |
450 | ||
451 | /** | |
452 | * Returns the class ID for this class. This is useful only for comparing to | |
453 | * a return value from getDynamicClassID(). For example: | |
454 | * <pre> | |
455 | * Base* polymorphic_pointer = createPolymorphicObject(); | |
456 | * if (polymorphic_pointer->getDynamicClassID() == | |
457 | * Derived::getStaticClassID()) ... | |
458 | * </pre> | |
459 | * @return The class ID for all objects of this class. | |
460 | * @stable ICU 2.0 | |
461 | */ | |
462 | static UClassID U_EXPORT2 getStaticClassID(void); | |
463 | ||
2ca993e8 | 464 | #ifndef U_HIDE_DEPRECATED_API |
374ca955 | 465 | /** |
57a6839d A |
466 | * Do not use this method: The caller and the ICU library might use different heaps. |
467 | * Use cloneBinary() instead which writes to caller-provided memory. | |
468 | * | |
469 | * Returns a binary format of this collator. | |
374ca955 A |
470 | * @param length Returns the length of the data, in bytes |
471 | * @param status the error code status. | |
472 | * @return memory, owned by the caller, of size 'length' bytes. | |
57a6839d | 473 | * @deprecated ICU 52. Use cloneBinary() instead. |
374ca955 | 474 | */ |
57a6839d A |
475 | uint8_t *cloneRuleData(int32_t &length, UErrorCode &status) const; |
476 | #endif /* U_HIDE_DEPRECATED_API */ | |
73c04bcf | 477 | |
2ca993e8 | 478 | /** Creates a binary image of a collator. This binary image can be stored and |
73c04bcf A |
479 | * later used to instantiate a collator using ucol_openBinary. |
480 | * This API supports preflighting. | |
481 | * @param buffer a fill-in buffer to receive the binary image | |
482 | * @param capacity capacity of the destination buffer | |
483 | * @param status for catching errors | |
484 | * @return size of the image | |
485 | * @see ucol_openBinary | |
46f4442e | 486 | * @stable ICU 3.4 |
73c04bcf | 487 | */ |
57a6839d | 488 | int32_t cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) const; |
73c04bcf | 489 | |
374ca955 A |
490 | /** |
491 | * Returns current rules. Delta defines whether full rules are returned or | |
492 | * just the tailoring. | |
51004dcb A |
493 | * |
494 | * getRules(void) should normally be used instead. | |
495 | * See http://userguide.icu-project.org/collation/customization#TOC-Building-on-Existing-Locales | |
374ca955 A |
496 | * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES. |
497 | * @param buffer UnicodeString to store the result rules | |
498 | * @stable ICU 2.2 | |
51004dcb | 499 | * @see UCOL_FULL_RULES |
374ca955 | 500 | */ |
57a6839d | 501 | void getRules(UColRuleOption delta, UnicodeString &buffer) const; |
374ca955 A |
502 | |
503 | /** | |
504 | * Universal attribute setter | |
505 | * @param attr attribute type | |
506 | * @param value attribute value | |
507 | * @param status to indicate whether the operation went on smoothly or there were errors | |
508 | * @stable ICU 2.2 | |
509 | */ | |
510 | virtual void setAttribute(UColAttribute attr, UColAttributeValue value, | |
511 | UErrorCode &status); | |
512 | ||
513 | /** | |
514 | * Universal attribute getter. | |
515 | * @param attr attribute type | |
516 | * @param status to indicate whether the operation went on smoothly or there were errors | |
517 | * @return attribute value | |
518 | * @stable ICU 2.2 | |
519 | */ | |
520 | virtual UColAttributeValue getAttribute(UColAttribute attr, | |
51004dcb | 521 | UErrorCode &status) const; |
374ca955 A |
522 | |
523 | /** | |
57a6839d A |
524 | * Sets the variable top to the top of the specified reordering group. |
525 | * The variable top determines the highest-sorting character | |
526 | * which is affected by UCOL_ALTERNATE_HANDLING. | |
527 | * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect. | |
528 | * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION, | |
529 | * UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY; | |
530 | * or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group | |
531 | * @param errorCode Standard ICU error code. Its input value must | |
532 | * pass the U_SUCCESS() test, or else the function returns | |
533 | * immediately. Check for U_FAILURE() on output or use with | |
534 | * function chaining. (See User Guide for details.) | |
535 | * @return *this | |
536 | * @see getMaxVariable | |
b331163b | 537 | * @stable ICU 53 |
57a6839d A |
538 | */ |
539 | virtual Collator &setMaxVariable(UColReorderCode group, UErrorCode &errorCode); | |
540 | ||
541 | /** | |
542 | * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING. | |
543 | * @return the maximum variable reordering group. | |
544 | * @see setMaxVariable | |
b331163b | 545 | * @stable ICU 53 |
57a6839d A |
546 | */ |
547 | virtual UColReorderCode getMaxVariable() const; | |
548 | ||
549 | /** | |
550 | * Sets the variable top to the primary weight of the specified string. | |
551 | * | |
552 | * Beginning with ICU 53, the variable top is pinned to | |
553 | * the top of one of the supported reordering groups, | |
554 | * and it must not be beyond the last of those groups. | |
555 | * See setMaxVariable(). | |
f3c0d7a5 | 556 | * @param varTop one or more (if contraction) char16_ts to which the variable top should be set |
374ca955 A |
557 | * @param len length of variable top string. If -1 it is considered to be zero terminated. |
558 | * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br> | |
57a6839d A |
559 | * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br> |
560 | * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond | |
561 | * the last reordering group supported by setMaxVariable() | |
562 | * @return variable top primary weight | |
563 | * @deprecated ICU 53 Call setMaxVariable() instead. | |
374ca955 | 564 | */ |
f3c0d7a5 | 565 | virtual uint32_t setVariableTop(const char16_t *varTop, int32_t len, UErrorCode &status); |
374ca955 A |
566 | |
567 | /** | |
57a6839d A |
568 | * Sets the variable top to the primary weight of the specified string. |
569 | * | |
570 | * Beginning with ICU 53, the variable top is pinned to | |
571 | * the top of one of the supported reordering groups, | |
572 | * and it must not be beyond the last of those groups. | |
573 | * See setMaxVariable(). | |
f3c0d7a5 | 574 | * @param varTop a UnicodeString size 1 or more (if contraction) of char16_ts to which the variable top should be set |
374ca955 | 575 | * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br> |
57a6839d A |
576 | * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br> |
577 | * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond | |
578 | * the last reordering group supported by setMaxVariable() | |
579 | * @return variable top primary weight | |
580 | * @deprecated ICU 53 Call setMaxVariable() instead. | |
374ca955 | 581 | */ |
51004dcb | 582 | virtual uint32_t setVariableTop(const UnicodeString &varTop, UErrorCode &status); |
374ca955 A |
583 | |
584 | /** | |
57a6839d A |
585 | * Sets the variable top to the specified primary weight. |
586 | * | |
587 | * Beginning with ICU 53, the variable top is pinned to | |
588 | * the top of one of the supported reordering groups, | |
589 | * and it must not be beyond the last of those groups. | |
590 | * See setMaxVariable(). | |
591 | * @param varTop primary weight, as returned by setVariableTop or ucol_getVariableTop | |
592 | * @param status error code | |
593 | * @deprecated ICU 53 Call setMaxVariable() instead. | |
374ca955 | 594 | */ |
51004dcb | 595 | virtual void setVariableTop(uint32_t varTop, UErrorCode &status); |
374ca955 A |
596 | |
597 | /** | |
598 | * Gets the variable top value of a Collator. | |
374ca955 | 599 | * @param status error code (not changed by function). If error code is set, the return value is undefined. |
57a6839d A |
600 | * @return the variable top primary weight |
601 | * @see getMaxVariable | |
374ca955 A |
602 | * @stable ICU 2.0 |
603 | */ | |
604 | virtual uint32_t getVariableTop(UErrorCode &status) const; | |
605 | ||
606 | /** | |
2ca993e8 | 607 | * Get a UnicodeSet that contains all the characters and sequences tailored in |
374ca955 A |
608 | * this collator. |
609 | * @param status error code of the operation | |
2ca993e8 | 610 | * @return a pointer to a UnicodeSet object containing all the |
374ca955 | 611 | * code points and sequences that may sort differently than |
57a6839d | 612 | * in the root collator. The object must be disposed of by using delete |
374ca955 A |
613 | * @stable ICU 2.4 |
614 | */ | |
615 | virtual UnicodeSet *getTailoredSet(UErrorCode &status) const; | |
616 | ||
374ca955 | 617 | /** |
57a6839d | 618 | * Get the sort key as an array of bytes from a UnicodeString. |
b331163b | 619 | * |
2ca993e8 | 620 | * Note that sort keys are often less efficient than simply doing comparison. |
b331163b A |
621 | * For more details, see the ICU User Guide. |
622 | * | |
374ca955 A |
623 | * @param source string to be processed. |
624 | * @param result buffer to store result in. If NULL, number of bytes needed | |
625 | * will be returned. | |
626 | * @param resultLength length of the result buffer. If not enough, the | |
627 | * buffer will be filled to capacity. | |
628 | * @return Number of bytes needed for storing the sort key | |
629 | * @stable ICU 2.0 | |
630 | */ | |
631 | virtual int32_t getSortKey(const UnicodeString& source, uint8_t *result, | |
632 | int32_t resultLength) const; | |
633 | ||
634 | /** | |
f3c0d7a5 | 635 | * Get the sort key as an array of bytes from a char16_t buffer. |
b331163b | 636 | * |
2ca993e8 | 637 | * Note that sort keys are often less efficient than simply doing comparison. |
b331163b A |
638 | * For more details, see the ICU User Guide. |
639 | * | |
374ca955 A |
640 | * @param source string to be processed. |
641 | * @param sourceLength length of string to be processed. If -1, the string | |
642 | * is 0 terminated and length will be decided by the function. | |
643 | * @param result buffer to store result in. If NULL, number of bytes needed | |
644 | * will be returned. | |
645 | * @param resultLength length of the result buffer. If not enough, the | |
646 | * buffer will be filled to capacity. | |
647 | * @return Number of bytes needed for storing the sort key | |
648 | * @stable ICU 2.2 | |
649 | */ | |
f3c0d7a5 | 650 | virtual int32_t getSortKey(const char16_t *source, int32_t sourceLength, |
374ca955 A |
651 | uint8_t *result, int32_t resultLength) const; |
652 | ||
729e4ab9 | 653 | /** |
4388f060 | 654 | * Retrieves the reordering codes for this collator. |
729e4ab9 | 655 | * @param dest The array to fill with the script ordering. |
4388f060 | 656 | * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function |
b331163b | 657 | * will only return the length of the result without writing any codes (pre-flighting). |
4388f060 A |
658 | * @param status A reference to an error code value, which must not indicate |
659 | * a failure before the function call. | |
660 | * @return The length of the script ordering array. | |
661 | * @see ucol_setReorderCodes | |
662 | * @see Collator#getEquivalentReorderCodes | |
663 | * @see Collator#setReorderCodes | |
2ca993e8 | 664 | * @stable ICU 4.8 |
729e4ab9 | 665 | */ |
51004dcb A |
666 | virtual int32_t getReorderCodes(int32_t *dest, |
667 | int32_t destCapacity, | |
668 | UErrorCode& status) const; | |
729e4ab9 A |
669 | |
670 | /** | |
4388f060 | 671 | * Sets the ordering of scripts for this collator. |
2ca993e8 | 672 | * @param reorderCodes An array of script codes in the new order. This can be NULL if the |
4388f060 | 673 | * length is also set to 0. An empty array will clear any reordering codes on the collator. |
729e4ab9 | 674 | * @param reorderCodesLength The length of reorderCodes. |
4388f060 | 675 | * @param status error code |
b331163b | 676 | * @see ucol_setReorderCodes |
4388f060 A |
677 | * @see Collator#getReorderCodes |
678 | * @see Collator#getEquivalentReorderCodes | |
2ca993e8 | 679 | * @stable ICU 4.8 |
4388f060 | 680 | */ |
51004dcb A |
681 | virtual void setReorderCodes(const int32_t* reorderCodes, |
682 | int32_t reorderCodesLength, | |
683 | UErrorCode& status) ; | |
4388f060 A |
684 | |
685 | /** | |
57a6839d A |
686 | * Implements ucol_strcollUTF8(). |
687 | * @internal | |
729e4ab9 | 688 | */ |
57a6839d A |
689 | virtual UCollationResult internalCompareUTF8( |
690 | const char *left, int32_t leftLength, | |
691 | const char *right, int32_t rightLength, | |
692 | UErrorCode &errorCode) const; | |
374ca955 | 693 | |
57a6839d | 694 | /** Get the short definition string for a collator. This internal API harvests the collator's |
2ca993e8 | 695 | * locale and the attribute set and produces a string that can be used for opening |
57a6839d A |
696 | * a collator with the same attributes using the ucol_openFromShortString API. |
697 | * This string will be normalized. | |
698 | * The structure and the syntax of the string are defined in the "Naming collators" | |
2ca993e8 | 699 | * section of the User Guide: |
57a6839d A |
700 | * http://userguide.icu-project.org/collation/concepts#TOC-Collator-naming-scheme |
701 | * This function supports preflighting. | |
2ca993e8 | 702 | * |
57a6839d A |
703 | * This is internal, and intended to be used with delegate converters. |
704 | * | |
705 | * @param locale a locale that will appear as the collator's locale in the resulting | |
2ca993e8 | 706 | * short string definition. If NULL, the locale will be harvested |
57a6839d A |
707 | * from the collator. |
708 | * @param buffer space to hold the resulting string | |
709 | * @param capacity capacity of the buffer | |
710 | * @param status for returning errors. All the preflighting errors are reported as well | |
711 | * @return length of the resulting string | |
712 | * @see ucol_openFromShortString | |
713 | * @see ucol_normalizeShortDefinitionString | |
714 | * @see ucol_getShortDefinitionString | |
715 | * @internal | |
716 | */ | |
717 | virtual int32_t internalGetShortDefinitionString(const char *locale, | |
718 | char *buffer, | |
719 | int32_t capacity, | |
720 | UErrorCode &status) const; | |
374ca955 A |
721 | |
722 | /** | |
57a6839d A |
723 | * Implements ucol_nextSortKeyPart(). |
724 | * @internal | |
725 | */ | |
726 | virtual int32_t internalNextSortKeyPart( | |
727 | UCharIterator *iter, uint32_t state[2], | |
728 | uint8_t *dest, int32_t count, UErrorCode &errorCode) const; | |
374ca955 | 729 | |
2ca993e8 | 730 | // Do not enclose the default constructor with #ifndef U_HIDE_INTERNAL_API |
374ca955 | 731 | /** |
57a6839d A |
732 | * Only for use in ucol_openRules(). |
733 | * @internal | |
374ca955 A |
734 | */ |
735 | RuleBasedCollator(); | |
736 | ||
b331163b | 737 | #ifndef U_HIDE_INTERNAL_API |
374ca955 | 738 | /** |
57a6839d A |
739 | * Implements ucol_getLocaleByType(). |
740 | * Needed because the lifetime of the locale ID string must match that of the collator. | |
741 | * getLocale() returns a copy of a Locale, with minimal lifetime in a C wrapper. | |
742 | * @internal | |
374ca955 | 743 | */ |
57a6839d | 744 | const char *internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const; |
374ca955 A |
745 | |
746 | /** | |
57a6839d A |
747 | * Implements ucol_getContractionsAndExpansions(). |
748 | * Gets this collator's sets of contraction strings and/or | |
749 | * characters and strings that map to multiple collation elements (expansions). | |
750 | * If addPrefixes is TRUE, then contractions that are expressed as | |
751 | * prefix/pre-context rules are included. | |
752 | * @param contractions if not NULL, the set to hold the contractions | |
753 | * @param expansions if not NULL, the set to hold the expansions | |
754 | * @param addPrefixes include prefix contextual mappings | |
755 | * @param errorCode in/out ICU error code | |
756 | * @internal | |
374ca955 | 757 | */ |
57a6839d A |
758 | void internalGetContractionsAndExpansions( |
759 | UnicodeSet *contractions, UnicodeSet *expansions, | |
760 | UBool addPrefixes, UErrorCode &errorCode) const; | |
374ca955 A |
761 | |
762 | /** | |
57a6839d A |
763 | * Adds the contractions that start with character c to the set. |
764 | * Ignores prefixes. Used by AlphabeticIndex. | |
765 | * @internal | |
766 | */ | |
767 | void internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const; | |
374ca955 A |
768 | |
769 | /** | |
57a6839d A |
770 | * Implements from-rule constructors, and ucol_openRules(). |
771 | * @internal | |
772 | */ | |
773 | void internalBuildTailoring( | |
774 | const UnicodeString &rules, | |
775 | int32_t strength, | |
776 | UColAttributeValue decompositionMode, | |
777 | UParseError *outParseError, UnicodeString *outReason, | |
778 | UErrorCode &errorCode); | |
374ca955 | 779 | |
57a6839d A |
780 | /** Passes uc through fromUCollator() and dynamic_casts the result to RuleBasedCollator *; the cast yields a null pointer if that object is not a RuleBasedCollator. @internal */ |
781 | static inline RuleBasedCollator *rbcFromUCollator(UCollator *uc) { |
782 | return dynamic_cast<RuleBasedCollator *>(fromUCollator(uc)); |
783 | } |
784 | /** Const overload: passes uc through fromUCollator() and dynamic_casts the result to const RuleBasedCollator *; the cast yields a null pointer if that object is not a RuleBasedCollator. @internal */ |
785 | static inline const RuleBasedCollator *rbcFromUCollator(const UCollator *uc) { |
786 | return dynamic_cast<const RuleBasedCollator *>(fromUCollator(uc)); |
787 | } |
b75a7d8f | 788 | |
374ca955 | 789 | /** |
57a6839d A |
790 | * Appends the CEs for the string to the vector. |
791 | * @internal for tests & tools | |
792 | */ | |
793 | void internalGetCEs(const UnicodeString &str, UVector64 &ces, UErrorCode &errorCode) const; | |
794 | #endif // U_HIDE_INTERNAL_API | |
b75a7d8f A |
795 | |
796 | protected: | |
374ca955 | 797 | /** |
57a6839d A |
798 | * Used internally by registration to define the requested and valid locales. |
799 | * @param requestedLocale the requested locale | |
374ca955 | 800 | * @param validLocale the valid locale |
46f4442e | 801 | * @param actualLocale the actual locale |
374ca955 A |
802 | * @internal |
803 | */ | |
46f4442e | 804 | virtual void setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale); |
b75a7d8f A |
805 | |
806 | private: | |
57a6839d A |
807 | friend class CollationElementIterator; |
808 | friend class Collator; | |
374ca955 | 809 | |
b331163b | 810 | RuleBasedCollator(const CollationCacheEntry *entry); |
374ca955 | 811 | |
57a6839d A |
812 | /** |
813 | * Enumeration of attributes that are relevant for short definition strings | |
814 | * (e.g., ucol_getShortDefinitionString()). | |
815 | * Effectively extends UColAttribute. | |
4388f060 | 816 | */ |
57a6839d A |
817 | enum Attributes { |
818 | ATTR_VARIABLE_TOP = UCOL_ATTRIBUTE_COUNT, | |
819 | ATTR_LIMIT | |
820 | }; | |
821 | ||
b331163b | 822 | void adoptTailoring(CollationTailoring *t, UErrorCode &errorCode); |
b75a7d8f | 823 | |
57a6839d | 824 | // Both lengths must be <0 or else both must be >=0. |
f3c0d7a5 A |
825 | UCollationResult doCompare(const char16_t *left, int32_t leftLength, |
826 | const char16_t *right, int32_t rightLength, | |
57a6839d A |
827 | UErrorCode &errorCode) const; |
828 | UCollationResult doCompare(const uint8_t *left, int32_t leftLength, | |
829 | const uint8_t *right, int32_t rightLength, | |
830 | UErrorCode &errorCode) const; | |
b75a7d8f | 831 | |
f3c0d7a5 | 832 | void writeSortKey(const char16_t *s, int32_t length, |
57a6839d | 833 | SortKeyByteSink &sink, UErrorCode &errorCode) const; |
b75a7d8f | 834 | |
f3c0d7a5 | 835 | void writeIdenticalLevel(const char16_t *s, const char16_t *limit, |
57a6839d | 836 | SortKeyByteSink &sink, UErrorCode &errorCode) const; |
b75a7d8f | 837 | |
57a6839d | 838 | const CollationSettings &getDefaultSettings() const; |
73c04bcf | 839 | |
57a6839d A |
840 | void setAttributeDefault(int32_t attribute) { /* clears this attribute's bit in explicitlySetAttributes */ |
841 | explicitlySetAttributes = explicitlySetAttributes & ~(static_cast<uint32_t>(1) << attribute); |
842 | } |
843 | void setAttributeExplicitly(int32_t attribute) { /* sets this attribute's bit in explicitlySetAttributes */ |
844 | explicitlySetAttributes = explicitlySetAttributes | (static_cast<uint32_t>(1) << attribute); |
845 | } |
846 | UBool attributeHasBeenSetExplicitly(int32_t attribute) const { |
847 | // Expected range (not checked here): 0 <= attribute && attribute < ATTR_LIMIT. |
848 | return static_cast<UBool>((explicitlySetAttributes & (static_cast<uint32_t>(1) << attribute)) != 0); |
b75a7d8f | 849 | } |
b75a7d8f | 850 | |
57a6839d A |
851 | /** |
852 | * Tests whether a character is "unsafe" for use as a collation starting point. | |
853 | * | |
854 | * @param c code point or code unit | |
855 | * @return TRUE if c is unsafe | |
856 | * @see CollationElementIterator#setOffset(int) | |
857 | */ | |
858 | UBool isUnsafe(UChar32 c) const; | |
b75a7d8f | 859 | |
f3c0d7a5 | 860 | static void U_CALLCONV computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode); |
57a6839d A |
861 | UBool initMaxExpansions(UErrorCode &errorCode) const; |
862 | ||
863 | void setFastLatinOptions(CollationSettings &ownedSettings) const; | |
b75a7d8f | 864 | |
57a6839d A |
865 | const CollationData *data; |
866 | const CollationSettings *settings; // reference-counted | |
b331163b A |
867 | const CollationTailoring *tailoring; // alias of cacheEntry->tailoring |
868 | const CollationCacheEntry *cacheEntry; // reference-counted | |
57a6839d A |
869 | Locale validLocale; |
870 | uint32_t explicitlySetAttributes; | |
871 | ||
872 | UBool actualLocaleIsSameAsValid; | |
873 | }; | |
874 | ||
875 | U_NAMESPACE_END | |
f3c0d7a5 | 876 | #endif // U_SHOW_CPLUSPLUS_API |
b75a7d8f | 877 | |
57a6839d A |
878 | #endif // !UCONFIG_NO_COLLATION |
879 | #endif // TBLCOLL_H |