]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/unicode/coll.h
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / i18n / unicode / coll.h
CommitLineData
b75a7d8f
A
1/*
2******************************************************************************
374ca955 3* Copyright (C) 1996-2004, International Business Machines *
b75a7d8f
A
4* Corporation and others. All Rights Reserved. *
5******************************************************************************
6*/
7
8/**
9* File coll.h
374ca955 10*
b75a7d8f
A
11* Created by: Helena Shih
12*
13* Modification History:
14*
15* Date Name Description
16* 02/5/97 aliu Modified createDefault to load collation data from
17* binary files when possible. Added related methods
18* createCollationFromFile, chopLocale, createPathName.
19* 02/11/97 aliu Added members addToCache, findInCache, and fgCache.
20* 02/12/97 aliu Modified to create objects from RuleBasedCollator cache.
21* Moved cache out of Collation class.
22* 02/13/97 aliu Moved several methods out of this class and into
23* RuleBasedCollator, with modifications. Modified
24* createDefault() to call new RuleBasedCollator(Locale&)
25* constructor. General clean up and documentation.
26* 02/20/97 helena Added clone, operator==, operator!=, operator=, copy
27* constructor and getDynamicClassID.
28* 03/25/97 helena Updated with platform independent data types.
29* 05/06/97 helena Added memory allocation error detection.
30* 06/20/97 helena Java class name change.
31* 09/03/97 helena Added createCollationKeyValues().
32* 02/10/98 damiba Added compare() with length as parameter.
33* 04/23/99 stephen Removed EDecompositionMode, merged with
34* Normalizer::EMode.
374ca955 35* 11/02/99 helena Collator performance enhancements. Eliminates the
b75a7d8f
A
36* UnicodeString construction and special case for NO_OP.
37* 11/23/99 srl More performance enhancements. Inlining of
38* critical accessors.
374ca955
A
39* 05/15/00 helena Added version information API.
40* 01/29/01 synwee Modified into a C++ wrapper which calls C apis
41* (ucoll.h).
b75a7d8f
A
42*/
43
44#ifndef COLL_H
45#define COLL_H
46
47#include "unicode/utypes.h"
48
49#if !UCONFIG_NO_COLLATION
50
51#include "unicode/uobject.h"
52#include "unicode/ucol.h"
53#include "unicode/normlzr.h"
54#include "unicode/locid.h"
55#include "unicode/uniset.h"
56
57U_NAMESPACE_BEGIN
58
59class StringEnumeration;
60
374ca955 61#if !UCONFIG_NO_SERVICE
b75a7d8f 62/**
374ca955 63 * @stable ICU 2.6
b75a7d8f
A
64 */
65typedef const void* URegistryKey;
66
67/**
374ca955 68 * @stable ICU 2.6
b75a7d8f
A
69 */
70class CollatorFactory;
374ca955 71#endif
b75a7d8f
A
72
73/**
74* @stable ICU 2.0
75*/
76class CollationKey;
77
78/**
374ca955 79* The <code>Collator</code> class performs locale-sensitive string
b75a7d8f 80* comparison.<br>
374ca955 81* You use this class to build searching and sorting routines for natural
b75a7d8f 82* language text.<br>
374ca955
A
83* <em>Important: </em>The ICU collation service has been reimplemented
84* in order to achieve better performance and UCA compliance.
85* For details, see the
b75a7d8f
A
86* <a href="http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/collation/ICU_collation_design.htm">
87* collation design document</a>.
88* <p>
374ca955
A
89* <code>Collator</code> is an abstract base class. Subclasses implement
90* specific collation strategies. One subclass,
91* <code>RuleBasedCollator</code>, is currently provided and is applicable
92* to a wide set of languages. Other subclasses may be created to handle more
b75a7d8f
A
93* specialized needs.
94* <p>
374ca955
A
95* Like other locale-sensitive classes, you can use the static factory method,
96* <code>createInstance</code>, to obtain the appropriate
97* <code>Collator</code> object for a given locale. You will only need to
98* look at the subclasses of <code>Collator</code> if you need to
99* understand the details of a particular collation strategy or if you need to
b75a7d8f
A
100* modify that strategy.
101* <p>
374ca955 102* The following example shows how to compare two strings using the
b75a7d8f 103* <code>Collator</code> for the default locale.
374ca955 104* \htmlonly<blockquote>\endhtmlonly
b75a7d8f
A
105* <pre>
106* \code
107* // Compare two strings in the default locale
108* UErrorCode success = U_ZERO_ERROR;
109* Collator* myCollator = Collator::createInstance(success);
110* if (myCollator->compare("abc", "ABC") < 0)
111* cout << "abc is less than ABC" << endl;
112* else
113* cout << "abc is greater than or equal to ABC" << endl;
114* \endcode
115* </pre>
374ca955 116* \htmlonly</blockquote>\endhtmlonly
b75a7d8f 117* <p>
374ca955
A
118* You can set a <code>Collator</code>'s <em>strength</em> property to
119* determine the level of difference considered significant in comparisons.
120* Five strengths are provided: <code>PRIMARY</code>, <code>SECONDARY</code>,
121* <code>TERTIARY</code>, <code>QUATERNARY</code> and <code>IDENTICAL</code>.
122* The exact assignment of strengths to language features is locale dependent.
123* For example, in Czech, "e" and "f" are considered primary differences,
124* while "e" and "\u00EA" are secondary differences, "e" and "E" are tertiary
125* differences and "e" and "e" are identical. The following shows how both case
126* and accents could be ignored for US English.
127* \htmlonly<blockquote>\endhtmlonly
b75a7d8f
A
128* <pre>
129* \code
374ca955 130* //Get the Collator for US English and set its strength to PRIMARY
b75a7d8f 131* UErrorCode success = U_ZERO_ERROR;
374ca955 132* Collator* usCollator = Collator::createInstance(Locale::US, success);
b75a7d8f
A
133* usCollator->setStrength(Collator::PRIMARY);
134* if (usCollator->compare("abc", "ABC") == 0)
374ca955 135* cout << "'abc' and 'ABC' strings are equivalent with strength PRIMARY" << endl;
b75a7d8f
A
136* \endcode
137* </pre>
374ca955 138* \htmlonly</blockquote>\endhtmlonly
b75a7d8f 139* <p>
374ca955
A
140* For comparing strings exactly once, the <code>compare</code> method
141* provides the best performance. When sorting a list of strings, however, it
142* is generally necessary to compare each string multiple times. In this case,
143* sort keys provide better performance. The <code>getSortKey</code> methods
144* convert a string to a series of bytes that can be compared bitwise against
145* other sort keys using <code>strcmp()</code>. Sort keys are written as
146* zero-terminated byte strings. They consist of several substrings, one for
b75a7d8f 147* each collation strength level, that are delimited by 0x01 bytes.
374ca955
A
148* If the string code points are appended for UCOL_IDENTICAL, then they are
149* processed for correct code point order comparison and may contain 0x01
b75a7d8f
A
150* bytes but not zero bytes.
151* </p>
152* <p>
374ca955 153* An older set of APIs returns a <code>CollationKey</code> object that wraps
b75a7d8f 154* the sort key bytes instead of returning the bytes themselves.
374ca955 155* Its use is deprecated, but it is still available for compatibility with
b75a7d8f
A
156* Java.
157* </p>
158* <p>
159* <strong>Note:</strong> <code>Collator</code>s with different Locale
374ca955
A
160* and CollationStrength settings will return different sort
161* orders for the same set of strings. Locales have specific collation rules,
162* and the way in which secondary and tertiary differences are taken into
163* account, for example, will result in a different sorting order for the same
b75a7d8f
A
164* strings.
165* </p>
166* @see RuleBasedCollator
167* @see CollationKey
168* @see CollationElementIterator
169* @see Locale
170* @see Normalizer
171* @version 2.0 11/15/01
172*/
173
174class U_I18N_API Collator : public UObject {
175public:
176
374ca955
A
177 // Collator public enums -----------------------------------------------
178
179 /**
180 * Base letter represents a primary difference. Set comparison level to
181 * PRIMARY to ignore secondary and tertiary differences.<br>
182 * Use this to set the strength of a Collator object.<br>
183 * Example of primary difference, "abc" &lt; "abd"
184 *
185 * Diacritical differences on the same base letter represent a secondary
186 * difference. Set comparison level to SECONDARY to ignore tertiary
187 * differences. Use this to set the strength of a Collator object.<br>
188