]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/identifier_info.h
ICU-511.32.tar.gz
[apple/icu.git] / icuSources / i18n / identifier_info.h
1 /*
2 **********************************************************************
3 * Copyright (C) 2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 *
7 * identifier_info.h
8 *
9 * created on: 2013 Jan 7
10 * created by: Andy Heninger
11 */
12
13 #ifndef __IDENTIFIER_INFO_H__
14 #define __IDENTIFIER_INFO_H__
15
16 #include "unicode/utypes.h"
17
18 #include "unicode/uniset.h"
19 #include "unicode/uspoof.h"
20 #include "uhash.h"
21
22 U_NAMESPACE_BEGIN
23
24 class ScriptSet;
25
26 // TODO(andy): review consistency of reference vs pointer arguments to the functions.
27
28 /**
29 * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
30 * then setIdentifier. Available methods include:
31 * <ol>
32 * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
33 * each of these.
34 * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
35 * either Katakana or Hiragana.
36 * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
37 * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
38 * the identifier.
39 * <li>call getRestrictionLevel to see what the UTS36 restriction level is.
40 * </ol>
41 *
42 * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
43 */
44 class U_I18N_API IdentifierInfo : public UMemory {
45
46 public:
47 /**
48 * Create an identifier info object. Subsequently, call setIdentifier(), etc.
49 * @internal
50 */
51 IdentifierInfo(UErrorCode &status);
52
53 /**
54 * Destructor
55 */
56 virtual ~IdentifierInfo();
57
58 private:
59 /* Disallow copying for now. Can be added if there's a need. NOTE(review): only the copy constructor is declared private here; no copy assignment operator is declared in this class. */
60 IdentifierInfo(const IdentifierInfo &other);
61
62 public:
63
64 /**
65 * Set the identifier profile: the characters that are to be allowed in the identifier.
66 *
67 * @param identifierProfile the characters that are to be allowed in the identifier
68 * @return this
69 * @internal
70 */
71 IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);
72
73 /**
74 * Get the identifier profile: the characters that are to be allowed in the identifier.
75 *
76 * @return The characters that are to be allowed in the identifier.
77 * @internal
78 */
79 const UnicodeSet &getIdentifierProfile() const;
80
81
82 /**
83 * Set an identifier to analyze. Afterwards, call methods like getScripts()
84 *
85 * @param identifier the identifier to analyze
86 * @param status Errorcode, set if errors occur.
87 * @return this
88 * @internal
89 */
90 IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);
91
92
93 /**
94 * Get the identifier that was analyzed. The returned string is owned by the ICU library,
95 * and must not be deleted by the caller.
96 *
97 * @return the identifier that was analyzed.
98 * @internal
99 */
100 const UnicodeString *getIdentifier() const;
101
102
103 /**
104 * Get the scripts found in the identifier.
105 *
106 * @return the set of explicit scripts.
107 * @internal
108 */
109 const ScriptSet *getScripts() const;
110
111 /**
112 * Get the set of alternate scripts found in the identifier. That is, when a character can be in two scripts, then
113 * the set consisting of those scripts will be returned.
114 *
115 * @return a uhash, with each key being of type (ScriptSet *).
116 * This is a set, not a map, so the value stored in the uhash is not relevant.
117 * (It is, in fact, 1).
118 * Ownership of the uhash and its contents remains with the IdentifierInfo object,
119 * and remains valid until a new identifier is set or until the object is deleted.
120 * @internal
121 */
122 const UHashtable *getAlternates() const;
123
124 /**
125 * Get the representative characters (zeros) for the numerics found in the identifier.
126 *
127 * @return the set of representative zero characters, one for each decimal number system found in the identifier.
128 * @internal
129 */
130 const UnicodeSet *getNumerics() const;
131
132 /**
133 * Find out which scripts are in common among the alternates.
134 *
135 * @return the set of scripts that are in common among the alternates.
136 * @internal
137 */
138 const ScriptSet *getCommonAmongAlternates() const;
139
140 /**
141 * Get the number of scripts appearing in the identifier.
142 * Note: Common and Inherited scripts are omitted from the count.
143 * Note: Result may be high when the identifier contains characters
144 * with alternate scripts. The distinction between
145 * 0, 1 and > 1 will remain valid, however.
146 * @return the number of scripts.
147 */
148 int32_t getScriptCount() const;
149
150 #if !UCONFIG_NO_NORMALIZATION
151
152 /**
153 * Find the "tightest" restriction level that the identifier satisfies.
154 * @param status ICU error code, passed by reference (presumably set if errors occur, as for setIdentifier -- confirm).
155 * @return the restriction level.
156 * @internal
157 */
158 URestrictionLevel getRestrictionLevel(UErrorCode &status) const;
159
160 #endif /*!UCONFIG_NO_NORMALIZATION */
161
162 UnicodeString toString() const; // NOTE(review): undocumented; presumably a readable summary of the analysis results -- confirm in the implementation.
163
164 /**
165 * Produce a readable string of alternates.
166 * @param dest the string passed by reference; presumably receives the display form and is the reference returned (TODO confirm).
167 * @param alternates a UHashtable of UScriptSets.
168 * Keys only, no meaningful values in the UHash.
169 * @return display form
170 * @internal
171 */
172 static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);
173
174 /**
175 * Static memory cleanup function.
176 * @internal
177 */
178 static UBool cleanup();
179 private:
180
181 IdentifierInfo & clear(); // NOTE(review): presumably resets the per-identifier analysis state -- confirm in the implementation.
182 UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const; // NOTE(review): semantics not visible in this header -- see the implementation.
183
184 UnicodeString *fIdentifier; // presumably the string returned by getIdentifier() -- confirm
185 ScriptSet *fRequiredScripts; // presumably the set returned by getScripts() -- confirm
186 UHashtable *fScriptSetSet; // presumably the uhash returned by getAlternates() -- confirm
187 ScriptSet *fCommonAmongAlternates; // presumably the set returned by getCommonAmongAlternates() -- confirm
188 UnicodeSet *fNumerics; // presumably the set returned by getNumerics() -- confirm
189 UnicodeSet *fIdentifierProfile; // presumably the set handled by setIdentifierProfile()/getIdentifierProfile() -- confirm
190
191 static UnicodeSet *ASCII; // Class-wide static data (this and the ScriptSets below); NOTE(review): presumably released by cleanup() -- confirm.
192 static ScriptSet *JAPANESE;
193 static ScriptSet *CHINESE;
194 static ScriptSet *KOREAN;
195 static ScriptSet *CONFUSABLE_WITH_LATIN;
196
197
198
199 };
200
201 U_NAMESPACE_END
202
203 #endif // __IDENTIFIER_INFO_H__
204