]>
Commit | Line | Data |
---|---|---|
51004dcb A |
1 | /* |
2 | ********************************************************************** | |
3 | * Copyright (C) 2013, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ********************************************************************** | |
6 | * | |
7 | * identifier_info.h | |
8 | * | |
9 | * created on: 2013 Jan 7 | |
10 | * created by: Andy Heninger | |
11 | */ | |
12 | ||
13 | #ifndef __IDENTIFIER_INFO_H__ | |
14 | #define __IDENTIFIER_INFO_H__ | |
15 | ||
16 | #include "unicode/utypes.h" | |
17 | ||
18 | #include "unicode/uniset.h" | |
19 | #include "unicode/uspoof.h" | |
20 | #include "uhash.h" | |
21 | ||
22 | U_NAMESPACE_BEGIN | |
23 | ||
24 | class ScriptSet; | |
25 | ||
26 | // TODO(andy): review consistency of reference vs pointer arguments to the functions. | |
27 | ||
28 | /** | |
29 | * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile | |
30 | * then setIdentifier. Available methods include: | |
31 | * <ol> | |
32 | * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in | |
33 | * each of these. | |
34 | * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be | |
35 | * either Katakana or Hiragana. | |
36 | * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates. | |
37 | * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in | |
38 | * the identifier. | |
39 | * <li>call getRestrictionLevel to see what the UTS36 restriction level is. | |
40 | * </ol> | |
41 | * | |
42 | * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo | |
43 | */ | |
44 | class U_I18N_API IdentifierInfo : public UMemory { | |
45 | ||
46 | public: | |
47 | /** | |
48 | * Create an identifier info object. Subsequently, call setIdentifier(), etc. | |
49 | * @internal | |
50 | */ | |
51 | IdentifierInfo(UErrorCode &status); | |
52 | ||
53 | /** | |
54 | * Destructor | |
55 | */ | |
56 | virtual ~IdentifierInfo(); | |
57 | ||
58 | private: | |
59 | /* Disallow copying for now. Can be added if there's a need. */ | |
60 | IdentifierInfo(const IdentifierInfo &other); | |
61 | ||
62 | public: | |
63 | ||
64 | /** | |
65 | * Set the identifier profile: the characters that are to be allowed in the identifier. | |
66 | * | |
67 | * @param identifierProfile the characters that are to be allowed in the identifier | |
68 | * @return this | |
69 | * @internal | |
70 | */ | |
71 | IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile); | |
72 | ||
73 | /** | |
74 | * Get the identifier profile: the characters that are to be allowed in the identifier. | |
75 | * | |
76 | * @return The characters that are to be allowed in the identifier. | |
77 | * @internal | |
78 | */ | |
79 | const UnicodeSet &getIdentifierProfile() const; | |
80 | ||
81 | ||
82 | /** | |
83 | * Set an identifier to analyze. Afterwards, call methods like getScripts() | |
84 | * | |
85 | * @param identifier the identifier to analyze | |
86 | * @param status Errorcode, set if errors occur. | |
87 | * @return this | |
88 | * @internal | |
89 | */ | |
90 | IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status); | |
91 | ||
92 | ||
93 | /** | |
94 | * Get the identifier that was analyzed. The returned string is owned by the ICU library, | |
95 | * and must not be deleted by the caller. | |
96 | * | |
97 | * @return the identifier that was analyzed. | |
98 | * @internal | |
99 | */ | |
100 | const UnicodeString *getIdentifier() const; | |
101 | ||
102 | ||
103 | /** | |
104 | * Get the scripts found in the identifiers. | |
105 | * | |
106 | * @return the set of explicit scripts. | |
107 | * @internal | |
108 | */ | |
109 | const ScriptSet *getScripts() const; | |
110 | ||
111 | /** | |
112 | * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then | |
113 | * the set consisting of those scripts will be returned. | |
114 | * | |
115 | * @return a uhash, with each key being of type (ScriptSet *). | |
116 | * This is a set, not a map, so the value stored in the uhash is not relevant. | |
117 | * (It is, in fact, 1). | |
118 | * Ownership of the uhash and its contents remains with the IndetifierInfo object, | |
119 | * and remains valid until a new identifer is set or until the object is deleted. | |
120 | * @internal | |
121 | */ | |
122 | const UHashtable *getAlternates() const; | |
123 | ||
124 | /** | |
125 | * Get the representative characters (zeros) for the numerics found in the identifier. | |
126 | * | |
127 | * @return the set of explicit scripts. | |
128 | * @internal | |
129 | */ | |
130 | const UnicodeSet *getNumerics() const; | |
131 | ||
132 | /** | |
133 | * Find out which scripts are in common among the alternates. | |
134 | * | |
135 | * @return the set of scripts that are in common among the alternates. | |
136 | * @internal | |
137 | */ | |
138 | const ScriptSet *getCommonAmongAlternates() const; | |
139 | ||
140 | /** | |
141 | * Get the number of scripts appearing in the identifier. | |
142 | * Note: Common and Inherited scripts are omitted from the count. | |
143 | * Note: Result may be high when the identifier contains characters | |
144 | * with alternate scripts. The distinction between | |
145 | * 0, 1 and > 1 will remain valid, however. | |
146 | * @return the number of scripts. | |
147 | */ | |
148 | int32_t getScriptCount() const; | |
149 | ||
150 | #if !UCONFIG_NO_NORMALIZATION | |
151 | ||
152 | /** | |
153 | * Find the "tightest" restriction level that the identifier satisfies. | |
154 | * | |
155 | * @return the restriction level. | |
156 | * @internal | |
157 | */ | |
158 | URestrictionLevel getRestrictionLevel(UErrorCode &status) const; | |
159 | ||
160 | #endif /*!UCONFIG_NO_NORMALIZATION */ | |
161 | ||
162 | UnicodeString toString() const; | |
163 | ||
164 | /** | |
165 | * Produce a readable string of alternates. | |
166 | * | |
167 | * @param alternates a UHashtable of UScriptSets. | |
168 | * Keys only, no meaningful values in the UHash. | |
169 | * @return display form | |
170 | * @internal | |
171 | */ | |
172 | static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status); | |
173 | ||
174 | /** | |
175 | * Static memory cleanup function. | |
176 | * @internal | |
177 | */ | |
178 | static UBool cleanup(); | |
179 | private: | |
180 | ||
181 | IdentifierInfo & clear(); | |
182 | UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const; | |
183 | ||
184 | UnicodeString *fIdentifier; | |
185 | ScriptSet *fRequiredScripts; | |
186 | UHashtable *fScriptSetSet; | |
187 | ScriptSet *fCommonAmongAlternates; | |
188 | UnicodeSet *fNumerics; | |
189 | UnicodeSet *fIdentifierProfile; | |
190 | ||
191 | static UnicodeSet *ASCII; | |
192 | static ScriptSet *JAPANESE; | |
193 | static ScriptSet *CHINESE; | |
194 | static ScriptSet *KOREAN; | |
195 | static ScriptSet *CONFUSABLE_WITH_LATIN; | |
196 | ||
197 | ||
198 | ||
199 | }; | |
200 | ||
201 | U_NAMESPACE_END | |
202 | ||
203 | #endif // __IDENTIFIER_INFO_H__ | |
204 |