]>
Commit | Line | Data |
---|---|---|
51004dcb A |
1 | /* |
2 | ********************************************************************** | |
57a6839d | 3 | * Copyright (C) 2012-2014, International Business Machines |
51004dcb A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | */ | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #include "unicode/uchar.h" | |
11 | #include "unicode/utf16.h" | |
12 | ||
13 | #include "identifier_info.h" | |
14 | #include "mutex.h" | |
15 | #include "scriptset.h" | |
16 | #include "ucln_in.h" | |
17 | #include "uvector.h" | |
18 | ||
19 | U_NAMESPACE_BEGIN | |
20 | ||
57a6839d A |
21 | static UnicodeSet *ASCII; |
22 | static ScriptSet *JAPANESE; | |
23 | static ScriptSet *CHINESE; | |
24 | static ScriptSet *KOREAN; | |
25 | static ScriptSet *CONFUSABLE_WITH_LATIN; | |
26 | static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER; | |
51004dcb | 27 | |
51004dcb | 28 | |
57a6839d A |
29 | U_CDECL_BEGIN |
30 | static UBool U_CALLCONV | |
31 | IdentifierInfo_cleanup(void) { | |
51004dcb A |
32 | delete ASCII; |
33 | ASCII = NULL; | |
34 | delete JAPANESE; | |
35 | JAPANESE = NULL; | |
36 | delete CHINESE; | |
37 | CHINESE = NULL; | |
38 | delete KOREAN; | |
39 | KOREAN = NULL; | |
40 | delete CONFUSABLE_WITH_LATIN; | |
41 | CONFUSABLE_WITH_LATIN = NULL; | |
57a6839d | 42 | gIdentifierInfoInitOnce.reset(); |
51004dcb A |
43 | return TRUE; |
44 | } | |
45 | ||
57a6839d A |
46 | static void U_CALLCONV |
47 | IdentifierInfo_init(UErrorCode &status) { | |
48 | ASCII = new UnicodeSet(0, 0x7f); | |
49 | JAPANESE = new ScriptSet(); | |
50 | CHINESE = new ScriptSet(); | |
51 | KOREAN = new ScriptSet(); | |
52 | CONFUSABLE_WITH_LATIN = new ScriptSet(); | |
53 | if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL | |
54 | || CONFUSABLE_WITH_LATIN == NULL) { | |
55 | status = U_MEMORY_ALLOCATION_ERROR; | |
56 | return; | |
57 | } | |
58 | ASCII->freeze(); | |
59 | JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status) | |
60 | .set(USCRIPT_KATAKANA, status); | |
61 | CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status); | |
62 | KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status); | |
63 | CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status) | |
64 | .set(USCRIPT_CHEROKEE, status); | |
65 | ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup); | |
51004dcb A |
66 | } |
67 | U_CDECL_END | |
68 | ||
69 | ||
70 | IdentifierInfo::IdentifierInfo(UErrorCode &status): | |
71 | fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), | |
72 | fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) { | |
57a6839d | 73 | umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status); |
51004dcb A |
74 | if (U_FAILURE(status)) { |
75 | return; | |
76 | } | |
57a6839d | 77 | |
51004dcb A |
78 | fIdentifier = new UnicodeString(); |
79 | fRequiredScripts = new ScriptSet(); | |
80 | fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status); | |
81 | uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); | |
82 | fCommonAmongAlternates = new ScriptSet(); | |
83 | fNumerics = new UnicodeSet(); | |
84 | fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); | |
85 | ||
86 | if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL || | |
87 | fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) { | |
88 | status = U_MEMORY_ALLOCATION_ERROR; | |
89 | } | |
90 | } | |
91 | ||
92 | IdentifierInfo::~IdentifierInfo() { | |
93 | delete fIdentifier; | |
94 | delete fRequiredScripts; | |
95 | uhash_close(fScriptSetSet); | |
96 | delete fCommonAmongAlternates; | |
97 | delete fNumerics; | |
98 | delete fIdentifierProfile; | |
99 | } | |
100 | ||
101 | ||
102 | IdentifierInfo &IdentifierInfo::clear() { | |
103 | fRequiredScripts->resetAll(); | |
104 | uhash_removeAll(fScriptSetSet); | |
105 | fNumerics->clear(); | |
106 | fCommonAmongAlternates->resetAll(); | |
107 | return *this; | |
108 | } | |
109 | ||
110 | ||
111 | IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) { | |
112 | *fIdentifierProfile = identifierProfile; | |
113 | return *this; | |
114 | } | |
115 | ||
116 | ||
117 | const UnicodeSet &IdentifierInfo::getIdentifierProfile() const { | |
118 | return *fIdentifierProfile; | |
119 | } | |
120 | ||
121 | ||
122 | IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) { | |
123 | if (U_FAILURE(status)) { | |
124 | return *this; | |
125 | } | |
126 | *fIdentifier = identifier; | |
127 | clear(); | |
128 | ScriptSet scriptsForCP; | |
129 | UChar32 cp; | |
130 | for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { | |
131 | cp = identifier.char32At(i); | |
132 | // Store a representative character for each kind of decimal digit | |
133 | if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) { | |
134 | // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value | |
135 | fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); | |
136 | } | |
137 | UScriptCode extensions[500]; | |
b331163b | 138 | int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status); |
51004dcb A |
139 | if (U_FAILURE(status)) { |
140 | return *this; | |
141 | } | |
142 | scriptsForCP.resetAll(); | |
143 | for (int32_t j=0; j<extensionsCount; j++) { | |
144 | scriptsForCP.set(extensions[j], status); | |
145 | } | |
146 | scriptsForCP.reset(USCRIPT_COMMON, status); | |
147 | scriptsForCP.reset(USCRIPT_INHERITED, status); | |
148 | switch (scriptsForCP.countMembers()) { | |
149 | case 0: break; | |
150 | case 1: | |
151 | // Single script, record it. | |
152 | fRequiredScripts->Union(scriptsForCP); | |
153 | break; | |
154 | default: | |
155 | if (!fRequiredScripts->intersects(scriptsForCP) | |
156 | && !uhash_geti(fScriptSetSet, &scriptsForCP)) { | |
157 | // If the set hasn't been added already, add it | |
158 | // (Add a copy, fScriptSetSet takes ownership of the copy.) | |
159 | uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status); | |
160 | } | |
161 | break; | |
162 | } | |
163 | } | |
164 | // Now make a final pass through ScriptSetSet to remove alternates that came before singles. | |
165 | // [Kana], [Kana Hira] => [Kana] | |
166 | // This is relatively infrequent, so doesn't have to be optimized. | |
167 | // We also compute any commonalities among the alternates. | |
168 | if (uhash_count(fScriptSetSet) > 0) { | |
169 | fCommonAmongAlternates->setAll(); | |
b331163b | 170 | for (int32_t it = UHASH_FIRST;;) { |
51004dcb A |
171 | const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it); |
172 | if (nextHashEl == NULL) { | |
173 | break; | |
174 | } | |
175 | ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer); | |
176 | // [Kana], [Kana Hira] => [Kana] | |
177 | if (fRequiredScripts->intersects(*next)) { | |
178 | uhash_removeElement(fScriptSetSet, nextHashEl); | |
179 | } else { | |
180 | fCommonAmongAlternates->intersect(*next); | |
181 | // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] | |
b331163b | 182 | for (int32_t otherIt = UHASH_FIRST;;) { |
51004dcb A |
183 | const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt); |
184 | if (otherHashEl == NULL) { | |
185 | break; | |
186 | } | |
187 | ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer); | |
188 | if (next != other && next->contains(*other)) { | |
189 | uhash_removeElement(fScriptSetSet, nextHashEl); | |
190 | break; | |
191 | } | |
192 | } | |
193 | } | |
194 | } | |
195 | } | |
196 | if (uhash_count(fScriptSetSet) == 0) { | |
197 | fCommonAmongAlternates->resetAll(); | |
198 | } | |
199 | return *this; | |
200 | } | |
201 | ||
202 | ||
203 | const UnicodeString *IdentifierInfo::getIdentifier() const { | |
204 | return fIdentifier; | |
205 | } | |
206 | ||
207 | const ScriptSet *IdentifierInfo::getScripts() const { | |
208 | return fRequiredScripts; | |
209 | } | |
210 | ||
211 | const UHashtable *IdentifierInfo::getAlternates() const { | |
212 | return fScriptSetSet; | |
213 | } | |
214 | ||
215 | ||
216 | const UnicodeSet *IdentifierInfo::getNumerics() const { | |
217 | return fNumerics; | |
218 | } | |
219 | ||
220 | const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const { | |
221 | return fCommonAmongAlternates; | |
222 | } | |
223 | ||
224 | #if !UCONFIG_NO_NORMALIZATION | |
225 | ||
226 | URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const { | |
227 | if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) { | |
228 | return USPOOF_UNRESTRICTIVE; | |
229 | } | |
230 | if (ASCII->containsAll(*fIdentifier)) { | |
231 | return USPOOF_ASCII; | |
232 | } | |
233 | // This is a bit tricky. We look at a number of factors. | |
234 | // The number of scripts in the text. | |
235 | // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]) | |
236 | // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.) | |
237 | ||
238 | // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the | |
239 | // time it is created, in setIdentifier(). | |
240 | int32_t cardinalityPlus = fRequiredScripts->countMembers() + | |
241 | (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); | |
242 | if (cardinalityPlus < 2) { | |
57a6839d | 243 | return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; |
51004dcb A |
244 | } |
245 | if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts) | |
246 | || containsWithAlternates(*KOREAN, *fRequiredScripts)) { | |
247 | return USPOOF_HIGHLY_RESTRICTIVE; | |
248 | } | |
249 | if (cardinalityPlus == 2 && | |
250 | fRequiredScripts->test(USCRIPT_LATIN, status) && | |
251 | !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { | |
252 | return USPOOF_MODERATELY_RESTRICTIVE; | |
253 | } | |
254 | return USPOOF_MINIMALLY_RESTRICTIVE; | |
255 | } | |
256 | ||
257 | #endif /* !UCONFIG_NO_NORMALIZATION */ | |
258 | ||
259 | int32_t IdentifierInfo::getScriptCount() const { | |
260 | // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts. | |
261 | int32_t count = fRequiredScripts->countMembers() + | |
262 | (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); | |
263 | return count; | |
264 | } | |
265 | ||
266 | ||
267 | ||
268 | UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const { | |
269 | if (!container.contains(containee)) { | |
270 | return FALSE; | |
271 | } | |
b331163b | 272 | for (int32_t iter = UHASH_FIRST; ;) { |
51004dcb A |
273 | const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter); |
274 | if (hashEl == NULL) { | |
275 | break; | |
276 | } | |
277 | ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer); | |
278 | if (!container.intersects(*alternatives)) { | |
279 | return false; | |
280 | } | |
281 | } | |
282 | return true; | |
283 | } | |
284 | ||
285 | UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) { | |
286 | UVector sorted(status); | |
287 | if (U_FAILURE(status)) { | |
288 | return dest; | |
289 | } | |
b331163b | 290 | for (int32_t pos = UHASH_FIRST; ;) { |
51004dcb A |
291 | const UHashElement *el = uhash_nextElement(alternates, &pos); |
292 | if (el == NULL) { | |
293 | break; | |
294 | } | |
295 | ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer); | |
296 | sorted.addElement(ss, status); | |
297 | } | |
298 | sorted.sort(uhash_compareScriptSet, status); | |
299 | UnicodeString separator = UNICODE_STRING_SIMPLE("; "); | |
300 | for (int32_t i=0; i<sorted.size(); i++) { | |
301 | if (i>0) { | |
302 | dest.append(separator); | |
303 | } | |
304 | ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i)); | |
305 | ss->displayScripts(dest); | |
306 | } | |
307 | return dest; | |
308 | } | |
309 | ||
310 | U_NAMESPACE_END | |
311 |