]>
Commit | Line | Data |
---|---|---|
51004dcb A |
1 | /* |
2 | ********************************************************************** | |
3 | * Copyright (C) 2012-2013, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ********************************************************************** | |
6 | */ | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #include "unicode/uchar.h" | |
11 | #include "unicode/utf16.h" | |
12 | ||
13 | #include "identifier_info.h" | |
14 | #include "mutex.h" | |
15 | #include "scriptset.h" | |
16 | #include "ucln_in.h" | |
17 | #include "uvector.h" | |
18 | ||
19 | U_NAMESPACE_BEGIN | |
20 | ||
21 | #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) | |
22 | ||
23 | static UMutex gInitMutex = U_MUTEX_INITIALIZER; | |
24 | static UBool gStaticsAreInitialized = FALSE; | |
25 | ||
26 | UnicodeSet *IdentifierInfo::ASCII; | |
27 | ScriptSet *IdentifierInfo::JAPANESE; | |
28 | ScriptSet *IdentifierInfo::CHINESE; | |
29 | ScriptSet *IdentifierInfo::KOREAN; | |
30 | ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN; | |
31 | ||
32 | UBool IdentifierInfo::cleanup() { | |
33 | delete ASCII; | |
34 | ASCII = NULL; | |
35 | delete JAPANESE; | |
36 | JAPANESE = NULL; | |
37 | delete CHINESE; | |
38 | CHINESE = NULL; | |
39 | delete KOREAN; | |
40 | KOREAN = NULL; | |
41 | delete CONFUSABLE_WITH_LATIN; | |
42 | CONFUSABLE_WITH_LATIN = NULL; | |
43 | gStaticsAreInitialized = FALSE; | |
44 | return TRUE; | |
45 | } | |
46 | ||
47 | U_CDECL_BEGIN | |
48 | static UBool U_CALLCONV | |
49 | IdentifierInfo_cleanup(void) { | |
50 | return IdentifierInfo::cleanup(); | |
51 | } | |
52 | U_CDECL_END | |
53 | ||
54 | ||
55 | IdentifierInfo::IdentifierInfo(UErrorCode &status): | |
56 | fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), | |
57 | fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) { | |
58 | if (U_FAILURE(status)) { | |
59 | return; | |
60 | } | |
61 | { | |
62 | Mutex lock(&gInitMutex); | |
63 | if (!gStaticsAreInitialized) { | |
64 | ASCII = new UnicodeSet(0, 0x7f); | |
65 | JAPANESE = new ScriptSet(); | |
66 | CHINESE = new ScriptSet(); | |
67 | KOREAN = new ScriptSet(); | |
68 | CONFUSABLE_WITH_LATIN = new ScriptSet(); | |
69 | if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL | |
70 | || CONFUSABLE_WITH_LATIN == NULL) { | |
71 | status = U_MEMORY_ALLOCATION_ERROR; | |
72 | return; | |
73 | } | |
74 | ASCII->freeze(); | |
75 | JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status) | |
76 | .set(USCRIPT_KATAKANA, status); | |
77 | CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status); | |
78 | KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status); | |
79 | CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status) | |
80 | .set(USCRIPT_CHEROKEE, status); | |
81 | ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup); | |
82 | gStaticsAreInitialized = TRUE; | |
83 | } | |
84 | } | |
85 | fIdentifier = new UnicodeString(); | |
86 | fRequiredScripts = new ScriptSet(); | |
87 | fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status); | |
88 | uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); | |
89 | fCommonAmongAlternates = new ScriptSet(); | |
90 | fNumerics = new UnicodeSet(); | |
91 | fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); | |
92 | ||
93 | if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL || | |
94 | fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) { | |
95 | status = U_MEMORY_ALLOCATION_ERROR; | |
96 | } | |
97 | } | |
98 | ||
99 | IdentifierInfo::~IdentifierInfo() { | |
100 | delete fIdentifier; | |
101 | delete fRequiredScripts; | |
102 | uhash_close(fScriptSetSet); | |
103 | delete fCommonAmongAlternates; | |
104 | delete fNumerics; | |
105 | delete fIdentifierProfile; | |
106 | } | |
107 | ||
108 | ||
109 | IdentifierInfo &IdentifierInfo::clear() { | |
110 | fRequiredScripts->resetAll(); | |
111 | uhash_removeAll(fScriptSetSet); | |
112 | fNumerics->clear(); | |
113 | fCommonAmongAlternates->resetAll(); | |
114 | return *this; | |
115 | } | |
116 | ||
117 | ||
118 | IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) { | |
119 | *fIdentifierProfile = identifierProfile; | |
120 | return *this; | |
121 | } | |
122 | ||
123 | ||
124 | const UnicodeSet &IdentifierInfo::getIdentifierProfile() const { | |
125 | return *fIdentifierProfile; | |
126 | } | |
127 | ||
128 | ||
129 | IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) { | |
130 | if (U_FAILURE(status)) { | |
131 | return *this; | |
132 | } | |
133 | *fIdentifier = identifier; | |
134 | clear(); | |
135 | ScriptSet scriptsForCP; | |
136 | UChar32 cp; | |
137 | for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { | |
138 | cp = identifier.char32At(i); | |
139 | // Store a representative character for each kind of decimal digit | |
140 | if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) { | |
141 | // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value | |
142 | fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); | |
143 | } | |
144 | UScriptCode extensions[500]; | |
145 | int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status); | |
146 | if (U_FAILURE(status)) { | |
147 | return *this; | |
148 | } | |
149 | scriptsForCP.resetAll(); | |
150 | for (int32_t j=0; j<extensionsCount; j++) { | |
151 | scriptsForCP.set(extensions[j], status); | |
152 | } | |
153 | scriptsForCP.reset(USCRIPT_COMMON, status); | |
154 | scriptsForCP.reset(USCRIPT_INHERITED, status); | |
155 | switch (scriptsForCP.countMembers()) { | |
156 | case 0: break; | |
157 | case 1: | |
158 | // Single script, record it. | |
159 | fRequiredScripts->Union(scriptsForCP); | |
160 | break; | |
161 | default: | |
162 | if (!fRequiredScripts->intersects(scriptsForCP) | |
163 | && !uhash_geti(fScriptSetSet, &scriptsForCP)) { | |
164 | // If the set hasn't been added already, add it | |
165 | // (Add a copy, fScriptSetSet takes ownership of the copy.) | |
166 | uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status); | |
167 | } | |
168 | break; | |
169 | } | |
170 | } | |
171 | // Now make a final pass through ScriptSetSet to remove alternates that came before singles. | |
172 | // [Kana], [Kana Hira] => [Kana] | |
173 | // This is relatively infrequent, so doesn't have to be optimized. | |
174 | // We also compute any commonalities among the alternates. | |
175 | if (uhash_count(fScriptSetSet) > 0) { | |
176 | fCommonAmongAlternates->setAll(); | |
177 | for (int32_t it = -1;;) { | |
178 | const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it); | |
179 | if (nextHashEl == NULL) { | |
180 | break; | |
181 | } | |
182 | ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer); | |
183 | // [Kana], [Kana Hira] => [Kana] | |
184 | if (fRequiredScripts->intersects(*next)) { | |
185 | uhash_removeElement(fScriptSetSet, nextHashEl); | |
186 | } else { | |
187 | fCommonAmongAlternates->intersect(*next); | |
188 | // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] | |
189 | for (int32_t otherIt = -1;;) { | |
190 | const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt); | |
191 | if (otherHashEl == NULL) { | |
192 | break; | |
193 | } | |
194 | ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer); | |
195 | if (next != other && next->contains(*other)) { | |
196 | uhash_removeElement(fScriptSetSet, nextHashEl); | |
197 | break; | |
198 | } | |
199 | } | |
200 | } | |
201 | } | |
202 | } | |
203 | if (uhash_count(fScriptSetSet) == 0) { | |
204 | fCommonAmongAlternates->resetAll(); | |
205 | } | |
206 | return *this; | |
207 | } | |
208 | ||
209 | ||
210 | const UnicodeString *IdentifierInfo::getIdentifier() const { | |
211 | return fIdentifier; | |
212 | } | |
213 | ||
214 | const ScriptSet *IdentifierInfo::getScripts() const { | |
215 | return fRequiredScripts; | |
216 | } | |
217 | ||
218 | const UHashtable *IdentifierInfo::getAlternates() const { | |
219 | return fScriptSetSet; | |
220 | } | |
221 | ||
222 | ||
223 | const UnicodeSet *IdentifierInfo::getNumerics() const { | |
224 | return fNumerics; | |
225 | } | |
226 | ||
227 | const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const { | |
228 | return fCommonAmongAlternates; | |
229 | } | |
230 | ||
231 | #if !UCONFIG_NO_NORMALIZATION | |
232 | ||
233 | URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const { | |
234 | if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) { | |
235 | return USPOOF_UNRESTRICTIVE; | |
236 | } | |
237 | if (ASCII->containsAll(*fIdentifier)) { | |
238 | return USPOOF_ASCII; | |
239 | } | |
240 | // This is a bit tricky. We look at a number of factors. | |
241 | // The number of scripts in the text. | |
242 | // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]) | |
243 | // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.) | |
244 | ||
245 | // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the | |
246 | // time it is created, in setIdentifier(). | |
247 | int32_t cardinalityPlus = fRequiredScripts->countMembers() + | |
248 | (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); | |
249 | if (cardinalityPlus < 2) { | |
250 | return USPOOF_HIGHLY_RESTRICTIVE; | |
251 | } | |
252 | if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts) | |
253 | || containsWithAlternates(*KOREAN, *fRequiredScripts)) { | |
254 | return USPOOF_HIGHLY_RESTRICTIVE; | |
255 | } | |
256 | if (cardinalityPlus == 2 && | |
257 | fRequiredScripts->test(USCRIPT_LATIN, status) && | |
258 | !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { | |
259 | return USPOOF_MODERATELY_RESTRICTIVE; | |
260 | } | |
261 | return USPOOF_MINIMALLY_RESTRICTIVE; | |
262 | } | |
263 | ||
264 | #endif /* !UCONFIG_NO_NORMALIZATION */ | |
265 | ||
266 | int32_t IdentifierInfo::getScriptCount() const { | |
267 | // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts. | |
268 | int32_t count = fRequiredScripts->countMembers() + | |
269 | (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); | |
270 | return count; | |
271 | } | |
272 | ||
273 | ||
274 | ||
275 | UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const { | |
276 | if (!container.contains(containee)) { | |
277 | return FALSE; | |
278 | } | |
279 | for (int32_t iter = -1; ;) { | |
280 | const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter); | |
281 | if (hashEl == NULL) { | |
282 | break; | |
283 | } | |
284 | ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer); | |
285 | if (!container.intersects(*alternatives)) { | |
286 | return false; | |
287 | } | |
288 | } | |
289 | return true; | |
290 | } | |
291 | ||
292 | UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) { | |
293 | UVector sorted(status); | |
294 | if (U_FAILURE(status)) { | |
295 | return dest; | |
296 | } | |
297 | for (int32_t pos = -1; ;) { | |
298 | const UHashElement *el = uhash_nextElement(alternates, &pos); | |
299 | if (el == NULL) { | |
300 | break; | |
301 | } | |
302 | ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer); | |
303 | sorted.addElement(ss, status); | |
304 | } | |
305 | sorted.sort(uhash_compareScriptSet, status); | |
306 | UnicodeString separator = UNICODE_STRING_SIMPLE("; "); | |
307 | for (int32_t i=0; i<sorted.size(); i++) { | |
308 | if (i>0) { | |
309 | dest.append(separator); | |
310 | } | |
311 | ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i)); | |
312 | ss->displayScripts(dest); | |
313 | } | |
314 | return dest; | |
315 | } | |
316 | ||
317 | U_NAMESPACE_END | |
318 |