]>
Commit | Line | Data |
---|---|---|
51004dcb A |
1 | /* |
2 | ********************************************************************** | |
57a6839d | 3 | * Copyright (C) 2012-2014, International Business Machines |
51004dcb A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | */ | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #include "unicode/uchar.h" | |
11 | #include "unicode/utf16.h" | |
12 | ||
13 | #include "identifier_info.h" | |
14 | #include "mutex.h" | |
15 | #include "scriptset.h" | |
16 | #include "ucln_in.h" | |
17 | #include "uvector.h" | |
18 | ||
19 | U_NAMESPACE_BEGIN | |
20 | ||
// Number of elements in a fixed-size array, as an int32_t.
// Only meaningful for true arrays (not pointers). The whole expansion is
// parenthesized so that it behaves as a single operand in any expression.
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
22 | ||
57a6839d A |
23 | static UnicodeSet *ASCII; |
24 | static ScriptSet *JAPANESE; | |
25 | static ScriptSet *CHINESE; | |
26 | static ScriptSet *KOREAN; | |
27 | static ScriptSet *CONFUSABLE_WITH_LATIN; | |
28 | static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER; | |
51004dcb | 29 | |
51004dcb | 30 | |
57a6839d A |
31 | U_CDECL_BEGIN |
32 | static UBool U_CALLCONV | |
33 | IdentifierInfo_cleanup(void) { | |
51004dcb A |
34 | delete ASCII; |
35 | ASCII = NULL; | |
36 | delete JAPANESE; | |
37 | JAPANESE = NULL; | |
38 | delete CHINESE; | |
39 | CHINESE = NULL; | |
40 | delete KOREAN; | |
41 | KOREAN = NULL; | |
42 | delete CONFUSABLE_WITH_LATIN; | |
43 | CONFUSABLE_WITH_LATIN = NULL; | |
57a6839d | 44 | gIdentifierInfoInitOnce.reset(); |
51004dcb A |
45 | return TRUE; |
46 | } | |
47 | ||
57a6839d A |
48 | static void U_CALLCONV |
49 | IdentifierInfo_init(UErrorCode &status) { | |
50 | ASCII = new UnicodeSet(0, 0x7f); | |
51 | JAPANESE = new ScriptSet(); | |
52 | CHINESE = new ScriptSet(); | |
53 | KOREAN = new ScriptSet(); | |
54 | CONFUSABLE_WITH_LATIN = new ScriptSet(); | |
55 | if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL | |
56 | || CONFUSABLE_WITH_LATIN == NULL) { | |
57 | status = U_MEMORY_ALLOCATION_ERROR; | |
58 | return; | |
59 | } | |
60 | ASCII->freeze(); | |
61 | JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status) | |
62 | .set(USCRIPT_KATAKANA, status); | |
63 | CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status); | |
64 | KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status); | |
65 | CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status) | |
66 | .set(USCRIPT_CHEROKEE, status); | |
67 | ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup); | |
51004dcb A |
68 | } |
69 | U_CDECL_END | |
70 | ||
71 | ||
72 | IdentifierInfo::IdentifierInfo(UErrorCode &status): | |
73 | fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), | |
74 | fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) { | |
57a6839d | 75 | umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status); |
51004dcb A |
76 | if (U_FAILURE(status)) { |
77 | return; | |
78 | } | |
57a6839d | 79 | |
51004dcb A |
80 | fIdentifier = new UnicodeString(); |
81 | fRequiredScripts = new ScriptSet(); | |
82 | fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status); | |
83 | uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); | |
84 | fCommonAmongAlternates = new ScriptSet(); | |
85 | fNumerics = new UnicodeSet(); | |
86 | fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); | |
87 | ||
88 | if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL || | |
89 | fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) { | |
90 | status = U_MEMORY_ALLOCATION_ERROR; | |
91 | } | |
92 | } | |
93 | ||
94 | IdentifierInfo::~IdentifierInfo() { | |
95 | delete fIdentifier; | |
96 | delete fRequiredScripts; | |
97 | uhash_close(fScriptSetSet); | |
98 | delete fCommonAmongAlternates; | |
99 | delete fNumerics; | |
100 | delete fIdentifierProfile; | |
101 | } | |
102 | ||
103 | ||
104 | IdentifierInfo &IdentifierInfo::clear() { | |
105 | fRequiredScripts->resetAll(); | |
106 | uhash_removeAll(fScriptSetSet); | |
107 | fNumerics->clear(); | |
108 | fCommonAmongAlternates->resetAll(); | |
109 | return *this; | |
110 | } | |
111 | ||
112 | ||
113 | IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) { | |
114 | *fIdentifierProfile = identifierProfile; | |
115 | return *this; | |
116 | } | |
117 | ||
118 | ||
119 | const UnicodeSet &IdentifierInfo::getIdentifierProfile() const { | |
120 | return *fIdentifierProfile; | |
121 | } | |
122 | ||
123 | ||
124 | IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) { | |
125 | if (U_FAILURE(status)) { | |
126 | return *this; | |
127 | } | |
128 | *fIdentifier = identifier; | |
129 | clear(); | |
130 | ScriptSet scriptsForCP; | |
131 | UChar32 cp; | |
132 | for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { | |
133 | cp = identifier.char32At(i); | |
134 | // Store a representative character for each kind of decimal digit | |
135 | if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) { | |
136 | // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value | |
137 | fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); | |
138 | } | |
139 | UScriptCode extensions[500]; | |
140 | int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status); | |
141 | if (U_FAILURE(status)) { | |
142 | return *this; | |
143 | } | |
144 | scriptsForCP.resetAll(); | |
145 | for (int32_t j=0; j<extensionsCount; j++) { | |
146 | scriptsForCP.set(extensions[j], status); | |
147 | } | |
148 | scriptsForCP.reset(USCRIPT_COMMON, status); | |
149 | scriptsForCP.reset(USCRIPT_INHERITED, status); | |
150 | switch (scriptsForCP.countMembers()) { | |
151 | case 0: break; | |
152 | case 1: | |
153 | // Single script, record it. | |
154 | fRequiredScripts->Union(scriptsForCP); | |
155 | break; | |
156 | default: | |
157 | if (!fRequiredScripts->intersects(scriptsForCP) | |
158 | && !uhash_geti(fScriptSetSet, &scriptsForCP)) { | |
159 | // If the set hasn't been added already, add it | |
160 | // (Add a copy, fScriptSetSet takes ownership of the copy.) | |
161 | uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status); | |
162 | } | |
163 | break; | |
164 | } | |
165 | } | |
166 | // Now make a final pass through ScriptSetSet to remove alternates that came before singles. | |
167 | // [Kana], [Kana Hira] => [Kana] | |
168 | // This is relatively infrequent, so doesn't have to be optimized. | |
169 | // We also compute any commonalities among the alternates. | |
170 | if (uhash_count(fScriptSetSet) > 0) { | |
171 | fCommonAmongAlternates->setAll(); | |
172 | for (int32_t it = -1;;) { | |
173 | const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it); | |
174 | if (nextHashEl == NULL) { | |
175 | break; | |
176 | } | |
177 | ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer); | |
178 | // [Kana], [Kana Hira] => [Kana] | |
179 | if (fRequiredScripts->intersects(*next)) { | |
180 | uhash_removeElement(fScriptSetSet, nextHashEl); | |
181 | } else { | |
182 | fCommonAmongAlternates->intersect(*next); | |
183 | // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] | |
184 | for (int32_t otherIt = -1;;) { | |
185 | const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt); | |
186 | if (otherHashEl == NULL) { | |
187 | break; | |
188 | } | |
189 | ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer); | |
190 | if (next != other && next->contains(*other)) { | |
191 | uhash_removeElement(fScriptSetSet, nextHashEl); | |
192 | break; | |
193 | } | |
194 | } | |
195 | } | |
196 | } | |
197 | } | |
198 | if (uhash_count(fScriptSetSet) == 0) { | |
199 | fCommonAmongAlternates->resetAll(); | |
200 | } | |
201 | return *this; | |
202 | } | |
203 | ||
204 | ||
205 | const UnicodeString *IdentifierInfo::getIdentifier() const { | |
206 | return fIdentifier; | |
207 | } | |
208 | ||
209 | const ScriptSet *IdentifierInfo::getScripts() const { | |
210 | return fRequiredScripts; | |
211 | } | |
212 | ||
213 | const UHashtable *IdentifierInfo::getAlternates() const { | |
214 | return fScriptSetSet; | |
215 | } | |
216 | ||
217 | ||
218 | const UnicodeSet *IdentifierInfo::getNumerics() const { | |
219 | return fNumerics; | |
220 | } | |
221 | ||
222 | const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const { | |
223 | return fCommonAmongAlternates; | |
224 | } | |
225 | ||
226 | #if !UCONFIG_NO_NORMALIZATION | |
227 | ||
228 | URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const { | |
229 | if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) { | |
230 | return USPOOF_UNRESTRICTIVE; | |
231 | } | |
232 | if (ASCII->containsAll(*fIdentifier)) { | |
233 | return USPOOF_ASCII; | |
234 | } | |
235 | // This is a bit tricky. We look at a number of factors. | |
236 | // The number of scripts in the text. | |
237 | // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]) | |
238 | // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.) | |
239 | ||
240 | // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the | |
241 | // time it is created, in setIdentifier(). | |
242 | int32_t cardinalityPlus = fRequiredScripts->countMembers() + | |
243 | (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); | |
244 | if (cardinalityPlus < 2) { | |
57a6839d | 245 | return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; |
51004dcb A |
246 | } |
247 | if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts) | |
248 | || containsWithAlternates(*KOREAN, *fRequiredScripts)) { | |
249 | return USPOOF_HIGHLY_RESTRICTIVE; | |
250 | } | |
251 | if (cardinalityPlus == 2 && | |
252 | fRequiredScripts->test(USCRIPT_LATIN, status) && | |
253 | !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { | |
254 | return USPOOF_MODERATELY_RESTRICTIVE; | |
255 | } | |
256 | return USPOOF_MINIMALLY_RESTRICTIVE; | |
257 | } | |
258 | ||
259 | #endif /* !UCONFIG_NO_NORMALIZATION */ | |
260 | ||
261 | int32_t IdentifierInfo::getScriptCount() const { | |
262 | // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts. | |
263 | int32_t count = fRequiredScripts->countMembers() + | |
264 | (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); | |
265 | return count; | |
266 | } | |
267 | ||
268 | ||
269 | ||
270 | UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const { | |
271 | if (!container.contains(containee)) { | |
272 | return FALSE; | |
273 | } | |
274 | for (int32_t iter = -1; ;) { | |
275 | const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter); | |
276 | if (hashEl == NULL) { | |
277 | break; | |
278 | } | |
279 | ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer); | |
280 | if (!container.intersects(*alternatives)) { | |
281 | return false; | |
282 | } | |
283 | } | |
284 | return true; | |
285 | } | |
286 | ||
287 | UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) { | |
288 | UVector sorted(status); | |
289 | if (U_FAILURE(status)) { | |
290 | return dest; | |
291 | } | |
292 | for (int32_t pos = -1; ;) { | |
293 | const UHashElement *el = uhash_nextElement(alternates, &pos); | |
294 | if (el == NULL) { | |
295 | break; | |
296 | } | |
297 | ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer); | |
298 | sorted.addElement(ss, status); | |
299 | } | |
300 | sorted.sort(uhash_compareScriptSet, status); | |
301 | UnicodeString separator = UNICODE_STRING_SIMPLE("; "); | |
302 | for (int32_t i=0; i<sorted.size(); i++) { | |
303 | if (i>0) { | |
304 | dest.append(separator); | |
305 | } | |
306 | ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i)); | |
307 | ss->displayScripts(dest); | |
308 | } | |
309 | return dest; | |
310 | } | |
311 | ||
312 | U_NAMESPACE_END | |
313 |