]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/identifier_info.cpp
ICU-551.51.4.tar.gz
[apple/icu.git] / icuSources / i18n / identifier_info.cpp
CommitLineData
51004dcb
A
1/*
2**********************************************************************
57a6839d 3* Copyright (C) 2012-2014, International Business Machines
51004dcb
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6*/
7
8#include "unicode/utypes.h"
9
10#include "unicode/uchar.h"
11#include "unicode/utf16.h"
12
13#include "identifier_info.h"
14#include "mutex.h"
15#include "scriptset.h"
16#include "ucln_in.h"
17#include "uvector.h"
18
19U_NAMESPACE_BEGIN
20
57a6839d
A
21static UnicodeSet *ASCII;
22static ScriptSet *JAPANESE;
23static ScriptSet *CHINESE;
24static ScriptSet *KOREAN;
25static ScriptSet *CONFUSABLE_WITH_LATIN;
26static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER;
51004dcb 27
51004dcb 28
57a6839d
A
29U_CDECL_BEGIN
30static UBool U_CALLCONV
31IdentifierInfo_cleanup(void) {
51004dcb
A
32 delete ASCII;
33 ASCII = NULL;
34 delete JAPANESE;
35 JAPANESE = NULL;
36 delete CHINESE;
37 CHINESE = NULL;
38 delete KOREAN;
39 KOREAN = NULL;
40 delete CONFUSABLE_WITH_LATIN;
41 CONFUSABLE_WITH_LATIN = NULL;
57a6839d 42 gIdentifierInfoInitOnce.reset();
51004dcb
A
43 return TRUE;
44}
45
57a6839d
A
46static void U_CALLCONV
47IdentifierInfo_init(UErrorCode &status) {
48 ASCII = new UnicodeSet(0, 0x7f);
49 JAPANESE = new ScriptSet();
50 CHINESE = new ScriptSet();
51 KOREAN = new ScriptSet();
52 CONFUSABLE_WITH_LATIN = new ScriptSet();
53 if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
54 || CONFUSABLE_WITH_LATIN == NULL) {
55 status = U_MEMORY_ALLOCATION_ERROR;
56 return;
57 }
58 ASCII->freeze();
59 JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
60 .set(USCRIPT_KATAKANA, status);
61 CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
62 KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
63 CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
64 .set(USCRIPT_CHEROKEE, status);
65 ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
51004dcb
A
66}
67U_CDECL_END
68
69
70IdentifierInfo::IdentifierInfo(UErrorCode &status):
71 fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
72 fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
57a6839d 73 umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status);
51004dcb
A
74 if (U_FAILURE(status)) {
75 return;
76 }
57a6839d 77
51004dcb
A
78 fIdentifier = new UnicodeString();
79 fRequiredScripts = new ScriptSet();
80 fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
81 uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
82 fCommonAmongAlternates = new ScriptSet();
83 fNumerics = new UnicodeSet();
84 fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
85
86 if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
87 fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
88 status = U_MEMORY_ALLOCATION_ERROR;
89 }
90}
91
92IdentifierInfo::~IdentifierInfo() {
93 delete fIdentifier;
94 delete fRequiredScripts;
95 uhash_close(fScriptSetSet);
96 delete fCommonAmongAlternates;
97 delete fNumerics;
98 delete fIdentifierProfile;
99}
100
101
102IdentifierInfo &IdentifierInfo::clear() {
103 fRequiredScripts->resetAll();
104 uhash_removeAll(fScriptSetSet);
105 fNumerics->clear();
106 fCommonAmongAlternates->resetAll();
107 return *this;
108}
109
110
111IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
112 *fIdentifierProfile = identifierProfile;
113 return *this;
114}
115
116
117const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
118 return *fIdentifierProfile;
119}
120
121
122IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
123 if (U_FAILURE(status)) {
124 return *this;
125 }
126 *fIdentifier = identifier;
127 clear();
128 ScriptSet scriptsForCP;
129 UChar32 cp;
130 for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
131 cp = identifier.char32At(i);
132 // Store a representative character for each kind of decimal digit
133 if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
134 // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
135 fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
136 }
137 UScriptCode extensions[500];
b331163b 138 int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status);
51004dcb
A
139 if (U_FAILURE(status)) {
140 return *this;
141 }
142 scriptsForCP.resetAll();
143 for (int32_t j=0; j<extensionsCount; j++) {
144 scriptsForCP.set(extensions[j], status);
145 }
146 scriptsForCP.reset(USCRIPT_COMMON, status);
147 scriptsForCP.reset(USCRIPT_INHERITED, status);
148 switch (scriptsForCP.countMembers()) {
149 case 0: break;
150 case 1:
151 // Single script, record it.
152 fRequiredScripts->Union(scriptsForCP);
153 break;
154 default:
155 if (!fRequiredScripts->intersects(scriptsForCP)
156 && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
157 // If the set hasn't been added already, add it
158 // (Add a copy, fScriptSetSet takes ownership of the copy.)
159 uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
160 }
161 break;
162 }
163 }
164 // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
165 // [Kana], [Kana Hira] => [Kana]
166 // This is relatively infrequent, so doesn't have to be optimized.
167 // We also compute any commonalities among the alternates.
168 if (uhash_count(fScriptSetSet) > 0) {
169 fCommonAmongAlternates->setAll();
b331163b 170 for (int32_t it = UHASH_FIRST;;) {
51004dcb
A
171 const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
172 if (nextHashEl == NULL) {
173 break;
174 }
175 ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
176 // [Kana], [Kana Hira] => [Kana]
177 if (fRequiredScripts->intersects(*next)) {
178 uhash_removeElement(fScriptSetSet, nextHashEl);
179 } else {
180 fCommonAmongAlternates->intersect(*next);
181 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
b331163b 182 for (int32_t otherIt = UHASH_FIRST;;) {
51004dcb
A
183 const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
184 if (otherHashEl == NULL) {
185 break;
186 }
187 ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
188 if (next != other && next->contains(*other)) {
189 uhash_removeElement(fScriptSetSet, nextHashEl);
190 break;
191 }
192 }
193 }
194 }
195 }
196 if (uhash_count(fScriptSetSet) == 0) {
197 fCommonAmongAlternates->resetAll();
198 }
199 return *this;
200}
201
202
203const UnicodeString *IdentifierInfo::getIdentifier() const {
204 return fIdentifier;
205}
206
207const ScriptSet *IdentifierInfo::getScripts() const {
208 return fRequiredScripts;
209}
210
211const UHashtable *IdentifierInfo::getAlternates() const {
212 return fScriptSetSet;
213}
214
215
216const UnicodeSet *IdentifierInfo::getNumerics() const {
217 return fNumerics;
218}
219
220const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
221 return fCommonAmongAlternates;
222}
223
224#if !UCONFIG_NO_NORMALIZATION
225
226URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
227 if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
228 return USPOOF_UNRESTRICTIVE;
229 }
230 if (ASCII->containsAll(*fIdentifier)) {
231 return USPOOF_ASCII;
232 }
233 // This is a bit tricky. We look at a number of factors.
234 // The number of scripts in the text.
235 // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
236 // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
237
238 // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
239 // time it is created, in setIdentifier().
240 int32_t cardinalityPlus = fRequiredScripts->countMembers() +
241 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
242 if (cardinalityPlus < 2) {
57a6839d 243 return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
51004dcb
A
244 }
245 if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
246 || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
247 return USPOOF_HIGHLY_RESTRICTIVE;
248 }
249 if (cardinalityPlus == 2 &&
250 fRequiredScripts->test(USCRIPT_LATIN, status) &&
251 !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
252 return USPOOF_MODERATELY_RESTRICTIVE;
253 }
254 return USPOOF_MINIMALLY_RESTRICTIVE;
255}
256
257#endif /* !UCONFIG_NO_NORMALIZATION */
258
259int32_t IdentifierInfo::getScriptCount() const {
260 // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
261 int32_t count = fRequiredScripts->countMembers() +
262 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
263 return count;
264}
265
266
267
268UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
269 if (!container.contains(containee)) {
270 return FALSE;
271 }
b331163b 272 for (int32_t iter = UHASH_FIRST; ;) {
51004dcb
A
273 const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
274 if (hashEl == NULL) {
275 break;
276 }
277 ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
278 if (!container.intersects(*alternatives)) {
279 return false;
280 }
281 }
282 return true;
283}
284
285UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
286 UVector sorted(status);
287 if (U_FAILURE(status)) {
288 return dest;
289 }
b331163b 290 for (int32_t pos = UHASH_FIRST; ;) {
51004dcb
A
291 const UHashElement *el = uhash_nextElement(alternates, &pos);
292 if (el == NULL) {
293 break;
294 }
295 ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
296 sorted.addElement(ss, status);
297 }
298 sorted.sort(uhash_compareScriptSet, status);
299 UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
300 for (int32_t i=0; i<sorted.size(); i++) {
301 if (i>0) {
302 dest.append(separator);
303 }
304 ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
305 ss->displayScripts(dest);
306 }
307 return dest;
308}
309
310U_NAMESPACE_END
311