/*
**********************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
8 #include "unicode/utypes.h"
10 #include "unicode/uchar.h"
11 #include "unicode/utf16.h"
13 #include "identifier_info.h"
15 #include "scriptset.h"
// Element count of a true array (not a pointer), expressed as an int32_t.
#define LENGTHOF(array) ((int32_t)(sizeof(array) / sizeof(*(array))))
23 static UnicodeSet
*ASCII
;
24 static ScriptSet
*JAPANESE
;
25 static ScriptSet
*CHINESE
;
26 static ScriptSet
*KOREAN
;
27 static ScriptSet
*CONFUSABLE_WITH_LATIN
;
28 static UInitOnce gIdentifierInfoInitOnce
= U_INITONCE_INITIALIZER
;
32 static UBool U_CALLCONV
33 IdentifierInfo_cleanup(void) {
42 delete CONFUSABLE_WITH_LATIN
;
43 CONFUSABLE_WITH_LATIN
= NULL
;
44 gIdentifierInfoInitOnce
.reset();
48 static void U_CALLCONV
49 IdentifierInfo_init(UErrorCode
&status
) {
50 ASCII
= new UnicodeSet(0, 0x7f);
51 JAPANESE
= new ScriptSet();
52 CHINESE
= new ScriptSet();
53 KOREAN
= new ScriptSet();
54 CONFUSABLE_WITH_LATIN
= new ScriptSet();
55 if (ASCII
== NULL
|| JAPANESE
== NULL
|| CHINESE
== NULL
|| KOREAN
== NULL
56 || CONFUSABLE_WITH_LATIN
== NULL
) {
57 status
= U_MEMORY_ALLOCATION_ERROR
;
61 JAPANESE
->set(USCRIPT_LATIN
, status
).set(USCRIPT_HAN
, status
).set(USCRIPT_HIRAGANA
, status
)
62 .set(USCRIPT_KATAKANA
, status
);
63 CHINESE
->set(USCRIPT_LATIN
, status
).set(USCRIPT_HAN
, status
).set(USCRIPT_BOPOMOFO
, status
);
64 KOREAN
->set(USCRIPT_LATIN
, status
).set(USCRIPT_HAN
, status
).set(USCRIPT_HANGUL
, status
);
65 CONFUSABLE_WITH_LATIN
->set(USCRIPT_CYRILLIC
, status
).set(USCRIPT_GREEK
, status
)
66 .set(USCRIPT_CHEROKEE
, status
);
67 ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO
, IdentifierInfo_cleanup
);
72 IdentifierInfo::IdentifierInfo(UErrorCode
&status
):
73 fIdentifier(NULL
), fRequiredScripts(NULL
), fScriptSetSet(NULL
),
74 fCommonAmongAlternates(NULL
), fNumerics(NULL
), fIdentifierProfile(NULL
) {
75 umtx_initOnce(gIdentifierInfoInitOnce
, &IdentifierInfo_init
, status
);
76 if (U_FAILURE(status
)) {
80 fIdentifier
= new UnicodeString();
81 fRequiredScripts
= new ScriptSet();
82 fScriptSetSet
= uhash_open(uhash_hashScriptSet
, uhash_compareScriptSet
, NULL
, &status
);
83 uhash_setKeyDeleter(fScriptSetSet
, uhash_deleteScriptSet
);
84 fCommonAmongAlternates
= new ScriptSet();
85 fNumerics
= new UnicodeSet();
86 fIdentifierProfile
= new UnicodeSet(0, 0x10FFFF);
88 if (U_SUCCESS(status
) && (fIdentifier
== NULL
|| fRequiredScripts
== NULL
|| fScriptSetSet
== NULL
||
89 fCommonAmongAlternates
== NULL
|| fNumerics
== NULL
|| fIdentifierProfile
== NULL
)) {
90 status
= U_MEMORY_ALLOCATION_ERROR
;
94 IdentifierInfo::~IdentifierInfo() {
96 delete fRequiredScripts
;
97 uhash_close(fScriptSetSet
);
98 delete fCommonAmongAlternates
;
100 delete fIdentifierProfile
;
104 IdentifierInfo
&IdentifierInfo::clear() {
105 fRequiredScripts
->resetAll();
106 uhash_removeAll(fScriptSetSet
);
108 fCommonAmongAlternates
->resetAll();
113 IdentifierInfo
&IdentifierInfo::setIdentifierProfile(const UnicodeSet
&identifierProfile
) {
114 *fIdentifierProfile
= identifierProfile
;
119 const UnicodeSet
&IdentifierInfo::getIdentifierProfile() const {
120 return *fIdentifierProfile
;
124 IdentifierInfo
&IdentifierInfo::setIdentifier(const UnicodeString
&identifier
, UErrorCode
&status
) {
125 if (U_FAILURE(status
)) {
128 *fIdentifier
= identifier
;
130 ScriptSet scriptsForCP
;
132 for (int32_t i
= 0; i
< identifier
.length(); i
+= U16_LENGTH(cp
)) {
133 cp
= identifier
.char32At(i
);
134 // Store a representative character for each kind of decimal digit
135 if (u_charType(cp
) == U_DECIMAL_DIGIT_NUMBER
) {
136 // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
137 fNumerics
->add(cp
- (UChar32
)u_getNumericValue(cp
));
139 UScriptCode extensions
[500];
140 int32_t extensionsCount
= uscript_getScriptExtensions(cp
, extensions
, LENGTHOF(extensions
), &status
);
141 if (U_FAILURE(status
)) {
144 scriptsForCP
.resetAll();
145 for (int32_t j
=0; j
<extensionsCount
; j
++) {
146 scriptsForCP
.set(extensions
[j
], status
);
148 scriptsForCP
.reset(USCRIPT_COMMON
, status
);
149 scriptsForCP
.reset(USCRIPT_INHERITED
, status
);
150 switch (scriptsForCP
.countMembers()) {
153 // Single script, record it.
154 fRequiredScripts
->Union(scriptsForCP
);
157 if (!fRequiredScripts
->intersects(scriptsForCP
)
158 && !uhash_geti(fScriptSetSet
, &scriptsForCP
)) {
159 // If the set hasn't been added already, add it
160 // (Add a copy, fScriptSetSet takes ownership of the copy.)
161 uhash_puti(fScriptSetSet
, new ScriptSet(scriptsForCP
), 1, &status
);
166 // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
167 // [Kana], [Kana Hira] => [Kana]
168 // This is relatively infrequent, so doesn't have to be optimized.
169 // We also compute any commonalities among the alternates.
170 if (uhash_count(fScriptSetSet
) > 0) {
171 fCommonAmongAlternates
->setAll();
172 for (int32_t it
= -1;;) {
173 const UHashElement
*nextHashEl
= uhash_nextElement(fScriptSetSet
, &it
);
174 if (nextHashEl
== NULL
) {
177 ScriptSet
*next
= static_cast<ScriptSet
*>(nextHashEl
->key
.pointer
);
178 // [Kana], [Kana Hira] => [Kana]
179 if (fRequiredScripts
->intersects(*next
)) {
180 uhash_removeElement(fScriptSetSet
, nextHashEl
);
182 fCommonAmongAlternates
->intersect(*next
);
183 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
184 for (int32_t otherIt
= -1;;) {
185 const UHashElement
*otherHashEl
= uhash_nextElement(fScriptSetSet
, &otherIt
);
186 if (otherHashEl
== NULL
) {
189 ScriptSet
*other
= static_cast<ScriptSet
*>(otherHashEl
->key
.pointer
);
190 if (next
!= other
&& next
->contains(*other
)) {
191 uhash_removeElement(fScriptSetSet
, nextHashEl
);
198 if (uhash_count(fScriptSetSet
) == 0) {
199 fCommonAmongAlternates
->resetAll();
205 const UnicodeString
*IdentifierInfo::getIdentifier() const {
209 const ScriptSet
*IdentifierInfo::getScripts() const {
210 return fRequiredScripts
;
213 const UHashtable
*IdentifierInfo::getAlternates() const {
214 return fScriptSetSet
;
218 const UnicodeSet
*IdentifierInfo::getNumerics() const {
222 const ScriptSet
*IdentifierInfo::getCommonAmongAlternates() const {
223 return fCommonAmongAlternates
;
226 #if !UCONFIG_NO_NORMALIZATION
228 URestrictionLevel
IdentifierInfo::getRestrictionLevel(UErrorCode
&status
) const {
229 if (!fIdentifierProfile
->containsAll(*fIdentifier
) || getNumerics()->size() > 1) {
230 return USPOOF_UNRESTRICTIVE
;
232 if (ASCII
->containsAll(*fIdentifier
)) {
235 // This is a bit tricky. We look at a number of factors.
236 // The number of scripts in the text.
237 // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
238 // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
240 // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
241 // time it is created, in setIdentifier().
242 int32_t cardinalityPlus
= fRequiredScripts
->countMembers() +
243 (fCommonAmongAlternates
->countMembers() == 0 ? uhash_count(fScriptSetSet
) : 1);
244 if (cardinalityPlus
< 2) {
245 return USPOOF_SINGLE_SCRIPT_RESTRICTIVE
;
247 if (containsWithAlternates(*JAPANESE
, *fRequiredScripts
) || containsWithAlternates(*CHINESE
, *fRequiredScripts
)
248 || containsWithAlternates(*KOREAN
, *fRequiredScripts
)) {
249 return USPOOF_HIGHLY_RESTRICTIVE
;
251 if (cardinalityPlus
== 2 &&
252 fRequiredScripts
->test(USCRIPT_LATIN
, status
) &&
253 !fRequiredScripts
->intersects(*CONFUSABLE_WITH_LATIN
)) {
254 return USPOOF_MODERATELY_RESTRICTIVE
;
256 return USPOOF_MINIMALLY_RESTRICTIVE
;
259 #endif /* !UCONFIG_NO_NORMALIZATION */
261 int32_t IdentifierInfo::getScriptCount() const {
262 // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
263 int32_t count
= fRequiredScripts
->countMembers() +
264 (fCommonAmongAlternates
->countMembers() == 0 ? uhash_count(fScriptSetSet
) : 1);
270 UBool
IdentifierInfo::containsWithAlternates(const ScriptSet
&container
, const ScriptSet
&containee
) const {
271 if (!container
.contains(containee
)) {
274 for (int32_t iter
= -1; ;) {
275 const UHashElement
*hashEl
= uhash_nextElement(fScriptSetSet
, &iter
);
276 if (hashEl
== NULL
) {
279 ScriptSet
*alternatives
= static_cast<ScriptSet
*>(hashEl
->key
.pointer
);
280 if (!container
.intersects(*alternatives
)) {
// Appends a display form of the alternate script sets to dest.
// Collects the ScriptSet keys of `alternates` into a UVector, sorts them with
// uhash_compareScriptSet, then calls displayScripts(dest) on each one, using
// "; " as the separator string.
// NOTE(review): several lines of this definition are not visible in this
// excerpt (loop exits, the guard around the separator append, and the tail of
// the function, including its return statement) - confirm against the
// complete file before relying on this description.
287 UnicodeString
&IdentifierInfo::displayAlternates(UnicodeString
&dest
, const UHashtable
*alternates
, UErrorCode
&status
) {
288 UVector
sorted(status
);
289 if (U_FAILURE(status
)) {
292 for (int32_t pos
= -1; ;) {
293 const UHashElement
*el
= uhash_nextElement(alternates
, &pos
);
297 ScriptSet
*ss
= static_cast<ScriptSet
*>(el
->key
.pointer
);
298 sorted
.addElement(ss
, status
);
300 sorted
.sort(uhash_compareScriptSet
, status
);
301 UnicodeString separator
= UNICODE_STRING_SIMPLE("; ");
302 for (int32_t i
=0; i
<sorted
.size(); i
++) {
304 dest
.append(separator
);
306 ScriptSet
*ss
= static_cast<ScriptSet
*>(sorted
.elementAt(i
));
307 ss
->displayScripts(dest
);