2 ******************************************************************************
4 * Copyright (C) 2008-2013, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: uspoof_wsconf.cpp
10 * tab size: 8 (not used)
13 * created on: 2009Jan05 (refactoring earlier files)
14 * created by: Andy Heninger
16 * Internal functions for compiling Whole Script confusable source data
17 * into its binary (runtime) form. The binary data format is described
21 #include "unicode/utypes.h"
22 #include "unicode/uspoof.h"
24 #if !UCONFIG_NO_NORMALIZATION
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
28 #include "unicode/unorm.h"
29 #include "unicode/uregex.h"
30 #include "unicode/ustring.h"
32 #include "scriptset.h"
33 #include "uspoof_impl.h"
37 #include "uspoof_wsconf.h"
42 // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
44 // 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O
45 // 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
47 // | | | |---- Which table, Any Case or Lower Case (A or L)
48 // | | |----------Target script. We need this.
49 // | |----------------Src script. Should match the script of the source
50 // | code points. Beyond checking that, we don't keep it.
51 // |--------------------------------Source code points or range.
53 // The expression will match _all_ lines, including erroneous lines.
54 // The result of the parse is returned via the contents of the (match) groups.
// Pattern text (adjacent string literals, concatenated by the compiler) used to
// parse the input one line at a time. Which capture group matched tells the
// caller what kind of line was seen: group 1 = blank/comment line,
// groups 2-7 = a data line, group 8 = anything else (a syntax error).
// NOTE(review): the ".." between the two code points of a range is not escaped,
// so as written it matches any two characters, not only two literal dots.
55 static const char *parseExp
=
56 "(?m)" // Multi-line mode
57 "^([ \\t]*(?:#.*?)?)$" // A blank or comment line. Matches Group 1.
59 "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range. Groups 2 and 3.
60 "\\s*([A-Za-z]+)\\s*;" // The source script. Group 4.
61 "\\s*([A-Za-z]+)\\s*;" // The target script. Group 5.
62 "\\s*(?:(A)|(L))" // The table A or L. Group 6 or 7
63 "[ \\t]*(?:#.*?)?" // Trailing comment
65 "^(.*?)$"; // An error line. Group 8.
66 // Any line not matching the preceding
67 // parts of the expression will match
68 // this, and thus be flagged as an error
71 // Extract a regular expression match group into a char * string.
72 // The group must contain only invariant characters.
73 // Used for script names
// Copy the text of one regular expression match group into a char buffer.
//   e            - regular expression holding the current match.
//   group        - number of the capture group to extract.
//   destBuf      - receives the group text, converted with the invariant-character converter (US_INV).
//   destCapacity - capacity of destBuf, in chars.
//   status       - in/out ICU error code.
75 static void extractGroup(
76 URegularExpression
*e
, int32_t group
, char *destBuf
, int32_t destCapacity
, UErrorCode
&status
) {
// Fetch the group as UTF-16 text into ubuf; the capacity passed is 50 UChars.
// (The declaration of ubuf is not visible in this excerpt.)
81 int32_t len
= uregex_group(e
, group
, ubuf
, 50, &status
);
// Guard: a failed status, a length of -1, or text that would fill or exceed destBuf.
82 if (U_FAILURE(status
) || len
== -1 || len
>= destCapacity
) {
// Wrap ubuf without copying, then convert its first len code units into destBuf.
85 UnicodeString
s(FALSE
, ubuf
, len
); // Aliasing constructor
86 s
.extract(0, len
, destBuf
, destCapacity
, US_INV
);
93 // Build the Whole Script Confusable data
95 // TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class,
96 // because everything is local to this one build function anyhow,
98 // break this function into more reasonably sized pieces, with
99 // state in WSConfusableDataBuilder.
// Build the Whole Script Confusable data and store it into spImpl->fSpoofData.
//   spImpl           - spoof detector implementation object that receives the data.
//   confusablesWS    - source text, UTF-8 (it is converted with u_strFromUTF8 below).
//   confusablesWSLen - length of confusablesWS, as passed on to u_strFromUTF8.
//   pe               - parse error structure; not referenced in the visible part of this function.
//   status           - in/out ICU error code; checked for an incoming failure first.
101 void buildWSConfusableData(SpoofImpl
*spImpl
, const char * confusablesWS
,
102 int32_t confusablesWSLen
, UParseError
*pe
, UErrorCode
&status
)
104 if (U_FAILURE(status
)) {
// Locals, initialised to NULL / 0 before anything is allocated.
107 URegularExpression
*parseRegexp
= NULL
;
108 int32_t inputLen
= 0;
112 UVector
*scriptSets
= NULL
;
// Count of run time script sets; starts at 2 because values 0 and 1 are reserved (see below).
113 uint32_t rtScriptSetsCount
= 2;
115 UTrie2
*anyCaseTrie
= NULL
;
116 UTrie2
*lowerCaseTrie
= NULL
;
// Two build-time tries, one per table (A = any case, L = lower case),
// each opened with initial value 0 and error value 0.
118 anyCaseTrie
= utrie2_open(0, 0, &status
);
119 lowerCaseTrie
= utrie2_open(0, 0, &status
);
// The parse expression as a UnicodeString, converted from invariant characters.
121 UnicodeString
pattern(parseExp
, -1, US_INV
);
123 // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
125 // Reserved TRIE values:
126 // 0: Code point has no whole script confusables.
127 // 1: Code point is of script Common or Inherited.
128 // These code points do not participate in whole script confusable detection.
129 // (This is logically equivalent to saying that they contain confusables in
132 // Because Trie values are indexes into the ScriptSets vector, pre-fill
133 // vector positions 0 and 1 to avoid conflicts with the reserved values.
135 scriptSets
= new UVector(status
);
// Allocation failure is reported as U_MEMORY_ALLOCATION_ERROR.
136 if (scriptSets
== NULL
) {
137 status
= U_MEMORY_ALLOCATION_ERROR
;
// Two NULL placeholders occupy vector positions 0 and 1.
140 scriptSets
->addElement((void *)NULL
, status
);
141 scriptSets
->addElement((void *)NULL
, status
);
143 // Convert the user input data from UTF-8 to UChar (UTF-16)
// Preflight call: NULL destination and zero capacity, so only the required
// UTF-16 length is produced, in inputLen.
144 u_strFromUTF8(NULL
, 0, &inputLen
, confusablesWS
, confusablesWSLen
, &status
);
// Any status other than U_BUFFER_OVERFLOW_ERROR is unexpected for the preflight.
145 if (status
!= U_BUFFER_OVERFLOW_ERROR
) {
// Clear the expected overflow status, then allocate inputLen+1 UChars.
148 status
= U_ZERO_ERROR
;
149 input
= static_cast<UChar
*>(uprv_malloc((inputLen
+1) * sizeof(UChar
)));
151 status
= U_MEMORY_ALLOCATION_ERROR
;
// Real conversion into the newly allocated buffer.
154 u_strFromUTF8(input
, inputLen
+1, NULL
, confusablesWS
, confusablesWSLen
, &status
);
// Compile the line-parsing expression, with flags 0 and no parse error structure.
156 parseRegexp
= uregex_open(pattern
.getBuffer(), pattern
.length(), 0, NULL
, &status
);
158 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign
159 // given the syntax of the input.
160 if (*input
== 0xfeff) {
164 // Parse the input, one line per iteration of this loop.
// Main parse loop. Each uregex_findNext() match corresponds to one input line;
// the capture group that participated identifies the kind of line.
165 uregex_setText(parseRegexp
, input
, inputLen
, &status
);
166 while (uregex_findNext(parseRegexp
, &status
)) {
// Group 1 matched.
168 if (uregex_start(parseRegexp
, 1, &status
) >= 0) {
169 // this was a blank or comment line.
// Group 8 matched: the line fit none of the expected forms.
172 if (uregex_start(parseRegexp
, 8, &status
) >= 0) {
173 // input file syntax error.
174 status
= U_PARSE_ERROR
;
177 if (U_FAILURE(status
)) {
181 // Pick up the start and optional range end code points from the parsed line.
182 UChar32 startCodePoint
= SpoofImpl::ScanHex(
183 input
, uregex_start(parseRegexp
, 2, &status
), uregex_end(parseRegexp
, 2, &status
), status
);
// Without a group 3 match the range is the single code point.
184 UChar32 endCodePoint
= startCodePoint
;
185 if (uregex_start(parseRegexp
, 3, &status
) >=0) {
186 endCodePoint
= SpoofImpl::ScanHex(
187 input
, uregex_start(parseRegexp
, 3, &status
), uregex_end(parseRegexp
, 3, &status
), status
);
190 // Extract the two script names from the source line. We need these in an 8 bit
191 // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
192 // to the ICU u_getPropertyValueEnum() function. Ugh.
193 char srcScriptName
[20];
194 char targScriptName
[20];
195 extractGroup(parseRegexp
, 4, srcScriptName
, sizeof(srcScriptName
), status
);
196 extractGroup(parseRegexp
, 5, targScriptName
, sizeof(targScriptName
), status
);
// Map the script names to UScriptCode values through the UCHAR_SCRIPT property.
197 UScriptCode srcScript
=
198 static_cast<UScriptCode
>(u_getPropertyValueEnum(UCHAR_SCRIPT
, srcScriptName
));
199 UScriptCode targScript
=
200 static_cast<UScriptCode
>(u_getPropertyValueEnum(UCHAR_SCRIPT
, targScriptName
));
201 if (U_FAILURE(status
)) {
// An unrecognised script name is reported as a format error in the input.
204 if (srcScript
== USCRIPT_INVALID_CODE
|| targScript
== USCRIPT_INVALID_CODE
) {
205 status
= U_INVALID_FORMAT_ERROR
;
209 // select the table - (A) any case or (L) lower case only
210 UTrie2
*table
= anyCaseTrie
;
// Group 7 is the "(L)" alternative of the parse expression.
211 if (uregex_start(parseRegexp
, 7, &status
) >= 0) {
212 table
= lowerCaseTrie
;
215 // Build the set of scripts containing confusable characters for
216 // the code point(s) specified in this input line.
217 // Sanity check that the script of the source code point is the same
218 // as the source script indicated in the input file. Failure of this check is
219 // an error in the input file.
220 // Include the source script in the set (needed for Mixed Script Confusable detection).
223 for (cp
=startCodePoint
; cp
<=endCodePoint
; cp
++) {
// The current trie value for cp is used as an index into scriptSets.
224 int32_t setIndex
= utrie2_get32(table
, cp
);
225 BuilderScriptSet
*bsset
= NULL
;
// The statements below either fetch the existing BuilderScriptSet at setIndex
// or create a new one; the branch lines that choose between the two are not
// visible in this excerpt.
227 U_ASSERT(setIndex
< scriptSets
->size());
228 bsset
= static_cast<BuilderScriptSet
*>(scriptSets
->elementAt(setIndex
));
230 bsset
= new BuilderScriptSet();
232 status
= U_MEMORY_ALLOCATION_ERROR
;
235 bsset
->codePoint
= cp
;
237 bsset
->sset
= new ScriptSet();
// The new entry is appended below, so its index is the current vector size.
238 setIndex
= scriptSets
->size();
239 bsset
->index
= setIndex
;
241 if (bsset
->sset
== NULL
) {
242 status
= U_MEMORY_ALLOCATION_ERROR
;
// Register the new set and point the trie entry for cp at it.
245 scriptSets
->addElement(bsset
, status
);
246 utrie2_set32(table
, cp
, setIndex
, &status
);
// Both the target and the source script are added to the set for this code point.
248 bsset
->sset
->set(targScript
, status
);
249 bsset
->sset
->set(srcScript
, status
);
251 if (U_FAILURE(status
)) {
// Sanity check: the actual script of cp must equal the source script named on the line.
254 UScriptCode cpScript
= uscript_getScript(cp
, &status
);
255 if (cpScript
!= srcScript
) {
256 status
= U_INVALID_FORMAT_ERROR
;
262 // Eliminate duplicate script sets. At this point we have a separate
263 // script set for every code point that had data in the input file.
265 // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
267 // printf("Number of scriptSets: %d\n", scriptSets->size());
// Duplicate elimination: compare every script set with every later one.
// (Any use of duplicateCount is not visible in this excerpt.)
269 int32_t duplicateCount
= 0;
270 rtScriptSetsCount
= 2;
// Positions 0 and 1 of scriptSets are placeholders, so scanning starts at 2.
271 for (int32_t outeri
=2; outeri
<scriptSets
->size(); outeri
++) {
272 BuilderScriptSet
*outerSet
= static_cast<BuilderScriptSet
*>(scriptSets
->elementAt(outeri
));
// index differs from the vector position once a set has been merged into an earlier one.
273 if (outerSet
->index
!= static_cast<uint32_t>(outeri
)) {
274 // This set was already identified as a duplicate.
275 // It will not be allocated a position in the runtime array of ScriptSets.
// Assign the next run time index to this set.
278 outerSet
->rindex
= rtScriptSetsCount
++;
279 for (int32_t inneri
=outeri
+1; inneri
<scriptSets
->size(); inneri
++) {
280 BuilderScriptSet
*innerSet
= static_cast<BuilderScriptSet
*>(scriptSets
->elementAt(inneri
));
// Equal contents held in two distinct ScriptSet objects: fold the inner one into the outer.
281 if (*(outerSet
->sset
) == *(innerSet
->sset
) && outerSet
->sset
!= innerSet
->sset
) {
// Free the duplicate, mark the wrapper as not owning its set, and make it
// share the outer ScriptSet together with the outer index and run time index.
282 delete innerSet
->sset
;
283 innerSet
->scriptSetOwned
= FALSE
;
284 innerSet
->sset
= outerSet
->sset
;
285 innerSet
->index
= outeri
;
286 innerSet
->rindex
= outerSet
->rindex
;
289 // But this doesn't get all. We need to fix the TRIE.
292 // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
297 // Update the Trie values to reflect the run time script indexes (after duplicate merging).
298 // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
299 // are unused, which is why the loop index starts at 2.)
301 for (int32_t i
=2; i
<scriptSets
->size(); i
++) {
302 BuilderScriptSet
*bSet
= static_cast<BuilderScriptSet
*>(scriptSets
->elementAt(i
));
// Only entries whose run time index differs from their build-time position are rewritten.
303 if (bSet
->rindex
!= (uint32_t)i
) {
// Store the run time index at the entry's own code point, in the trie recorded on the entry.
304 utrie2_set32(bSet
->trie
, bSet
->codePoint
, bSet
->rindex
, &status
);
309 // For code points with script==Common or script==Inherited,
310 // Set the reserved value of 1 into both Tries. These characters do not participate
311 // in Whole Script Confusable detection; this reserved value is the means
312 // by which they are detected.
// ignoreSet = all code points whose Script property is Common, plus those whose
// Script property is Inherited.
314 UnicodeSet ignoreSet
;
315 ignoreSet
.applyIntPropertyValue(UCHAR_SCRIPT
, USCRIPT_COMMON
, status
);
316 UnicodeSet inheritedSet
;
317 inheritedSet
.applyIntPropertyValue(UCHAR_SCRIPT
, USCRIPT_INHERITED
, status
);
318 ignoreSet
.addAll(inheritedSet
);
// Write the reserved value 1 over every range of the combined set, in both tries
// (the fifth argument, TRUE, is passed as the overwrite flag).
319 for (int32_t rn
=0; rn
<ignoreSet
.getRangeCount(); rn
++) {
320 UChar32 rangeStart
= ignoreSet
.getRangeStart(rn
);
321 UChar32 rangeEnd
= ignoreSet
.getRangeEnd(rn
);
322 utrie2_setRange32(anyCaseTrie
, rangeStart
, rangeEnd
, 1, TRUE
, &status
);
323 utrie2_setRange32(lowerCaseTrie
, rangeStart
, rangeEnd
, 1, TRUE
, &status
);
327 // Serialize the data to the Spoof Detector
// --- Any-case trie ---
// Freeze with 16-bit values, then preflight the serialization (NULL buffer,
// capacity 0) to learn the serialized size.
329 utrie2_freeze(anyCaseTrie
, UTRIE2_16_VALUE_BITS
, &status
);
330 int32_t size
= utrie2_serialize(anyCaseTrie
, NULL
, 0, &status
);
331 // printf("Any case Trie size: %d\n", size);
// The preflight is expected to end with U_BUFFER_OVERFLOW_ERROR.
332 if (status
!= U_BUFFER_OVERFLOW_ERROR
) {
335 status
= U_ZERO_ERROR
;
// Record fMemLimit (read before the space is reserved) as the trie's position
// in the raw data, and record the trie's serialized length.
336 spImpl
->fSpoofData
->fRawData
->fAnyCaseTrie
= spImpl
->fSpoofData
->fMemLimit
;
337 spImpl
->fSpoofData
->fRawData
->fAnyCaseTrieLength
= size
;
338 spImpl
->fSpoofData
->fAnyCaseTrie
= anyCaseTrie
;
// Reserve size bytes in the spoof data and serialize the trie into them.
339 void *where
= spImpl
->fSpoofData
->reserveSpace(size
, status
);
340 utrie2_serialize(anyCaseTrie
, where
, size
, &status
);
// --- Lower-case trie: same sequence as above ---
342 utrie2_freeze(lowerCaseTrie
, UTRIE2_16_VALUE_BITS
, &status
);
343 size
= utrie2_serialize(lowerCaseTrie
, NULL
, 0, &status
);
344 // printf("Lower case Trie size: %d\n", size);
345 if (status
!= U_BUFFER_OVERFLOW_ERROR
) {
348 status
= U_ZERO_ERROR
;
349 spImpl
->fSpoofData
->fRawData
->fLowerCaseTrie
= spImpl
->fSpoofData
->fMemLimit
;
350 spImpl
->fSpoofData
->fRawData
->fLowerCaseTrieLength
= size
;
351 spImpl
->fSpoofData
->fLowerCaseTrie
= lowerCaseTrie
;
352 where
= spImpl
->fSpoofData
->reserveSpace(size
, status
);
353 utrie2_serialize(lowerCaseTrie
, where
, size
, &status
);
// --- Script sets ---
// Record the position and the count of the run time ScriptSet array, then
// reserve room for rtScriptSetsCount ScriptSet objects.
355 spImpl
->fSpoofData
->fRawData
->fScriptSets
= spImpl
->fSpoofData
->fMemLimit
;
356 spImpl
->fSpoofData
->fRawData
->fScriptSetsLength
= rtScriptSetsCount
;
357 ScriptSet
*rtScriptSets
= static_cast<ScriptSet
*>
358 (spImpl
->fSpoofData
->reserveSpace(rtScriptSetsCount
* sizeof(ScriptSet
), status
));
// Copy each distinct script set into its run time slot.
// (The declaration and the advancing of rindex are not visible in this excerpt.)
360 for (int32_t i
=2; i
<scriptSets
->size(); i
++) {
361 BuilderScriptSet
*bSet
= static_cast<BuilderScriptSet
*>(scriptSets
->elementAt(i
));
362 if (bSet
->rindex
< rindex
) {
363 // We have already copied this script set to the serialized data.
366 U_ASSERT(rindex
== bSet
->rindex
);
367 rtScriptSets
[rindex
] = *bSet
->sset
; // Assignment of a ScriptSet just copies the bits.
372 // Open new utrie2s from the serialized data. We don't want to keep the ones
373 // we just built because we would then have two copies of the data, one internal to
374 // the utries that we have already constructed, and one in the serialized data area.
375 // An alternative would be to not pre-serialize the Trie data, but that makes the
376 // spoof detector data different, depending on how the detector was constructed.
377 // It's simpler to keep the data always the same.
// Re-open the any-case trie from its serialized image. The image is located at
// byte offset fRawData->fAnyCaseTrie from the start of fRawData and is
// fAnyCaseTrieLength bytes long, as recorded when the trie was serialized.
379 spImpl
->fSpoofData
->fAnyCaseTrie
= utrie2_openFromSerialized(
380 UTRIE2_16_VALUE_BITS
,
381 (const char *)spImpl
->fSpoofData
->fRawData
+ spImpl
->fSpoofData
->fRawData
->fAnyCaseTrie
,
382 spImpl
->fSpoofData
->fRawData
->fAnyCaseTrieLength
,
// Re-open the lower-case trie the same way. The length passed must be the
// lower-case trie's own serialized length (fLowerCaseTrieLength); passing
// fAnyCaseTrieLength here was a copy-paste error that gave the open call the
// wrong byte count whenever the two tries serialize to different sizes.
386 spImpl
->fSpoofData
->fLowerCaseTrie
= utrie2_openFromSerialized(
387 UTRIE2_16_VALUE_BITS
,
388 (const char *)spImpl
->fSpoofData
->fRawData
+ spImpl
->fSpoofData
->fRawData
->fLowerCaseTrie
,
389 spImpl
->fSpoofData
->fRawData
->fLowerCaseTrieLength
,
// Cleanup of the build-time objects.
// (The statements guarded by this status check are not visible in this excerpt.)
396 if (U_FAILURE(status
)) {
399 uregex_close(parseRegexp
);
// Walk every entry of scriptSets, including the two NULL placeholders at
// positions 0 and 1. What is done with each bsset is not visible here.
403 if (scriptSets
!= NULL
) {
404 for (i
=0; i
<scriptSets
->size(); i
++) {
405 BuilderScriptSet
*bsset
= static_cast<BuilderScriptSet
*>(scriptSets
->elementAt(i
));
// Close the two build-time tries; by this point the spoof data refers to
// tries re-opened from the serialized data instead.
410 utrie2_close(anyCaseTrie
);
411 utrie2_close(lowerCaseTrie
);
// Constructor. A newly created BuilderScriptSet is marked as owning its
// ScriptSet (scriptSetOwned = TRUE). Initialisation of the other members is
// not visible in this excerpt.
419 BuilderScriptSet::BuilderScriptSet() {
425 scriptSetOwned
= TRUE
;
// Destructor. Its cleanup is conditional on scriptSetOwned; the guarded
// statement is not visible in this excerpt.
// NOTE(review): presumably it deletes sset only when this object still owns it -- confirm.
428 BuilderScriptSet::~BuilderScriptSet() {
429 if (scriptSetOwned
) {
435 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS