2 ******************************************************************************
4 * Copyright (C) 2008-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * file name: uspoof_conf.cpp
10 * tab size: 8 (not used)
13 * created on: 2009Jan05 (refactoring earlier files)
14 * created by: Andy Heninger
16 * Internal classes for compililing confusable data into its binary (runtime) form.
19 #include "unicode/utypes.h"
20 #include "unicode/uspoof.h"
21 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
22 #if !UCONFIG_NO_NORMALIZATION
24 #include "unicode/unorm.h"
25 #include "unicode/uregex.h"
26 #include "unicode/ustring.h"
28 #include "uspoof_impl.h"
33 #include "uspoof_conf.h"
38 //---------------------------------------------------------------------
40 // buildConfusableData Compile the source confusable data, as defined by
41 // the Unicode data file confusables.txt, into the binary
42 // structures used by the confusable detector.
44 // The binary structures are described in uspoof_impl.h
46 // 1. parse the data, building 4 hash tables, one each for the SL, SA, ML and MA
47 // tables. Each maps from a UChar32 to a String.
49 // 2. Sort all of the strings encountered by length, since they will need to
50 // be stored in that order in the final string table.
52 // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the
53 // list because that will be the ordering of our runtime table.
55 // 4. Generate the run time string table. This is generated before the key & value
56 // tables because we need the string indexes when building those tables.
58 // 5. Build the run-time key and value tables. These are parallel tables, and are built
62 SPUString::SPUString(UnicodeString
*s
) {
68 SPUString::~SPUString() {
73 SPUStringPool::SPUStringPool(UErrorCode
&status
) : fVec(NULL
), fHash(NULL
) {
74 fVec
= new UVector(status
);
75 fHash
= uhash_open(uhash_hashUnicodeString
, // key hash function
76 uhash_compareUnicodeString
, // Key Comparator
77 NULL
, // Value Comparator
82 SPUStringPool::~SPUStringPool() {
84 for (i
=fVec
->size()-1; i
>=0; i
--) {
85 SPUString
*s
= static_cast<SPUString
*>(fVec
->elementAt(i
));
93 int32_t SPUStringPool::size() {
97 SPUString
*SPUStringPool::getByIndex(int32_t index
) {
98 SPUString
*retString
= (SPUString
*)fVec
->elementAt(index
);
103 // Comparison function for ordering strings in the string pool.
104 // Compare by length first, then, within a group of the same length,
105 // by code point order.
106 // Conforms to the type signature for a USortComparator in uvector.h
108 static int8_t U_CALLCONV
SPUStringCompare(UHashTok left
, UHashTok right
) {
109 const SPUString
*sL
= const_cast<const SPUString
*>(
110 static_cast<SPUString
*>(left
.pointer
));
111 const SPUString
*sR
= const_cast<const SPUString
*>(
112 static_cast<SPUString
*>(right
.pointer
));
113 int32_t lenL
= sL
->fStr
->length();
114 int32_t lenR
= sR
->fStr
->length();
117 } else if (lenL
> lenR
) {
120 return sL
->fStr
->compare(*(sR
->fStr
));
124 void SPUStringPool::sort(UErrorCode
&status
) {
125 fVec
->sort(SPUStringCompare
, status
);
129 SPUString
*SPUStringPool::addString(UnicodeString
*src
, UErrorCode
&status
) {
130 SPUString
*hashedString
= static_cast<SPUString
*>(uhash_get(fHash
, src
));
131 if (hashedString
!= NULL
) {
134 hashedString
= new SPUString(src
);
135 uhash_put(fHash
, src
, hashedString
, &status
);
136 fVec
->addElement(hashedString
, status
);
143 ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl
*spImpl
, UErrorCode
&status
) :
154 fStringLengthsTable(NULL
),
160 if (U_FAILURE(status
)) {
163 fSLTable
= uhash_open(uhash_hashLong
, uhash_compareLong
, NULL
, &status
);
164 fSATable
= uhash_open(uhash_hashLong
, uhash_compareLong
, NULL
, &status
);
165 fMLTable
= uhash_open(uhash_hashLong
, uhash_compareLong
, NULL
, &status
);
166 fMATable
= uhash_open(uhash_hashLong
, uhash_compareLong
, NULL
, &status
);
167 fKeySet
= new UnicodeSet();
168 fKeyVec
= new UVector(status
);
169 fValueVec
= new UVector(status
);
170 stringPool
= new SPUStringPool(status
);
174 ConfusabledataBuilder::~ConfusabledataBuilder() {
176 uregex_close(fParseLine
);
177 uregex_close(fParseHexNum
);
178 uhash_close(fSLTable
);
179 uhash_close(fSATable
);
180 uhash_close(fMLTable
);
181 uhash_close(fMATable
);
185 delete fStringLengthsTable
;
191 void ConfusabledataBuilder::buildConfusableData(SpoofImpl
* spImpl
, const char * confusables
,
192 int32_t confusablesLen
, int32_t *errorType
, UParseError
*pe
, UErrorCode
&status
) {
194 if (U_FAILURE(status
)) {
197 ConfusabledataBuilder
builder(spImpl
, status
);
198 builder
.build(confusables
, confusablesLen
, status
);
199 if (U_FAILURE(status
) && errorType
!= NULL
) {
200 *errorType
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
;
201 pe
->line
= builder
.fLineNum
;
206 void ConfusabledataBuilder::build(const char * confusables
, int32_t confusablesLen
,
207 UErrorCode
&status
) {
209 // Convert the user input data from UTF-8 to UChar (UTF-16)
210 int32_t inputLen
= 0;
211 if (U_FAILURE(status
)) {
214 u_strFromUTF8(NULL
, 0, &inputLen
, confusables
, confusablesLen
, &status
);
215 if (status
!= U_BUFFER_OVERFLOW_ERROR
) {
218 status
= U_ZERO_ERROR
;
219 fInput
= static_cast<UChar
*>(uprv_malloc((inputLen
+1) * sizeof(UChar
)));
220 if (fInput
== NULL
) {
221 status
= U_MEMORY_ALLOCATION_ERROR
;
223 u_strFromUTF8(fInput
, inputLen
+1, NULL
, confusables
, confusablesLen
, &status
);
226 // Regular Expression to parse a line from Confusables.txt. The expression will match
227 // any line. What was matched is determined by examining which capture groups have a match.
228 // Capture Group 1: the source char
229 // Capture Group 2: the replacement chars
230 // Capture Group 3-6 the table type, SL, SA, ML, or MA
231 // Capture Group 7: A blank or comment only line.
232 // Capture Group 8: A syntactically invalid line. Anything that didn't match before.
233 // Example Line from the confusables.txt source file:
234 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
235 fParseLine
= uregex_openC(
236 "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char
237 "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s)
238 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued)
239 "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type
240 "[ \\t]*(?:#.*?)?$" // Match any trailing #comment
241 "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment
242 "|^(.*?)$", // OR match any line, which catches illegal lines.
245 // Regular expression for parsing a hex number out of a space-separated list of them.
246 // Capture group 1 gets the number, with spaces removed.
247 fParseHexNum
= uregex_openC("\\s*([0-9A-F]+)", 0, NULL
, &status
);
249 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign
250 // given the syntax of the input.
251 if (*fInput
== 0xfeff) {
255 // Parse the input, one line per iteration of this loop.
256 uregex_setText(fParseLine
, fInput
, inputLen
, &status
);
257 while (uregex_findNext(fParseLine
, &status
)) {
259 if (uregex_start(fParseLine
, 7, &status
) >= 0) {
260 // this was a blank or comment line.
263 if (uregex_start(fParseLine
, 8, &status
) >= 0) {
264 // input file syntax error.
265 status
= U_PARSE_ERROR
;
269 // We have a good input line. Extract the key character and mapping string, and
270 // put them into the appropriate mapping table.
271 UChar32 keyChar
= SpoofImpl::ScanHex(fInput
, uregex_start(fParseLine
, 1, &status
),
272 uregex_end(fParseLine
, 1, &status
), status
);
274 int32_t mapStringStart
= uregex_start(fParseLine
, 2, &status
);
275 int32_t mapStringLength
= uregex_end(fParseLine
, 2, &status
) - mapStringStart
;
276 uregex_setText(fParseHexNum
, &fInput
[mapStringStart
], mapStringLength
, &status
);
278 UnicodeString
*mapString
= new UnicodeString();
279 if (mapString
== NULL
) {
280 status
= U_MEMORY_ALLOCATION_ERROR
;
283 while (uregex_findNext(fParseHexNum
, &status
)) {
284 UChar32 c
= SpoofImpl::ScanHex(&fInput
[mapStringStart
], uregex_start(fParseHexNum
, 1, &status
),
285 uregex_end(fParseHexNum
, 1, &status
), status
);
286 mapString
->append(c
);
288 U_ASSERT(mapString
->length() >= 1);
290 // Put the map (value) string into the string pool
291 // This a little like a Java intern() - any duplicates will be eliminated.
292 SPUString
*smapString
= stringPool
->addString(mapString
, status
);
294 // Add the UChar32 -> string mapping to the appropriate table.
295 UHashtable
*table
= uregex_start(fParseLine
, 3, &status
) >= 0 ? fSLTable
:
296 uregex_start(fParseLine
, 4, &status
) >= 0 ? fSATable
:
297 uregex_start(fParseLine
, 5, &status
) >= 0 ? fMLTable
:
298 uregex_start(fParseLine
, 6, &status
) >= 0 ? fMATable
:
300 U_ASSERT(table
!= NULL
);
301 uhash_iput(table
, keyChar
, smapString
, &status
);
302 fKeySet
->add(keyChar
);
303 if (U_FAILURE(status
)) {
308 // Input data is now all parsed and collected.
309 // Now create the run-time binary form of the data.
311 // This is done in two steps. First the data is assembled into vectors and strings,
312 // for ease of construction, then the contents of these collections are dumped
313 // into the actual raw-bytes data storage.
315 // Build up the string array, and record the index of each string therein
316 // in the (build time only) string pool.
317 // Strings of length one are not entered into the strings array.
318 // At the same time, build up the string lengths table, which records the
319 // position in the string table of the first string of each length >= 4.
320 // (Strings in the table are sorted by length)
321 stringPool
->sort(status
);
322 fStringTable
= new UnicodeString();
323 fStringLengthsTable
= new UVector(status
);
324 int32_t previousStringLength
= 0;
325 int32_t previousStringIndex
= 0;
326 int32_t poolSize
= stringPool
->size();
328 for (i
=0; i
<poolSize
; i
++) {
329 SPUString
*s
= stringPool
->getByIndex(i
);
330 int32_t strLen
= s
->fStr
->length();
331 int32_t strIndex
= fStringTable
->length();
332 U_ASSERT(strLen
>= previousStringLength
);
334 // strings of length one do not get an entry in the string table.
335 // Keep the single string character itself here, which is the same
336 // convention that is used in the final run-time string table index.
337 s
->fStrTableIndex
= s
->fStr
->charAt(0);
339 if ((strLen
> previousStringLength
) && (previousStringLength
>= 4)) {
340 fStringLengthsTable
->addElement(previousStringIndex
, status
);
341 fStringLengthsTable
->addElement(previousStringLength
, status
);
343 s
->fStrTableIndex
= strIndex
;
344 fStringTable
->append(*(s
->fStr
));
346 previousStringLength
= strLen
;
347 previousStringIndex
= strIndex
;
349 // Make the final entry to the string lengths table.
350 // (it holds an entry for the _last_ string of each length, so adding the
351 // final one doesn't happen in the main loop because no longer string was encountered.)
352 if (previousStringLength
>= 4) {
353 fStringLengthsTable
->addElement(previousStringIndex
, status
);
354 fStringLengthsTable
->addElement(previousStringLength
, status
);
357 // Construct the compile-time Key and Value tables
359 // For each key code point, check which mapping tables it applies to,
360 // and create the final data for the key & value structures.
362 // The four logical mapping tables are conflated into one combined table.
363 // If multiple logical tables have the same mapping for some key, they
364 // share a single entry in the combined table.
365 // If more than one mapping exists for the same key code point, multiple
366 // entries will be created in the table
368 for (int32_t range
=0; range
<fKeySet
->getRangeCount(); range
++) {
369 // It is an oddity of the UnicodeSet API that simply enumerating the contained
370 // code points requires a nested loop.
371 for (UChar32 keyChar
=fKeySet
->getRangeStart(range
);
372 keyChar
<= fKeySet
->getRangeEnd(range
); keyChar
++) {
373 addKeyEntry(keyChar
, fSLTable
, USPOOF_SL_TABLE_FLAG
, status
);
374 addKeyEntry(keyChar
, fSATable
, USPOOF_SA_TABLE_FLAG
, status
);
375 addKeyEntry(keyChar
, fMLTable
, USPOOF_ML_TABLE_FLAG
, status
);
376 addKeyEntry(keyChar
, fMATable
, USPOOF_MA_TABLE_FLAG
, status
);
380 // Put the assembled data into the flat runtime array
383 // All of the intermediate allocated data belongs to the ConfusabledataBuilder
384 // object (this), and is deleted in the destructor.
389 // outputData The confusable data has been compiled and stored in intermediate
390 // collections and strings. Copy it from there to the final flat
393 // Note that as each section is added to the output data, the
394 // expand (reserveSpace() function will likely relocate it in memory.
395 // Be careful with pointers.
397 void ConfusabledataBuilder::outputData(UErrorCode
&status
) {
399 U_ASSERT(fSpoofImpl
->fSpoofData
->fDataOwned
== TRUE
);
402 // While copying the keys to the runtime array,
403 // also sanity check that they are sorted.
405 int32_t numKeys
= fKeyVec
->size();
407 static_cast<int32_t *>(fSpoofImpl
->fSpoofData
->reserveSpace(numKeys
*sizeof(int32_t), status
));
408 if (U_FAILURE(status
)) {
412 int32_t previousKey
= 0;
413 for (i
=0; i
<numKeys
; i
++) {
414 int32_t key
= fKeyVec
->elementAti(i
);
415 U_ASSERT((key
& 0x00ffffff) >= (previousKey
& 0x00ffffff));
416 U_ASSERT((key
& 0xff000000) != 0);
420 SpoofDataHeader
*rawData
= fSpoofImpl
->fSpoofData
->fRawData
;
421 rawData
->fCFUKeys
= (int32_t)((char *)keys
- (char *)rawData
);
422 rawData
->fCFUKeysSize
= numKeys
;
423 fSpoofImpl
->fSpoofData
->fCFUKeys
= keys
;
426 // The Value Table, parallels the key table
427 int32_t numValues
= fValueVec
->size();
428 U_ASSERT(numKeys
== numValues
);
430 static_cast<uint16_t *>(fSpoofImpl
->fSpoofData
->reserveSpace(numKeys
*sizeof(uint16_t), status
));
431 if (U_FAILURE(status
)) {
434 for (i
=0; i
<numValues
; i
++) {
435 uint32_t value
= static_cast<uint32_t>(fValueVec
->elementAti(i
));
436 U_ASSERT(value
< 0xffff);
437 values
[i
] = static_cast<uint16_t>(value
);
439 rawData
= fSpoofImpl
->fSpoofData
->fRawData
;
440 rawData
->fCFUStringIndex
= (int32_t)((char *)values
- (char *)rawData
);
441 rawData
->fCFUStringIndexSize
= numValues
;
442 fSpoofImpl
->fSpoofData
->fCFUValues
= values
;
444 // The Strings Table.
446 uint32_t stringsLength
= fStringTable
->length();
447 // Reserve an extra space so the string will be nul-terminated. This is
448 // only a convenience, for when debugging; it is not needed otherwise.
450 static_cast<UChar
*>(fSpoofImpl
->fSpoofData
->reserveSpace(stringsLength
*sizeof(UChar
)+2, status
));
451 if (U_FAILURE(status
)) {
454 fStringTable
->extract(strings
, stringsLength
+1, status
);
455 rawData
= fSpoofImpl
->fSpoofData
->fRawData
;
456 U_ASSERT(rawData
->fCFUStringTable
== 0);
457 rawData
->fCFUStringTable
= (int32_t)((char *)strings
- (char *)rawData
);
458 rawData
->fCFUStringTableLen
= stringsLength
;
459 fSpoofImpl
->fSpoofData
->fCFUStrings
= strings
;
461 // The String Lengths Table
462 // While copying into the runtime array do some sanity checks on the values
463 // Each complete entry contains two fields, an index and an offset.
464 // Lengths should increase with each entry.
465 // Offsets should be less than the size of the string table.
466 int32_t lengthTableLength
= fStringLengthsTable
->size();
467 uint16_t *stringLengths
=
468 static_cast<uint16_t *>(fSpoofImpl
->fSpoofData
->reserveSpace(lengthTableLength
*sizeof(uint16_t), status
));
469 if (U_FAILURE(status
)) {
472 int32_t destIndex
= 0;
473 uint32_t previousLength
= 0;
474 for (i
=0; i
<lengthTableLength
; i
+=2) {
475 uint32_t offset
= static_cast<uint32_t>(fStringLengthsTable
->elementAti(i
));
476 uint32_t length
= static_cast<uint32_t>(fStringLengthsTable
->elementAti(i
+1));
477 U_ASSERT(offset
< stringsLength
);
478 U_ASSERT(length
< 40);
479 U_ASSERT(length
> previousLength
);
480 stringLengths
[destIndex
++] = static_cast<uint16_t>(offset
);
481 stringLengths
[destIndex
++] = static_cast<uint16_t>(length
);
482 previousLength
= length
;
484 rawData
= fSpoofImpl
->fSpoofData
->fRawData
;
485 rawData
->fCFUStringLengths
= (int32_t)((char *)stringLengths
- (char *)rawData
);
486 // Note: StringLengthsSize in the raw data is the number of complete entries,
487 // each consisting of a pair of 16 bit values, hence the divide by 2.
488 rawData
->fCFUStringLengthsSize
= lengthTableLength
/ 2;
489 fSpoofImpl
->fSpoofData
->fCFUStringLengths
=
490 reinterpret_cast<SpoofStringLengthsElement
*>(stringLengths
);
495 // addKeyEntry Construction of the confusable Key and Mapping Values tables.
496 // This is an intermediate point in the building process.
497 // We already have the mappings in the hash tables fSLTable, etc.
498 // This function builds corresponding run-time style table entries into
499 // fKeyVec and fValueVec
501 void ConfusabledataBuilder::addKeyEntry(
502 UChar32 keyChar
, // The key character
503 UHashtable
*table
, // The table, one of SATable, MATable, etc.
504 int32_t tableFlag
, // One of USPOOF_SA_TABLE_FLAG, etc.
505 UErrorCode
&status
) {
507 SPUString
*targetMapping
= static_cast<SPUString
*>(uhash_iget(table
, keyChar
));
508 if (targetMapping
== NULL
) {
509 // No mapping for this key character.
510 // (This function is called for all four tables for each key char that
511 // is seen anywhere, so this no entry cases are very much expected.)
515 // Check whether there is already an entry with the correct mapping.
516 // If so, simply set the flag in the keyTable saying that the existing entry
517 // applies to the table that we're doing now.
519 UBool keyHasMultipleValues
= FALSE
;
521 for (i
=fKeyVec
->size()-1; i
>=0 ; i
--) {
522 int32_t key
= fKeyVec
->elementAti(i
);
523 if ((key
& 0x0ffffff) != keyChar
) {
524 // We have now checked all existing key entries for this key char (if any)
525 // without finding one with the same mapping.
528 UnicodeString mapping
= getMapping(i
);
529 if (mapping
== *(targetMapping
->fStr
)) {
530 // The run time entry we are currently testing has the correct mapping.
531 // Set the flag in it indicating that it applies to the new table also.
533 fKeyVec
->setElementAt(key
, i
);
536 keyHasMultipleValues
= TRUE
;
539 // Need to add a new entry to the binary data being built for this mapping.
540 // Includes adding entries to both the key table and the parallel values table.
542 int32_t newKey
= keyChar
| tableFlag
;
543 if (keyHasMultipleValues
) {
544 newKey
|= USPOOF_KEY_MULTIPLE_VALUES
;
546 int32_t adjustedMappingLength
= targetMapping
->fStr
->length() - 1;
547 if (adjustedMappingLength
>3) {
548 adjustedMappingLength
= 3;
550 newKey
|= adjustedMappingLength
<< USPOOF_KEY_LENGTH_SHIFT
;
552 int32_t newData
= targetMapping
->fStrTableIndex
;
554 fKeyVec
->addElement(newKey
, status
);
555 fValueVec
->addElement(newData
, status
);
557 // If the preceding key entry is for the same key character (but with a different mapping)
558 // set the multiple-values flag on it.
559 if (keyHasMultipleValues
) {
560 int32_t previousKeyIndex
= fKeyVec
->size() - 2;
561 int32_t previousKey
= fKeyVec
->elementAti(previousKeyIndex
);
562 previousKey
|= USPOOF_KEY_MULTIPLE_VALUES
;
563 fKeyVec
->setElementAt(previousKey
, previousKeyIndex
);
569 UnicodeString
ConfusabledataBuilder::getMapping(int32_t index
) {
570 int32_t key
= fKeyVec
->elementAti(index
);
571 int32_t value
= fValueVec
->elementAti(index
);
572 int32_t length
= USPOOF_KEY_LENGTH_FIELD(key
);
573 int32_t lastIndexWithLen
;
576 return UnicodeString(static_cast<UChar
>(value
));
579 return UnicodeString(*fStringTable
, value
, length
+1);
583 for (i
=0; i
<fStringLengthsTable
->size(); i
+=2) {
584 lastIndexWithLen
= fStringLengthsTable
->elementAti(i
);
585 if (value
<= lastIndexWithLen
) {
586 length
= fStringLengthsTable
->elementAti(i
+1);
591 return UnicodeString(*fStringTable
, value
, length
);
595 return UnicodeString();
599 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS