1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
6 * Copyright (C) 2008-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ******************************************************************************
10 * file name: uspoof_conf.cpp
12 * tab size: 8 (not used)
15 * created on: 2009Jan05 (refactoring earlier files)
16 * created by: Andy Heninger
18 * Internal classes for compiling confusable data into its binary (runtime) form.
21 #include "unicode/utypes.h"
22 #include "unicode/uspoof.h"
23 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
24 #if !UCONFIG_NO_NORMALIZATION
26 #include "unicode/unorm.h"
27 #include "unicode/uregex.h"
28 #include "unicode/ustring.h"
30 #include "uspoof_impl.h"
35 #include "uspoof_conf.h"
40 //---------------------------------------------------------------------
42 // buildConfusableData Compile the source confusable data, as defined by
43 // the Unicode data file confusables.txt, into the binary
44 // structures used by the confusable detector.
46 // The binary structures are described in uspoof_impl.h
48 // 1. Parse the data, making a hash table mapping from a UChar32 to a String.
50 // 2. Sort all of the strings encountered by length, since they will need to
51 // be stored in that order in the final string table.
52 // TODO: Sorting these strings by length is no longer needed since the removal of
53 // the string lengths table. This logic can be removed to save processing time
54 // when building confusables data.
56 // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the
57 // list because that will be the ordering of our runtime table.
59 // 4. Generate the run time string table. This is generated before the key & value
60 // tables because we need the string indexes when building those tables.
62 // 5. Build the run-time key and value tables. These are parallel tables, and are built
// SPUString constructor. Receives a pointer to the UnicodeString that this pool
// entry represents and starts fCharOrStrTableIndex at 0.
// NOTE(review): the statement that stores s is not visible in this view; presumably
// the pointer is kept in fStr (other code reads entry->fStr) - confirm.
66 SPUString::SPUString(UnicodeString
*s
) {
68 fCharOrStrTableIndex
= 0;
// SPUString destructor.
// NOTE(review): the body is not visible in this view; presumably it releases the
// wrapped string - confirm against the full file.
72 SPUString::~SPUString() {
// SPUStringPool constructor.
// Allocates the UVector that holds the pool entries (fVec) and opens the hash
// table (fHash), keyed by UnicodeString, that addString() consults to find an
// existing entry for a string.
// status is set to U_MEMORY_ALLOCATION_ERROR on the failure path (presumably when
// the UVector cannot be allocated; the guarding condition is not visible here).
77 SPUStringPool::SPUStringPool(UErrorCode
&status
) : fVec(NULL
), fHash(NULL
) {
78 fVec
= new UVector(status
);
80 status
= U_MEMORY_ALLOCATION_ERROR
;
83 fHash
= uhash_open(uhash_hashUnicodeString
, // key hash function
84 uhash_compareUnicodeString
, // Key Comparator
85 NULL
, // Value Comparator
// SPUStringPool destructor.
// Walks fVec from the last element down to the first, fetching each SPUString entry.
// NOTE(review): the statements that release each entry, the vector and the hash
// table are not visible in this view - confirm against the full file.
90 SPUStringPool::~SPUStringPool() {
92 for (i
=fVec
->size()-1; i
>=0; i
--) {
93 SPUString
*s
= static_cast<SPUString
*>(fVec
->elementAt(i
));
// Size of the pool. build() uses the result as the exclusive upper bound when
// iterating with getByIndex().
// NOTE(review): the body is not visible in this view (presumably fVec->size()) - confirm.
101 int32_t SPUStringPool::size() {
105 SPUString
*SPUStringPool::getByIndex(int32_t index
) {
106 SPUString
*retString
= (SPUString
*)fVec
->elementAt(index
);
111 // Comparison function for ordering strings in the string pool.
112 // Compare by length first, then, within a group of the same length,
113 // by code point order.
114 // Conforms to the type signature for a USortComparator in uvector.h
116 static int8_t U_CALLCONV
SPUStringCompare(UHashTok left
, UHashTok right
) {
117 const SPUString
*sL
= const_cast<const SPUString
*>(
118 static_cast<SPUString
*>(left
.pointer
));
119 const SPUString
*sR
= const_cast<const SPUString
*>(
120 static_cast<SPUString
*>(right
.pointer
));
121 int32_t lenL
= sL
->fStr
->length();
122 int32_t lenR
= sR
->fStr
->length();
125 } else if (lenL
> lenR
) {
128 return sL
->fStr
->compare(*(sR
->fStr
));
132 void SPUStringPool::sort(UErrorCode
&status
) {
133 fVec
->sort(SPUStringCompare
, status
);
// Add a string to the pool and obtain the pool entry (SPUString) for it.
// The hash table is consulted first for an entry equal to src. When none is
// found, a new SPUString wrapping src is created, registered in fHash under src,
// and appended to fVec.
// status is set to U_MEMORY_ALLOCATION_ERROR if the new SPUString cannot be allocated.
// NOTE(review): the branch taken when an existing entry IS found, and the return
// statements, are not visible in this view - in particular, confirm what happens
// to src in the duplicate case and on the allocation-failure path.
137 SPUString
*SPUStringPool::addString(UnicodeString
*src
, UErrorCode
&status
) {
138 SPUString
*hashedString
= static_cast<SPUString
*>(uhash_get(fHash
, src
));
139 if (hashedString
!= NULL
) {
142 hashedString
= new SPUString(src
);
143 if (hashedString
== NULL
) {
144 status
= U_MEMORY_ALLOCATION_ERROR
;
147 uhash_put(fHash
, src
, hashedString
, &status
);
148 fVec
->addElement(hashedString
, status
);
// ConfusabledataBuilder constructor.
// Creates the working collections used while compiling the confusable data:
//   fTable      hash table with integer keys (uhash_hashLong / uhash_compareLong);
//               build() stores the code point -> SPUString mapping in it.
//   fKeySet     UnicodeSet collecting the key code points.
//   fKeyVec     UVector receiving the run-time keys.
//   fValueVec   UVector receiving the run-time values.
//   stringPool  SPUStringPool holding the mapping strings.
// Nothing is created if status already indicates failure on entry; each failed
// allocation sets status to U_MEMORY_ALLOCATION_ERROR.
// NOTE(review): the member initializer list is not visible in this view.
155 ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl
*spImpl
, UErrorCode
&status
) :
168 if (U_FAILURE(status
)) {
172 fTable
= uhash_open(uhash_hashLong
, uhash_compareLong
, NULL
, &status
);
174 fKeySet
= new UnicodeSet();
175 if (fKeySet
== NULL
) {
176 status
= U_MEMORY_ALLOCATION_ERROR
;
180 fKeyVec
= new UVector(status
);
181 if (fKeyVec
== NULL
) {
182 status
= U_MEMORY_ALLOCATION_ERROR
;
186 fValueVec
= new UVector(status
);
187 if (fValueVec
== NULL
) {
188 status
= U_MEMORY_ALLOCATION_ERROR
;
192 stringPool
= new SPUStringPool(status
);
193 if (stringPool
== NULL
) {
194 status
= U_MEMORY_ALLOCATION_ERROR
;
// ConfusabledataBuilder destructor.
// Closes the two compiled regular expressions opened by build()
// (fParseLine and fParseHexNum).
// NOTE(review): release of the other members is not visible in this view; the
// comment at the end of build() states that all intermediate data is deleted here.
200 ConfusabledataBuilder::~ConfusabledataBuilder() {
202 uregex_close(fParseLine
);
203 uregex_close(fParseHexNum
);
213 void ConfusabledataBuilder::buildConfusableData(SpoofImpl
* spImpl
, const char * confusables
,
214 int32_t confusablesLen
, int32_t *errorType
, UParseError
*pe
, UErrorCode
&status
) {
216 if (U_FAILURE(status
)) {
219 ConfusabledataBuilder
builder(spImpl
, status
);
220 builder
.build(confusables
, confusablesLen
, status
);
221 if (U_FAILURE(status
) && errorType
!= NULL
) {
222 *errorType
= USPOOF_SINGLE_SCRIPT_CONFUSABLE
;
223 pe
->line
= builder
.fLineNum
;
// build   Parse the confusables source text and assemble the intermediate key,
//         value and string collections, from which the run-time data is produced.
//
//   confusables     source text (UTF-8); converted to UTF-16 and kept in fInput.
//   confusablesLen  length of confusables, passed through to u_strFromUTF8.
//   status          ICU error code. Set to U_PARSE_ERROR when a line matches only
//                   the "invalid line" alternative of the parsing regex, to
//                   U_MEMORY_ALLOCATION_ERROR on allocation failure, and to
//                   U_ILLEGAL_ARGUMENT_ERROR when a mapping string is longer than 256.
228 void ConfusabledataBuilder::build(const char * confusables
, int32_t confusablesLen
,
229 UErrorCode
&status
) {
231 // Convert the user input data from UTF-8 to UChar (UTF-16)
232 int32_t inputLen
= 0;
233 if (U_FAILURE(status
)) {
// Preflight call: with a NULL destination of capacity 0 this obtains the required
// UTF-16 length in inputLen; U_BUFFER_OVERFLOW_ERROR is the expected outcome.
236 u_strFromUTF8(NULL
, 0, &inputLen
, confusables
, confusablesLen
, &status
);
237 if (status
!= U_BUFFER_OVERFLOW_ERROR
) {
240 status
= U_ZERO_ERROR
;
// The buffer is one UChar larger than inputLen.
241 fInput
= static_cast<UChar
*>(uprv_malloc((inputLen
+1) * sizeof(UChar
)));
242 if (fInput
== NULL
) {
243 status
= U_MEMORY_ALLOCATION_ERROR
;
246 u_strFromUTF8(fInput
, inputLen
+1, NULL
, confusables
, confusablesLen
, &status
);
249 // Regular Expression to parse a line from Confusables.txt. The expression will match
250 // any line. What was matched is determined by examining which capture groups have a match.
251 // Capture Group 1: the source char
252 // Capture Group 2: the replacement chars
253 // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated)
254 // Capture Group 7: A blank or comment only line.
255 // Capture Group 8: A syntactically invalid line. Anything that didn't match before.
256 // Example Line from the confusables.txt source file:
257 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
258 UnicodeString
pattern(
259 "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char
260 "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s)
261 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued)
262 "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type
263 "[ \\t]*(?:#.*?)?$" // Match any trailing #comment
264 "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment
265 "|^(.*?)$", -1, US_INV
); // OR match any line, which catches illegal lines.
266 // TODO: Why are we using the regex C API here? C++ would just take UnicodeString...
267 fParseLine
= uregex_open(pattern
.getBuffer(), pattern
.length(), 0, NULL
, &status
);
269 // Regular expression for parsing a hex number out of a space-separated list of them.
270 // Capture group 1 gets the number, with spaces removed.
// NOTE(review): this pattern accepts upper-case hex digits only ([0-9A-F]), while the
// line pattern above also accepts a-f - confirm whether lower-case input is expected.
271 pattern
= UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)");
272 fParseHexNum
= uregex_open(pattern
.getBuffer(), pattern
.length(), 0, NULL
, &status
);
274 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign
275 // given the syntax of the input.
276 if (*fInput
== 0xfeff) {
280 // Parse the input, one line per iteration of this loop.
281 uregex_setText(fParseLine
, fInput
, inputLen
, &status
);
282 while (uregex_findNext(fParseLine
, &status
)) {
284 if (uregex_start(fParseLine
, 7, &status
) >= 0) {
285 // this was a blank or comment line.
288 if (uregex_start(fParseLine
, 8, &status
) >= 0) {
289 // input file syntax error.
290 status
= U_PARSE_ERROR
;
294 // We have a good input line. Extract the key character and mapping string, and
295 // put them into the appropriate mapping table.
296 UChar32 keyChar
= SpoofImpl::ScanHex(fInput
, uregex_start(fParseLine
, 1, &status
),
297 uregex_end(fParseLine
, 1, &status
), status
);
// Capture group 2 holds the space-separated replacement code points; the second
// regex is pointed at just that slice of fInput, so the offsets it reports are
// relative to &fInput[mapStringStart].
299 int32_t mapStringStart
= uregex_start(fParseLine
, 2, &status
);
300 int32_t mapStringLength
= uregex_end(fParseLine
, 2, &status
) - mapStringStart
;
301 uregex_setText(fParseHexNum
, &fInput
[mapStringStart
], mapStringLength
, &status
);
303 UnicodeString
*mapString
= new UnicodeString();
304 if (mapString
== NULL
) {
305 status
= U_MEMORY_ALLOCATION_ERROR
;
308 while (uregex_findNext(fParseHexNum
, &status
)) {
309 UChar32 c
= SpoofImpl::ScanHex(&fInput
[mapStringStart
], uregex_start(fParseHexNum
, 1, &status
),
310 uregex_end(fParseHexNum
, 1, &status
), status
);
311 mapString
->append(c
);
313 U_ASSERT(mapString
->length() >= 1);
315 // Put the map (value) string into the string pool
316 // This is a little like a Java intern() - any duplicates will be eliminated.
317 SPUString
*smapString
= stringPool
->addString(mapString
, status
);
319 // Add the UChar32 -> string mapping to the table.
320 // For Unicode 8, the SL, SA and ML tables have been discontinued.
321 // All input data from confusables.txt is tagged MA.
322 uhash_iput(fTable
, keyChar
, smapString
, &status
);
323 if (U_FAILURE(status
)) { return; }
324 fKeySet
->add(keyChar
);
327 // Input data is now all parsed and collected.
328 // Now create the run-time binary form of the data.
330 // This is done in two steps. First the data is assembled into vectors and strings,
331 // for ease of construction, then the contents of these collections are dumped
332 // into the actual raw-bytes data storage.
334 // Build up the string array, and record the index of each string therein
335 // in the (build time only) string pool.
336 // Strings of length one are not entered into the strings array.
337 // (Strings in the table are sorted by length)
338 stringPool
->sort(status
);
// NOTE(review): no NULL check on this allocation is visible in this view before
// fStringTable is dereferenced below - confirm against the full file.
339 fStringTable
= new UnicodeString();
340 int32_t poolSize
= stringPool
->size();
342 for (i
=0; i
<poolSize
; i
++) {
343 SPUString
*s
= stringPool
->getByIndex(i
);
344 int32_t strLen
= s
->fStr
->length();
// strIndex is the offset at which this string would start in the string table,
// i.e. the table's length before the string is appended.
345 int32_t strIndex
= fStringTable
->length();
347 // strings of length one do not get an entry in the string table.
348 // Keep the single string character itself here, which is the same
349 // convention that is used in the final run-time string table index.
350 s
->fCharOrStrTableIndex
= s
->fStr
->charAt(0);
352 s
->fCharOrStrTableIndex
= strIndex
;
353 fStringTable
->append(*(s
->fStr
));
357 // Construct the compile-time Key and Value tables
359 // For each key code point, check which mapping tables it applies to,
360 // and create the final data for the key & value structures.
362 // The four logical mapping tables are conflated into one combined table.
363 // If multiple logical tables have the same mapping for some key, they
364 // share a single entry in the combined table.
365 // If more than one mapping exists for the same key code point, multiple
366 // entries will be created in the table
368 for (int32_t range
=0; range
<fKeySet
->getRangeCount(); range
++) {
369 // It is an oddity of the UnicodeSet API that simply enumerating the contained
370 // code points requires a nested loop.
371 for (UChar32 keyChar
=fKeySet
->getRangeStart(range
);
372 keyChar
<= fKeySet
->getRangeEnd(range
); keyChar
++) {
373 SPUString
*targetMapping
= static_cast<SPUString
*>(uhash_iget(fTable
, keyChar
));
374 U_ASSERT(targetMapping
!= NULL
);
376 // Set an error code if trying to consume a long string. Otherwise,
377 // codePointAndLengthToKey will abort on a U_ASSERT.
378 if (targetMapping
->fStr
->length() > 256) {
379 status
= U_ILLEGAL_ARGUMENT_ERROR
;
// The key combines the code point with the length of its mapping string; the
// value is either the single mapped character or an index into the string table
// (see fCharOrStrTableIndex above).
383 int32_t key
= ConfusableDataUtils::codePointAndLengthToKey(keyChar
,
384 targetMapping
->fStr
->length());
385 int32_t value
= targetMapping
->fCharOrStrTableIndex
;
387 fKeyVec
->addElement(key
, status
);
388 fValueVec
->addElement(value
, status
);
392 // Put the assembled data into the flat runtime array
395 // All of the intermediate allocated data belongs to the ConfusabledataBuilder
396 // object (this), and is deleted in the destructor.
401 // outputData The confusable data has been compiled and stored in intermediate
402 // collections and strings. Copy it from there to the final flat
405 // Note that as each section is added to the output data, the
406 // expand (reserveSpace()) function will likely relocate it in memory.
407 // Be careful with pointers.
// outputData   Copy the assembled keys, values and strings into the flat run-time
//              data held by fSpoofImpl->fSpoofData. For each of the three sections,
//              space is reserved with reserveSpace(), the section is filled, and its
//              byte offset from the start of the raw data and its size are recorded
//              in the SpoofDataHeader.
//
//   status     ICU error code; checked after each reserveSpace() call.
//
// NOTE(review): the declarations of keys, values and strings, and the statement
// storing each key, are not visible in this view.
409 void ConfusabledataBuilder::outputData(UErrorCode
&status
) {
411 U_ASSERT(fSpoofImpl
->fSpoofData
->fDataOwned
== TRUE
);
414 // While copying the keys to the runtime array,
415 // also sanity check that they are sorted.
417 int32_t numKeys
= fKeyVec
->size();
419 static_cast<int32_t *>(fSpoofImpl
->fSpoofData
->reserveSpace(numKeys
*sizeof(int32_t), status
));
420 if (U_FAILURE(status
)) {
424 UChar32 previousCodePoint
= 0;
425 for (i
=0; i
<numKeys
; i
++) {
426 int32_t key
= fKeyVec
->elementAti(i
);
427 UChar32 codePoint
= ConfusableDataUtils::keyToCodePoint(key
);
428 (void)previousCodePoint
; // Suppress unused variable warning.
429 // strictly greater because there can be only one entry per code point
430 U_ASSERT(codePoint
> previousCodePoint
);
432 previousCodePoint
= codePoint
;
// Section offsets are stored as byte distances from the start of the raw data.
434 SpoofDataHeader
*rawData
= fSpoofImpl
->fSpoofData
->fRawData
;
435 rawData
->fCFUKeys
= (int32_t)((char *)keys
- (char *)rawData
);
436 rawData
->fCFUKeysSize
= numKeys
;
437 fSpoofImpl
->fSpoofData
->fCFUKeys
= keys
;
440 // The Value Table, parallels the key table
441 int32_t numValues
= fValueVec
->size();
442 U_ASSERT(numKeys
== numValues
);
// Space is sized with numKeys; numKeys == numValues is asserted just above.
444 static_cast<uint16_t *>(fSpoofImpl
->fSpoofData
->reserveSpace(numKeys
*sizeof(uint16_t), status
));
445 if (U_FAILURE(status
)) {
448 for (i
=0; i
<numValues
; i
++) {
449 uint32_t value
= static_cast<uint32_t>(fValueVec
->elementAti(i
));
// NOTE(review): the assert also excludes 0xffff itself, although that value fits
// in a uint16_t - confirm whether it is reserved.
450 U_ASSERT(value
< 0xffff);
451 values
[i
] = static_cast<uint16_t>(value
);
// fRawData is fetched again here rather than reusing the earlier pointer, since
// the reserveSpace() call above may have relocated the data.
453 rawData
= fSpoofImpl
->fSpoofData
->fRawData
;
454 rawData
->fCFUStringIndex
= (int32_t)((char *)values
- (char *)rawData
);
455 rawData
->fCFUStringIndexSize
= numValues
;
456 fSpoofImpl
->fSpoofData
->fCFUValues
= values
;
458 // The Strings Table.
460 uint32_t stringsLength
= fStringTable
->length();
461 // Reserve an extra space so the string will be nul-terminated. This is
462 // only a convenience, for when debugging; it is not needed otherwise.
464 static_cast<UChar
*>(fSpoofImpl
->fSpoofData
->reserveSpace(stringsLength
*sizeof(UChar
)+2, status
));
465 if (U_FAILURE(status
)) {
468 fStringTable
->extract(strings
, stringsLength
+1, status
);
// Fetched again after the reserveSpace() call above, for the same reason as before.
469 rawData
= fSpoofImpl
->fSpoofData
->fRawData
;
470 U_ASSERT(rawData
->fCFUStringTable
== 0);
471 rawData
->fCFUStringTable
= (int32_t)((char *)strings
- (char *)rawData
);
472 rawData
->fCFUStringTableLen
= stringsLength
;
473 fSpoofImpl
->fSpoofData
->fCFUStrings
= strings
;
477 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS