]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/uspoof_conf.cpp
ICU-461.18.tar.gz
[apple/icu.git] / icuSources / i18n / uspoof_conf.cpp
1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2008-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: uspoof_conf.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2009Jan05 (refactoring earlier files)
14 * created by: Andy Heninger
15 *
16 * Internal classes for compililing confusable data into its binary (runtime) form.
17 */
18
19 #include "unicode/utypes.h"
20 #include "unicode/uspoof.h"
21 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
22 #if !UCONFIG_NO_NORMALIZATION
23
24 #include "unicode/unorm.h"
25 #include "unicode/uregex.h"
26 #include "unicode/ustring.h"
27 #include "cmemory.h"
28 #include "uspoof_impl.h"
29 #include "uhash.h"
30 #include "uvector.h"
31 #include "uassert.h"
32 #include "uarrsort.h"
33 #include "uspoof_conf.h"
34
35 U_NAMESPACE_USE
36
37
38 //---------------------------------------------------------------------
39 //
40 // buildConfusableData Compile the source confusable data, as defined by
41 // the Unicode data file confusables.txt, into the binary
42 // structures used by the confusable detector.
43 //
44 // The binary structures are described in uspoof_impl.h
45 //
46 // 1. parse the data, building 4 hash tables, one each for the SL, SA, ML and MA
47 // tables. Each maps from a UChar32 to a String.
48 //
49 // 2. Sort all of the strings encountered by length, since they will need to
50 // be stored in that order in the final string table.
51 //
52 // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the
53 // list because that will be the ordering of our runtime table.
54 //
55 // 4. Generate the run time string table. This is generated before the key & value
56 // tables because we need the string indexes when building those tables.
57 //
58 // 5. Build the run-time key and value tables. These are parallel tables, and are built
59 // at the same time
60 //
61
62 SPUString::SPUString(UnicodeString *s) {
63 fStr = s;
64 fStrTableIndex = 0;
65 }
66
67
68 SPUString::~SPUString() {
69 delete fStr;
70 }
71
72
73 SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(NULL), fHash(NULL) {
74 fVec = new UVector(status);
75 fHash = uhash_open(uhash_hashUnicodeString, // key hash function
76 uhash_compareUnicodeString, // Key Comparator
77 NULL, // Value Comparator
78 &status);
79 }
80
81
82 SPUStringPool::~SPUStringPool() {
83 int i;
84 for (i=fVec->size()-1; i>=0; i--) {
85 SPUString *s = static_cast<SPUString *>(fVec->elementAt(i));
86 delete s;
87 }
88 delete fVec;
89 uhash_close(fHash);
90 }
91
92
93 int32_t SPUStringPool::size() {
94 return fVec->size();
95 }
96
97 SPUString *SPUStringPool::getByIndex(int32_t index) {
98 SPUString *retString = (SPUString *)fVec->elementAt(index);
99 return retString;
100 }
101
102
103 // Comparison function for ordering strings in the string pool.
104 // Compare by length first, then, within a group of the same length,
105 // by code point order.
106 // Conforms to the type signature for a USortComparator in uvector.h
107
108 static int8_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) {
109 const SPUString *sL = const_cast<const SPUString *>(
110 static_cast<SPUString *>(left.pointer));
111 const SPUString *sR = const_cast<const SPUString *>(
112 static_cast<SPUString *>(right.pointer));
113 int32_t lenL = sL->fStr->length();
114 int32_t lenR = sR->fStr->length();
115 if (lenL < lenR) {
116 return -1;
117 } else if (lenL > lenR) {
118 return 1;
119 } else {
120 return sL->fStr->compare(*(sR->fStr));
121 }
122 }
123
124 void SPUStringPool::sort(UErrorCode &status) {
125 fVec->sort(SPUStringCompare, status);
126 }
127
128
129 SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) {
130 SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src));
131 if (hashedString != NULL) {
132 delete src;
133 } else {
134 hashedString = new SPUString(src);
135 uhash_put(fHash, src, hashedString, &status);
136 fVec->addElement(hashedString, status);
137 }
138 return hashedString;
139 }
140
141
142
143 ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
144 fSpoofImpl(spImpl),
145 fInput(NULL),
146 fSLTable(NULL),
147 fSATable(NULL),
148 fMLTable(NULL),
149 fMATable(NULL),
150 fKeySet(NULL),
151 fKeyVec(NULL),
152 fValueVec(NULL),
153 fStringTable(NULL),
154 fStringLengthsTable(NULL),
155 stringPool(NULL),
156 fParseLine(NULL),
157 fParseHexNum(NULL),
158 fLineNum(0)
159 {
160 if (U_FAILURE(status)) {
161 return;
162 }
163 fSLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
164 fSATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
165 fMLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
166 fMATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
167 fKeySet = new UnicodeSet();
168 fKeyVec = new UVector(status);
169 fValueVec = new UVector(status);
170 stringPool = new SPUStringPool(status);
171 }
172
173
174 ConfusabledataBuilder::~ConfusabledataBuilder() {
175 uprv_free(fInput);
176 uregex_close(fParseLine);
177 uregex_close(fParseHexNum);
178 uhash_close(fSLTable);
179 uhash_close(fSATable);
180 uhash_close(fMLTable);
181 uhash_close(fMATable);
182 delete fKeySet;
183 delete fKeyVec;
184 delete fStringTable;
185 delete fStringLengthsTable;
186 delete fValueVec;
187 delete stringPool;
188 }
189
190
191 void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables,
192 int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) {
193
194 if (U_FAILURE(status)) {
195 return;
196 }
197 ConfusabledataBuilder builder(spImpl, status);
198 builder.build(confusables, confusablesLen, status);
199 if (U_FAILURE(status) && errorType != NULL) {
200 *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
201 pe->line = builder.fLineNum;
202 }
203 }
204
205
206 void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen,
207 UErrorCode &status) {
208
209 // Convert the user input data from UTF-8 to UChar (UTF-16)
210 int32_t inputLen = 0;
211 if (U_FAILURE(status)) {
212 return;
213 }
214 u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status);
215 if (status != U_BUFFER_OVERFLOW_ERROR) {
216 return;
217 }
218 status = U_ZERO_ERROR;
219 fInput = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
220 if (fInput == NULL) {
221 status = U_MEMORY_ALLOCATION_ERROR;
222 }
223 u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status);
224
225
226 // Regular Expression to parse a line from Confusables.txt. The expression will match
227 // any line. What was matched is determined by examining which capture groups have a match.
228 // Capture Group 1: the source char
229 // Capture Group 2: the replacement chars
230 // Capture Group 3-6 the table type, SL, SA, ML, or MA
231 // Capture Group 7: A blank or comment only line.
232 // Capture Group 8: A syntactically invalid line. Anything that didn't match before.
233 // Example Line from the confusables.txt source file:
234 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
235 fParseLine = uregex_openC(
236 "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char
237 "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s)
238 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued)
239 "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type
240 "[ \\t]*(?:#.*?)?$" // Match any trailing #comment
241 "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment
242 "|^(.*?)$", // OR match any line, which catches illegal lines.
243 0, NULL, &status);
244
245 // Regular expression for parsing a hex number out of a space-separated list of them.
246 // Capture group 1 gets the number, with spaces removed.
247 fParseHexNum = uregex_openC("\\s*([0-9A-F]+)", 0, NULL, &status);
248
249 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign
250 // given the syntax of the input.
251 if (*fInput == 0xfeff) {
252 *fInput = 0x20;
253 }
254
255 // Parse the input, one line per iteration of this loop.
256 uregex_setText(fParseLine, fInput, inputLen, &status);
257 while (uregex_findNext(fParseLine, &status)) {
258 fLineNum++;
259 if (uregex_start(fParseLine, 7, &status) >= 0) {
260 // this was a blank or comment line.
261 continue;
262 }
263 if (uregex_start(fParseLine, 8, &status) >= 0) {
264 // input file syntax error.
265 status = U_PARSE_ERROR;
266 return;
267 }
268
269 // We have a good input line. Extract the key character and mapping string, and
270 // put them into the appropriate mapping table.
271 UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status),
272 uregex_end(fParseLine, 1, &status), status);
273
274 int32_t mapStringStart = uregex_start(fParseLine, 2, &status);
275 int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart;
276 uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status);
277
278 UnicodeString *mapString = new UnicodeString();
279 if (mapString == NULL) {
280 status = U_MEMORY_ALLOCATION_ERROR;
281 return;
282 }
283 while (uregex_findNext(fParseHexNum, &status)) {
284 UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status),
285 uregex_end(fParseHexNum, 1, &status), status);
286 mapString->append(c);
287 }
288 U_ASSERT(mapString->length() >= 1);
289
290 // Put the map (value) string into the string pool
291 // This a little like a Java intern() - any duplicates will be eliminated.
292 SPUString *smapString = stringPool->addString(mapString, status);
293
294 // Add the UChar32 -> string mapping to the appropriate table.
295 UHashtable *table = uregex_start(fParseLine, 3, &status) >= 0 ? fSLTable :
296 uregex_start(fParseLine, 4, &status) >= 0 ? fSATable :
297 uregex_start(fParseLine, 5, &status) >= 0 ? fMLTable :
298 uregex_start(fParseLine, 6, &status) >= 0 ? fMATable :
299 NULL;
300 U_ASSERT(table != NULL);
301 uhash_iput(table, keyChar, smapString, &status);
302 fKeySet->add(keyChar);
303 if (U_FAILURE(status)) {
304 return;
305 }
306 }
307
308 // Input data is now all parsed and collected.
309 // Now create the run-time binary form of the data.
310 //
311 // This is done in two steps. First the data is assembled into vectors and strings,
312 // for ease of construction, then the contents of these collections are dumped
313 // into the actual raw-bytes data storage.
314
315 // Build up the string array, and record the index of each string therein
316 // in the (build time only) string pool.
317 // Strings of length one are not entered into the strings array.
318 // At the same time, build up the string lengths table, which records the
319 // position in the string table of the first string of each length >= 4.
320 // (Strings in the table are sorted by length)
321 stringPool->sort(status);
322 fStringTable = new UnicodeString();
323 fStringLengthsTable = new UVector(status);
324 int32_t previousStringLength = 0;
325 int32_t previousStringIndex = 0;
326 int32_t poolSize = stringPool->size();
327 int32_t i;
328 for (i=0; i<poolSize; i++) {
329 SPUString *s = stringPool->getByIndex(i);
330 int32_t strLen = s->fStr->length();
331 int32_t strIndex = fStringTable->length();
332 U_ASSERT(strLen >= previousStringLength);
333 if (strLen == 1) {
334 // strings of length one do not get an entry in the string table.
335 // Keep the single string character itself here, which is the same
336 // convention that is used in the final run-time string table index.
337 s->fStrTableIndex = s->fStr->charAt(0);
338 } else {
339 if ((strLen > previousStringLength) && (previousStringLength >= 4)) {
340 fStringLengthsTable->addElement(previousStringIndex, status);
341 fStringLengthsTable->addElement(previousStringLength, status);
342 }
343 s->fStrTableIndex = strIndex;
344 fStringTable->append(*(s->fStr));
345 }
346 previousStringLength = strLen;
347 previousStringIndex = strIndex;
348 }
349 // Make the final entry to the string lengths table.
350 // (it holds an entry for the _last_ string of each length, so adding the
351 // final one doesn't happen in the main loop because no longer string was encountered.)
352 if (previousStringLength >= 4) {
353 fStringLengthsTable->addElement(previousStringIndex, status);
354 fStringLengthsTable->addElement(previousStringLength, status);
355 }
356
357 // Construct the compile-time Key and Value tables
358 //
359 // For each key code point, check which mapping tables it applies to,
360 // and create the final data for the key & value structures.
361 //
362 // The four logical mapping tables are conflated into one combined table.
363 // If multiple logical tables have the same mapping for some key, they
364 // share a single entry in the combined table.
365 // If more than one mapping exists for the same key code point, multiple
366 // entries will be created in the table
367
368 for (int32_t range=0; range<fKeySet->getRangeCount(); range++) {
369 // It is an oddity of the UnicodeSet API that simply enumerating the contained
370 // code points requires a nested loop.
371 for (UChar32 keyChar=fKeySet->getRangeStart(range);
372 keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
373 addKeyEntry(keyChar, fSLTable, USPOOF_SL_TABLE_FLAG, status);
374 addKeyEntry(keyChar, fSATable, USPOOF_SA_TABLE_FLAG, status);
375 addKeyEntry(keyChar, fMLTable, USPOOF_ML_TABLE_FLAG, status);
376 addKeyEntry(keyChar, fMATable, USPOOF_MA_TABLE_FLAG, status);
377 }
378 }
379
380 // Put the assembled data into the flat runtime array
381 outputData(status);
382
383 // All of the intermediate allocated data belongs to the ConfusabledataBuilder
384 // object (this), and is deleted in the destructor.
385 return;
386 }
387
388 //
389 // outputData The confusable data has been compiled and stored in intermediate
390 // collections and strings. Copy it from there to the final flat
391 // binary array.
392 //
393 // Note that as each section is added to the output data, the
394 // expand (reserveSpace() function will likely relocate it in memory.
395 // Be careful with pointers.
396 //
397 void ConfusabledataBuilder::outputData(UErrorCode &status) {
398
399 U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned == TRUE);
400
401 // The Key Table
402 // While copying the keys to the runtime array,
403 // also sanity check that they are sorted.
404
405 int32_t numKeys = fKeyVec->size();
406 int32_t *keys =
407 static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status));
408 if (U_FAILURE(status)) {
409 return;
410 }
411 int i;
412 int32_t previousKey = 0;
413 for (i=0; i<numKeys; i++) {
414 int32_t key = fKeyVec->elementAti(i);
415 U_ASSERT((key & 0x00ffffff) >= (previousKey & 0x00ffffff));
416 U_ASSERT((key & 0xff000000) != 0);
417 keys[i] = key;
418 previousKey = key;
419 }
420 SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
421 rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData);
422 rawData->fCFUKeysSize = numKeys;
423 fSpoofImpl->fSpoofData->fCFUKeys = keys;
424
425
426 // The Value Table, parallels the key table
427 int32_t numValues = fValueVec->size();
428 U_ASSERT(numKeys == numValues);
429 uint16_t *values =
430 static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status));
431 if (U_FAILURE(status)) {
432 return;
433 }
434 for (i=0; i<numValues; i++) {
435 uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i));
436 U_ASSERT(value < 0xffff);
437 values[i] = static_cast<uint16_t>(value);
438 }
439 rawData = fSpoofImpl->fSpoofData->fRawData;
440 rawData->fCFUStringIndex = (int32_t)((char *)values - (char *)rawData);
441 rawData->fCFUStringIndexSize = numValues;
442 fSpoofImpl->fSpoofData->fCFUValues = values;
443
444 // The Strings Table.
445
446 uint32_t stringsLength = fStringTable->length();
447 // Reserve an extra space so the string will be nul-terminated. This is
448 // only a convenience, for when debugging; it is not needed otherwise.
449 UChar *strings =
450 static_cast<UChar *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(UChar)+2, status));
451 if (U_FAILURE(status)) {
452 return;
453 }
454 fStringTable->extract(strings, stringsLength+1, status);
455 rawData = fSpoofImpl->fSpoofData->fRawData;
456 U_ASSERT(rawData->fCFUStringTable == 0);
457 rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData);
458 rawData->fCFUStringTableLen = stringsLength;
459 fSpoofImpl->fSpoofData->fCFUStrings = strings;
460
461 // The String Lengths Table
462 // While copying into the runtime array do some sanity checks on the values
463 // Each complete entry contains two fields, an index and an offset.
464 // Lengths should increase with each entry.
465 // Offsets should be less than the size of the string table.
466 int32_t lengthTableLength = fStringLengthsTable->size();
467 uint16_t *stringLengths =
468 static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(lengthTableLength*sizeof(uint16_t), status));
469 if (U_FAILURE(status)) {
470 return;
471 }
472 int32_t destIndex = 0;
473 uint32_t previousLength = 0;
474 for (i=0; i<lengthTableLength; i+=2) {
475 uint32_t offset = static_cast<uint32_t>(fStringLengthsTable->elementAti(i));
476 uint32_t length = static_cast<uint32_t>(fStringLengthsTable->elementAti(i+1));
477 U_ASSERT(offset < stringsLength);
478 U_ASSERT(length < 40);
479 U_ASSERT(length > previousLength);
480 stringLengths[destIndex++] = static_cast<uint16_t>(offset);
481 stringLengths[destIndex++] = static_cast<uint16_t>(length);
482 previousLength = length;
483 }
484 rawData = fSpoofImpl->fSpoofData->fRawData;
485 rawData->fCFUStringLengths = (int32_t)((char *)stringLengths - (char *)rawData);
486 // Note: StringLengthsSize in the raw data is the number of complete entries,
487 // each consisting of a pair of 16 bit values, hence the divide by 2.
488 rawData->fCFUStringLengthsSize = lengthTableLength / 2;
489 fSpoofImpl->fSpoofData->fCFUStringLengths =
490 reinterpret_cast<SpoofStringLengthsElement *>(stringLengths);
491 }
492
493
494
495 // addKeyEntry Construction of the confusable Key and Mapping Values tables.
496 // This is an intermediate point in the building process.
497 // We already have the mappings in the hash tables fSLTable, etc.
498 // This function builds corresponding run-time style table entries into
499 // fKeyVec and fValueVec
500
501 void ConfusabledataBuilder::addKeyEntry(
502 UChar32 keyChar, // The key character
503 UHashtable *table, // The table, one of SATable, MATable, etc.
504 int32_t tableFlag, // One of USPOOF_SA_TABLE_FLAG, etc.
505 UErrorCode &status) {
506
507 SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(table, keyChar));
508 if (targetMapping == NULL) {
509 // No mapping for this key character.
510 // (This function is called for all four tables for each key char that
511 // is seen anywhere, so this no entry cases are very much expected.)
512 return;
513 }
514
515 // Check whether there is already an entry with the correct mapping.
516 // If so, simply set the flag in the keyTable saying that the existing entry
517 // applies to the table that we're doing now.
518
519 UBool keyHasMultipleValues = FALSE;
520 int32_t i;
521 for (i=fKeyVec->size()-1; i>=0 ; i--) {
522 int32_t key = fKeyVec->elementAti(i);
523 if ((key & 0x0ffffff) != keyChar) {
524 // We have now checked all existing key entries for this key char (if any)
525 // without finding one with the same mapping.
526 break;
527 }
528 UnicodeString mapping = getMapping(i);
529 if (mapping == *(targetMapping->fStr)) {
530 // The run time entry we are currently testing has the correct mapping.
531 // Set the flag in it indicating that it applies to the new table also.
532 key |= tableFlag;
533 fKeyVec->setElementAt(key, i);
534 return;
535 }
536 keyHasMultipleValues = TRUE;
537 }
538
539 // Need to add a new entry to the binary data being built for this mapping.
540 // Includes adding entries to both the key table and the parallel values table.
541
542 int32_t newKey = keyChar | tableFlag;
543 if (keyHasMultipleValues) {
544 newKey |= USPOOF_KEY_MULTIPLE_VALUES;
545 }
546 int32_t adjustedMappingLength = targetMapping->fStr->length() - 1;
547 if (adjustedMappingLength>3) {
548 adjustedMappingLength = 3;
549 }
550 newKey |= adjustedMappingLength << USPOOF_KEY_LENGTH_SHIFT;
551
552 int32_t newData = targetMapping->fStrTableIndex;
553
554 fKeyVec->addElement(newKey, status);
555 fValueVec->addElement(newData, status);
556
557 // If the preceding key entry is for the same key character (but with a different mapping)
558 // set the multiple-values flag on it.
559 if (keyHasMultipleValues) {
560 int32_t previousKeyIndex = fKeyVec->size() - 2;
561 int32_t previousKey = fKeyVec->elementAti(previousKeyIndex);
562 previousKey |= USPOOF_KEY_MULTIPLE_VALUES;
563 fKeyVec->setElementAt(previousKey, previousKeyIndex);
564 }
565 }
566
567
568
569 UnicodeString ConfusabledataBuilder::getMapping(int32_t index) {
570 int32_t key = fKeyVec->elementAti(index);
571 int32_t value = fValueVec->elementAti(index);
572 int32_t length = USPOOF_KEY_LENGTH_FIELD(key);
573 int32_t lastIndexWithLen;
574 switch (length) {
575 case 0:
576 return UnicodeString(static_cast<UChar>(value));
577 case 1:
578 case 2:
579 return UnicodeString(*fStringTable, value, length+1);
580 case 3:
581 length = 0;
582 int32_t i;
583 for (i=0; i<fStringLengthsTable->size(); i+=2) {
584 lastIndexWithLen = fStringLengthsTable->elementAti(i);
585 if (value <= lastIndexWithLen) {
586 length = fStringLengthsTable->elementAti(i+1);
587 break;
588 }
589 }
590 U_ASSERT(length>=3);
591 return UnicodeString(*fStringTable, value, length);
592 default:
593 U_ASSERT(FALSE);
594 }
595 return UnicodeString();
596 }
597
598 #endif
599 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
600