icuSources/i18n/uspoof_wsconf.cpp

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2008-2009, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *   file name:  uspoof_wsconf.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2009Jan05  (refactoring earlier files)
  14 *   created by: Andy Heninger
  15 *
 *   Internal functions for compiling Whole Script confusable source data
  17 *   into its binary (runtime) form.  The binary data format is described
  18 *   in uspoof_impl.h
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "unicode/uspoof.h"
  23
  24 #if !UCONFIG_NO_NORMALIZATION
  25
  26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  27
  28 #include "unicode/unorm.h"
  29 #include "unicode/uregex.h"
  30 #include "unicode/ustring.h"
  31 #include "cmemory.h"
  32 #include "uspoof_impl.h"
  33 #include "uhash.h"
  34 #include "uvector.h"
  35 #include "uassert.h"
  36 #include "uspoof_wsconf.h"
  37
  38 U_NAMESPACE_USE
  39
  40
  41 // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
  42 // Example Lines:
  43 //   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
  44 //   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
  45 //    |               |     |    |
  46 //    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
  47 //    |               |     |----------Target script.   We need this.
  48 //    |               |----------------Src script.  Should match the script of the source
  49 //    |                                code points.  Beyond checking that, we don't keep it.
  50 //    |--------------------------------Source code points or range.
  51 //
  52 // The expression will match _all_ lines, including erroneous lines.
  53 // The result of the parse is returned via the contents of the (match) groups.
  54 static const char *parseExp =
  55
  56         "(?m)"                                         // Multi-line mode
  57         "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
  58         "|^(?:"                                        //   OR
  59         "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
  60         "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
  61         "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
  62         "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
  63         "[ \\t]*(?:#.*?)?"                             // Trailing commment
  64         ")$|"                                          //   OR
  65         "^(.*?)$";                                     // An error line.      Group 8.
  66                                                        //    Any line not matching the preceding
  67                                                        //    parts of the expression.will match
  68                                                        //    this, and thus be flagged as an error
  69
  70
  71 // Extract a regular expression match group into a char * string.
  72 //    The group must contain only invariant characters.
  73 //    Used for script names
  74 //
  75 static void extractGroup(
  76     URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
  77
  78     UChar ubuf[50];
  79     ubuf[0] = 0;
  80     destBuf[0] = 0;
  81     int32_t len = uregex_group(e, group, ubuf, 50, &status);
  82     if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
  83         return;
  84     }
  85     UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
  86     s.extract(0, len, destBuf, destCapacity, US_INV);
  87 }
  88
  89
  90
  91 //  Build the Whole Script Confusable data
  92 //
  93 //     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
  94 //                         because everything is local to this one build function anyhow,
  95 //                           OR
  96 //                         break this function into more reasonably sized pieces, with
  97 //                         state in WSConfusableDataBuilder.
  98 //
  99 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
 100           int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
 101 {
 102     if (U_FAILURE(status)) {
 103         return;
 104     }
 105     URegularExpression *parseRegexp = NULL;
 106     int32_t             inputLen    = 0;
 107     UChar              *input       = NULL;
 108     int32_t             lineNum     = 0;
 109
 110     UVector            *scriptSets        = NULL;
 111     uint32_t            rtScriptSetsCount = 2;
 112
 113     UTrie2             *anyCaseTrie   = NULL;
 114     UTrie2             *lowerCaseTrie = NULL;
 115
 116     anyCaseTrie = utrie2_open(0, 0, &status);
 117     lowerCaseTrie = utrie2_open(0, 0, &status);
 118
 119
 120     // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
 121     //
 122     // Reserved TRIE values:
 123     //   0:  Code point has no whole script confusables.
 124     //   1:  Code point is of script Common or Inherited.
 125     //       These code points do not participate in whole script confusable detection.
 126     //       (This is logically equivalent to saying that they contain confusables in
 127     //        all scripts)
 128     //
 129     // Because Trie values are indexes into the ScriptSets vector, pre-fill
 130     // vector positions 0 and 1 to avoid conflicts with the reserved values.
 131
 132     scriptSets = new UVector(status);
 133     if (scriptSets == NULL) {
 134         status = U_MEMORY_ALLOCATION_ERROR;
 135         goto cleanup;
 136     }
 137     scriptSets->addElement((void *)NULL, status);
 138     scriptSets->addElement((void *)NULL, status);
 139
 140     // Convert the user input data from UTF-8 to UChar (UTF-16)
 141     u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
 142     if (status != U_BUFFER_OVERFLOW_ERROR) {
 143         goto cleanup;
 144     }
 145     status = U_ZERO_ERROR;
 146     input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
 147     if (input == NULL) {
 148         status = U_MEMORY_ALLOCATION_ERROR;
 149         goto cleanup;
 150     }
 151     u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
 152
 153
 154
 155     parseRegexp = uregex_openC(parseExp, 0, NULL, &status);
 156
 157     // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
 158     //   given the syntax of the input.
 159     if (*input == 0xfeff) {
 160         *input = 0x20;
 161     }
 162
 163     // Parse the input, one line per iteration of this loop.
 164     uregex_setText(parseRegexp, input, inputLen, &status);
 165     while (uregex_findNext(parseRegexp, &status)) {
 166         lineNum++;
 167         UChar  line[200];
 168         uregex_group(parseRegexp, 0, line, 200, &status);
 169         if (uregex_start(parseRegexp, 1, &status) >= 0) {
 170             // this was a blank or comment line.
 171             continue;
 172         }
 173         if (uregex_start(parseRegexp, 8, &status) >= 0) {
 174             // input file syntax error.
 175             status = U_PARSE_ERROR;
 176             goto cleanup;
 177         }
 178         if (U_FAILURE(status)) {
 179             goto cleanup;
 180         }
 181
 182         // Pick up the start and optional range end code points from the parsed line.
 183         UChar32  startCodePoint = SpoofImpl::ScanHex(
 184             input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
 185         UChar32  endCodePoint = startCodePoint;
 186         if (uregex_start(parseRegexp, 3, &status) >=0) {
 187             endCodePoint = SpoofImpl::ScanHex(
 188                 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
 189         }
 190
 191         // Extract the two script names from the source line.  We need these in an 8 bit
 192         //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
 193         //   to the ICU u_getPropertyValueEnum() function.  Ugh.
 194         char  srcScriptName[20];
 195         char  targScriptName[20];
 196         extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
 197         extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
 198         UScriptCode srcScript  =
 199             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
 200         UScriptCode targScript =
 201             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
 202         if (U_FAILURE(status)) {
 203             goto cleanup;
 204         }
 205         if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
 206             status = U_INVALID_FORMAT_ERROR;
 207             goto cleanup;
 208         }
 209
 210         // select the table - (A) any case or (L) lower case only
 211         UTrie2 *table = anyCaseTrie;
 212         if (uregex_start(parseRegexp, 7, &status) >= 0) {
 213             table = lowerCaseTrie;
 214         }
 215
 216         // Build the set of scripts containing confusable characters for
 217         //   the code point(s) specified in this input line.
 218         // Sanity check that the script of the source code point is the same
 219         //   as the source script indicated in the input file.  Failure of this check is
 220         //   an error in the input file.
 221         // Include the source script in the set (needed for Mixed Script Confusable detection).
 222         //
 223         UChar32 cp;
 224         for (cp=startCodePoint; cp<=endCodePoint; cp++) {
 225             int32_t setIndex = utrie2_get32(table, cp);
 226             BuilderScriptSet *bsset = NULL;
 227             if (setIndex > 0) {
 228                 U_ASSERT(setIndex < scriptSets->size());
 229                 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
 230             } else {
 231                 bsset = new BuilderScriptSet();
 232                 if (bsset == NULL) {
 233                     status = U_MEMORY_ALLOCATION_ERROR;
 234                     goto cleanup;
 235                 }
 236                 bsset->codePoint = cp;
 237                 bsset->trie = table;
 238                 bsset->sset = new ScriptSet();
 239                 setIndex = scriptSets->size();
 240                 bsset->index = setIndex;
 241                 bsset->rindex = 0;
 242                 if (bsset->sset == NULL) {
 243                     status = U_MEMORY_ALLOCATION_ERROR;
 244                     goto cleanup;
 245                 }
 246                 scriptSets->addElement(bsset, status);
 247                 utrie2_set32(table, cp, setIndex, &status);
 248             }
 249             bsset->sset->Union(targScript);
 250             bsset->sset->Union(srcScript);
 251
 252             if (U_FAILURE(status)) {
 253                 goto cleanup;
 254             }
 255             UScriptCode cpScript = uscript_getScript(cp, &status);
 256             if (cpScript != srcScript) {
 257                 status = U_INVALID_FORMAT_ERROR;
 258                 goto cleanup;
 259             }
 260         }
 261     }
 262
 263     // Eliminate duplicate script sets.  At this point we have a separate
 264     // script set for every code point that had data in the input file.
 265     //
 266     // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
 267     //
 268     // printf("Number of scriptSets: %d\n", scriptSets->size());
 269     {
 270         int32_t duplicateCount = 0;
 271         rtScriptSetsCount = 2;
 272         for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
 273             BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
 274             if (outerSet->index != static_cast<uint32_t>(outeri)) {
 275                 // This set was already identified as a duplicate.
 276                 //   It will not be allocated a position in the runtime array of ScriptSets.
 277                 continue;
 278             }
 279             outerSet->rindex = rtScriptSetsCount++;
 280             for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
 281                 BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
 282                 if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
 283                     delete innerSet->sset;
 284                     innerSet->scriptSetOwned = FALSE;
 285                     innerSet->sset = outerSet->sset;
 286                     innerSet->index = outeri;
 287                     innerSet->rindex = outerSet->rindex;
 288                     duplicateCount++;
 289                 }
 290                 // But this doesn't get all.  We need to fix the TRIE.
 291             }
 292         }
 293         // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
 294     }
 295
 296
 297
 298     // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
 299     //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
 300     //     are unused, which is why the loop index starts at 2.)
 301     {
 302         for (int32_t i=2; i<scriptSets->size(); i++) {
 303             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
 304             if (bSet->rindex != (uint32_t)i) {
 305                 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
 306             }
 307         }
 308     }
 309
 310     // For code points with script==Common or script==Inherited,
 311     //   Set the reserved value of 1 into both Tries.  These characters do not participate
 312     //   in Whole Script Confusable detection; this reserved value is the means
 313     //   by which they are detected.
 314     {
 315         UnicodeSet ignoreSet;
 316         ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
 317         UnicodeSet inheritedSet;
 318         inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
 319         ignoreSet.addAll(inheritedSet);
 320         for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
 321             UChar32 rangeStart = ignoreSet.getRangeStart(rn);
 322             UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
 323             utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
 324             utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
 325         }
 326     }
 327
 328     // Serialize the data to the Spoof Detector
 329     {
 330         utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
 331         int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
 332         // printf("Any case Trie size: %d\n", size);
 333         if (status != U_BUFFER_OVERFLOW_ERROR) {
 334             goto cleanup;
 335         }
 336         status = U_ZERO_ERROR;
 337         spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
 338         spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
 339         spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
 340         void *where = spImpl->fSpoofData->reserveSpace(size, status);
 341         utrie2_serialize(anyCaseTrie, where, size, &status);
 342
 343         utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
 344         size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
 345         // printf("Lower case Trie size: %d\n", size);
 346         if (status != U_BUFFER_OVERFLOW_ERROR) {
 347             goto cleanup;
 348         }
 349         status = U_ZERO_ERROR;
 350         spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
 351         spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
 352         spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
 353         where = spImpl->fSpoofData->reserveSpace(size, status);
 354         utrie2_serialize(lowerCaseTrie, where, size, &status);
 355
 356         spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
 357         spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
 358         ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
 359             (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
 360         uint32_t rindex = 2;
 361         for (int32_t i=2; i<scriptSets->size(); i++) {
 362             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
 363             if (bSet->rindex < rindex) {
 364                 // We have already copied this script set to the serialized data.
 365                 continue;
 366             }
 367             U_ASSERT(rindex == bSet->rindex);
 368             rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
 369             rindex++;
 370         }
 371     }
 372
 373     // Open new utrie2s from the serialized data.  We don't want to keep the ones
 374     //   we just built because we would then have two copies of the data, one internal to
 375     //   the utries that we have already constructed, and one in the serialized data area.
 376     //   An alternative would be to not pre-serialize the Trie data, but that makes the
 377     //   spoof detector data different, depending on how the detector was constructed.
 378     //   It's simpler to keep the data always the same.
 379
 380     spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
 381             UTRIE2_16_VALUE_BITS,
 382             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
 383             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
 384             NULL,
 385             &status);
 386
 387     spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
 388             UTRIE2_16_VALUE_BITS,
 389             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
 390             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
 391             NULL,
 392             &status);
 393
 394
 395
 396 cleanup:
 397     if (U_FAILURE(status)) {
 398         pe->line = lineNum;
 399     }
 400     uregex_close(parseRegexp);
 401     uprv_free(input);
 402
 403     int32_t i;
 404     for (i=0; i<scriptSets->size(); i++) {
 405         BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
 406         delete bsset;
 407     }
 408     delete scriptSets;
 409     utrie2_close(anyCaseTrie);
 410     utrie2_close(lowerCaseTrie);
 411     return;
 412 }
 413
 414
 415
 416
 417
 418 BuilderScriptSet::BuilderScriptSet() {
 419     codePoint = -1;
 420     trie = NULL;
 421     sset = NULL;
 422     index = 0;
 423     rindex = 0;
 424     scriptSetOwned = TRUE;
 425 }
 426
 427 BuilderScriptSet::~BuilderScriptSet() {
 428     if (scriptSetOwned) {
 429         delete sset;
 430     }
 431 }
 432
 433 #endif
 434 #endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS
 435