icuSources/i18n/uspoof_wsconf.cpp

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2008-2013, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *   file name:  uspoof_wsconf.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2009Jan05  (refactoring earlier files)
  14 *   created by: Andy Heninger
  15 *
  16 *   Internal functions for compiling Whole Script confusable source data
  17 *   into its binary (runtime) form.  The binary data format is described
  18 *   in uspoof_impl.h
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "unicode/uspoof.h"
  23
  24 #if !UCONFIG_NO_NORMALIZATION
  25
  26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  27
  28 #include "unicode/unorm.h"
  29 #include "unicode/uregex.h"
  30 #include "unicode/ustring.h"
  31 #include "cmemory.h"
  32 #include "scriptset.h"
  33 #include "uspoof_impl.h"
  34 #include "uhash.h"
  35 #include "uvector.h"
  36 #include "uassert.h"
  37 #include "uspoof_wsconf.h"
  38
  39 U_NAMESPACE_USE
  40
  41
  42 // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
  43 // Example Lines:
  44 //   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
  45 //   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
  46 //    |               |     |    |
  47 //    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
  48 //    |               |     |----------Target script.   We need this.
  49 //    |               |----------------Src script.  Should match the script of the source
  50 //    |                                code points.  Beyond checking that, we don't keep it.
  51 //    |--------------------------------Source code points or range.
  52 //
  53 // The expression will match _all_ lines, including erroneous lines.
  54 // The result of the parse is returned via the contents of the (match) groups.
  55 static const char *parseExp =
  56         "(?m)"                                         // Multi-line mode
  57         "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
  58         "|^(?:"                                        //   OR
  59         "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
  60         "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
  61         "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
  62         "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
  63         "[ \\t]*(?:#.*?)?"                             // Trailing commment
  64         ")$|"                                          //   OR
  65         "^(.*?)$";                                     // An error line.      Group 8.
  66                                                        //    Any line not matching the preceding
  67                                                        //    parts of the expression.will match
  68                                                        //    this, and thus be flagged as an error
  69
  70
  71 // Extract a regular expression match group into a char * string.
  72 //    The group must contain only invariant characters.
  73 //    Used for script names
  74 //
  75 static void extractGroup(
  76     URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
  77
  78     UChar ubuf[50];
  79     ubuf[0] = 0;
  80     destBuf[0] = 0;
  81     int32_t len = uregex_group(e, group, ubuf, 50, &status);
  82     if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
  83         return;
  84     }
  85     UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
  86     s.extract(0, len, destBuf, destCapacity, US_INV);
  87 }
  88
  89
  90
  91 U_NAMESPACE_BEGIN
  92
  93 //  Build the Whole Script Confusable data
  94 //
  95 //     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
  96 //                         because everything is local to this one build function anyhow,
  97 //                           OR
  98 //                         break this function into more reasonably sized pieces, with
  99 //                         state in WSConfusableDataBuilder.
 100 //
 101 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
 102           int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
 103 {
 104     if (U_FAILURE(status)) {
 105         return;
 106     }
 107     URegularExpression *parseRegexp = NULL;
 108     int32_t             inputLen    = 0;
 109     UChar              *input       = NULL;
 110     int32_t             lineNum     = 0;
 111
 112     UVector            *scriptSets        = NULL;
 113     uint32_t            rtScriptSetsCount = 2;
 114
 115     UTrie2             *anyCaseTrie   = NULL;
 116     UTrie2             *lowerCaseTrie = NULL;
 117
 118     anyCaseTrie = utrie2_open(0, 0, &status);
 119     lowerCaseTrie = utrie2_open(0, 0, &status);
 120
 121     UnicodeString pattern(parseExp, -1, US_INV);
 122
 123     // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
 124     //
 125     // Reserved TRIE values:
 126     //   0:  Code point has no whole script confusables.
 127     //   1:  Code point is of script Common or Inherited.
 128     //       These code points do not participate in whole script confusable detection.
 129     //       (This is logically equivalent to saying that they contain confusables in
 130     //        all scripts)
 131     //
 132     // Because Trie values are indexes into the ScriptSets vector, pre-fill
 133     // vector positions 0 and 1 to avoid conflicts with the reserved values.
 134
 135     scriptSets = new UVector(status);
 136     if (scriptSets == NULL) {
 137         status = U_MEMORY_ALLOCATION_ERROR;
 138         goto cleanup;
 139     }
 140     scriptSets->addElement((void *)NULL, status);
 141     scriptSets->addElement((void *)NULL, status);
 142
 143     // Convert the user input data from UTF-8 to UChar (UTF-16)
 144     u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
 145     if (status != U_BUFFER_OVERFLOW_ERROR) {
 146         goto cleanup;
 147     }
 148     status = U_ZERO_ERROR;
 149     input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
 150     if (input == NULL) {
 151         status = U_MEMORY_ALLOCATION_ERROR;
 152         goto cleanup;
 153     }
 154     u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
 155
 156     parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
 157
 158     // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
 159     //   given the syntax of the input.
 160     if (*input == 0xfeff) {
 161         *input = 0x20;
 162     }
 163
 164     // Parse the input, one line per iteration of this loop.
 165     uregex_setText(parseRegexp, input, inputLen, &status);
 166     while (uregex_findNext(parseRegexp, &status)) {
 167         lineNum++;
 168         if (uregex_start(parseRegexp, 1, &status) >= 0) {
 169             // this was a blank or comment line.
 170             continue;
 171         }
 172         if (uregex_start(parseRegexp, 8, &status) >= 0) {
 173             // input file syntax error.
 174             status = U_PARSE_ERROR;
 175             goto cleanup;
 176         }
 177         if (U_FAILURE(status)) {
 178             goto cleanup;
 179         }
 180
 181         // Pick up the start and optional range end code points from the parsed line.
 182         UChar32  startCodePoint = SpoofImpl::ScanHex(
 183             input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
 184         UChar32  endCodePoint = startCodePoint;
 185         if (uregex_start(parseRegexp, 3, &status) >=0) {
 186             endCodePoint = SpoofImpl::ScanHex(
 187                 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
 188         }
 189
 190         // Extract the two script names from the source line.  We need these in an 8 bit
 191         //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
 192         //   to the ICU u_getPropertyValueEnum() function.  Ugh.
 193         char  srcScriptName[20];
 194         char  targScriptName[20];
 195         extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
 196         extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
 197         UScriptCode srcScript  =
 198             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
 199         UScriptCode targScript =
 200             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
 201         if (U_FAILURE(status)) {
 202             goto cleanup;
 203         }
 204         if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
 205             status = U_INVALID_FORMAT_ERROR;
 206             goto cleanup;
 207         }
 208
 209         // select the table - (A) any case or (L) lower case only
 210         UTrie2 *table = anyCaseTrie;
 211         if (uregex_start(parseRegexp, 7, &status) >= 0) {
 212             table = lowerCaseTrie;
 213         }
 214
 215         // Build the set of scripts containing confusable characters for
 216         //   the code point(s) specified in this input line.
 217         // Sanity check that the script of the source code point is the same
 218         //   as the source script indicated in the input file.  Failure of this check is
 219         //   an error in the input file.
 220         // Include the source script in the set (needed for Mixed Script Confusable detection).
 221         //
 222         UChar32 cp;
 223         for (cp=startCodePoint; cp<=endCodePoint; cp++) {
 224             int32_t setIndex = utrie2_get32(table, cp);
 225             BuilderScriptSet *bsset = NULL;
 226             if (setIndex > 0) {
 227                 U_ASSERT(setIndex < scriptSets->size());
 228                 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
 229             } else {
 230                 bsset = new BuilderScriptSet();
 231                 if (bsset == NULL) {
 232                     status = U_MEMORY_ALLOCATION_ERROR;
 233                     goto cleanup;
 234                 }
 235                 bsset->codePoint = cp;
 236                 bsset->trie = table;
 237                 bsset->sset = new ScriptSet();
 238                 setIndex = scriptSets->size();
 239                 bsset->index = setIndex;
 240                 bsset->rindex = 0;
 241                 if (bsset->sset == NULL) {
 242                     status = U_MEMORY_ALLOCATION_ERROR;
 243                     goto cleanup;
 244                 }
 245                 scriptSets->addElement(bsset, status);
 246                 utrie2_set32(table, cp, setIndex, &status);
 247             }
 248             bsset->sset->set(targScript, status);
 249             bsset->sset->set(srcScript, status);
 250
 251             if (U_FAILURE(status)) {
 252                 goto cleanup;
 253             }
 254             UScriptCode cpScript = uscript_getScript(cp, &status);
 255             if (cpScript != srcScript) {
 256                 status = U_INVALID_FORMAT_ERROR;
 257                 goto cleanup;
 258             }
 259         }
 260     }
 261
 262     // Eliminate duplicate script sets.  At this point we have a separate
 263     // script set for every code point that had data in the input file.
 264     //
 265     // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
 266     //
 267     // printf("Number of scriptSets: %d\n", scriptSets->size());
 268     {
 269         int32_t duplicateCount = 0;
 270         rtScriptSetsCount = 2;
 271         for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
 272             BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
 273             if (outerSet->index != static_cast<uint32_t>(outeri)) {
 274                 // This set was already identified as a duplicate.
 275                 //   It will not be allocated a position in the runtime array of ScriptSets.
 276                 continue;
 277             }
 278             outerSet->rindex = rtScriptSetsCount++;
 279             for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
 280                 BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
 281                 if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
 282                     delete innerSet->sset;
 283                     innerSet->scriptSetOwned = FALSE;
 284                     innerSet->sset = outerSet->sset;
 285                     innerSet->index = outeri;
 286                     innerSet->rindex = outerSet->rindex;
 287                     duplicateCount++;
 288                 }
 289                 // But this doesn't get all.  We need to fix the TRIE.
 290             }
 291         }
 292         // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
 293     }
 294
 295
 296
 297     // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
 298     //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
 299     //     are unused, which is why the loop index starts at 2.)
 300     {
 301         for (int32_t i=2; i<scriptSets->size(); i++) {
 302             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
 303             if (bSet->rindex != (uint32_t)i) {
 304                 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
 305             }
 306         }
 307     }
 308
 309     // For code points with script==Common or script==Inherited,
 310     //   Set the reserved value of 1 into both Tries.  These characters do not participate
 311     //   in Whole Script Confusable detection; this reserved value is the means
 312     //   by which they are detected.
 313     {
 314         UnicodeSet ignoreSet;
 315         ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
 316         UnicodeSet inheritedSet;
 317         inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
 318         ignoreSet.addAll(inheritedSet);
 319         for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
 320             UChar32 rangeStart = ignoreSet.getRangeStart(rn);
 321             UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
 322             utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
 323             utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
 324         }
 325     }
 326
 327     // Serialize the data to the Spoof Detector
 328     {
 329         utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
 330         int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
 331         // printf("Any case Trie size: %d\n", size);
 332         if (status != U_BUFFER_OVERFLOW_ERROR) {
 333             goto cleanup;
 334         }
 335         status = U_ZERO_ERROR;
 336         spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
 337         spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
 338         spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
 339         void *where = spImpl->fSpoofData->reserveSpace(size, status);
 340         utrie2_serialize(anyCaseTrie, where, size, &status);
 341
 342         utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
 343         size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
 344         // printf("Lower case Trie size: %d\n", size);
 345         if (status != U_BUFFER_OVERFLOW_ERROR) {
 346             goto cleanup;
 347         }
 348         status = U_ZERO_ERROR;
 349         spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
 350         spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
 351         spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
 352         where = spImpl->fSpoofData->reserveSpace(size, status);
 353         utrie2_serialize(lowerCaseTrie, where, size, &status);
 354
 355         spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
 356         spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
 357         ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
 358             (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
 359         uint32_t rindex = 2;
 360         for (int32_t i=2; i<scriptSets->size(); i++) {
 361             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
 362             if (bSet->rindex < rindex) {
 363                 // We have already copied this script set to the serialized data.
 364                 continue;
 365             }
 366             U_ASSERT(rindex == bSet->rindex);
 367             rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
 368             rindex++;
 369         }
 370     }
 371
 372     // Open new utrie2s from the serialized data.  We don't want to keep the ones
 373     //   we just built because we would then have two copies of the data, one internal to
 374     //   the utries that we have already constructed, and one in the serialized data area.
 375     //   An alternative would be to not pre-serialize the Trie data, but that makes the
 376     //   spoof detector data different, depending on how the detector was constructed.
 377     //   It's simpler to keep the data always the same.
 378
 379     spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
 380             UTRIE2_16_VALUE_BITS,
 381             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
 382             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
 383             NULL,
 384             &status);
 385
 386     spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
 387             UTRIE2_16_VALUE_BITS,
 388             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
 389             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
 390             NULL,
 391             &status);
 392
 393
 394
 395 cleanup:
 396     if (U_FAILURE(status)) {
 397         pe->line = lineNum;
 398     }
 399     uregex_close(parseRegexp);
 400     uprv_free(input);
 401
 402     int32_t i;
 403     if (scriptSets != NULL) {
 404         for (i=0; i<scriptSets->size(); i++) {
 405             BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
 406             delete bsset;
 407         }
 408         delete scriptSets;
 409     }
 410     utrie2_close(anyCaseTrie);
 411     utrie2_close(lowerCaseTrie);
 412     return;
 413 }
 414
 415 U_NAMESPACE_END
 416
 417
 418
 419 BuilderScriptSet::BuilderScriptSet() {
 420     codePoint = -1;
 421     trie = NULL;
 422     sset = NULL;
 423     index = 0;
 424     rindex = 0;
 425     scriptSetOwned = TRUE;
 426 }
 427
 428 BuilderScriptSet::~BuilderScriptSet() {
 429     if (scriptSetOwned) {
 430         delete sset;
 431     }
 432 }
 433
 434 #endif
 435 #endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS
 436