icuSources/i18n/uspoof_wsconf.cpp

   1 /*
   2 ******************************************************************************
   3 *
   4 *   Copyright (C) 2008-2012, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 ******************************************************************************
   8 *   file name:  uspoof_wsconf.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2009Jan05  (refactoring earlier files)
  14 *   created by: Andy Heninger
  15 *
  16 *   Internal functions for compiling Whole Script confusable source data
  17 *   into its binary (runtime) form.  The binary data format is described
  18 *   in uspoof_impl.h
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "unicode/uspoof.h"
  23
  24 #if !UCONFIG_NO_NORMALIZATION
  25
  26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  27
  28 #include "unicode/unorm.h"
  29 #include "unicode/uregex.h"
  30 #include "unicode/ustring.h"
  31 #include "cmemory.h"
  32 #include "uspoof_impl.h"
  33 #include "uhash.h"
  34 #include "uvector.h"
  35 #include "uassert.h"
  36 #include "uspoof_wsconf.h"
  37
  38 U_NAMESPACE_USE
  39
  40
  41 // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
  42 // Example Lines:
  43 //   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
  44 //   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
  45 //    |               |     |    |
  46 //    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
  47 //    |               |     |----------Target script.   We need this.
  48 //    |               |----------------Src script.  Should match the script of the source
  49 //    |                                code points.  Beyond checking that, we don't keep it.
  50 //    |--------------------------------Source code points or range.
  51 //
  52 // The expression will match _all_ lines, including erroneous lines.
  53 // The result of the parse is returned via the contents of the (match) groups.
  54 static const char *parseExp =
  55         "(?m)"                                         // Multi-line mode
  56         "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
  57         "|^(?:"                                        //   OR
  58         "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
  59         "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
  60         "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
  61         "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
  62         "[ \\t]*(?:#.*?)?"                             // Trailing commment
  63         ")$|"                                          //   OR
  64         "^(.*?)$";                                     // An error line.      Group 8.
  65                                                        //    Any line not matching the preceding
  66                                                        //    parts of the expression.will match
  67                                                        //    this, and thus be flagged as an error
  68
  69
  70 // Extract a regular expression match group into a char * string.
  71 //    The group must contain only invariant characters.
  72 //    Used for script names
  73 //
  74 static void extractGroup(
  75     URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
  76
  77     UChar ubuf[50];
  78     ubuf[0] = 0;
  79     destBuf[0] = 0;
  80     int32_t len = uregex_group(e, group, ubuf, 50, &status);
  81     if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
  82         return;
  83     }
  84     UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
  85     s.extract(0, len, destBuf, destCapacity, US_INV);
  86 }
  87
  88
  89
  90 U_NAMESPACE_BEGIN
  91
  92 //  Build the Whole Script Confusable data
  93 //
  94 //     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
  95 //                         because everything is local to this one build function anyhow,
  96 //                           OR
  97 //                         break this function into more reasonably sized pieces, with
  98 //                         state in WSConfusableDataBuilder.
  99 //
 100 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
 101           int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
 102 {
 103     if (U_FAILURE(status)) {
 104         return;
 105     }
 106     URegularExpression *parseRegexp = NULL;
 107     int32_t             inputLen    = 0;
 108     UChar              *input       = NULL;
 109     int32_t             lineNum     = 0;
 110
 111     UVector            *scriptSets        = NULL;
 112     uint32_t            rtScriptSetsCount = 2;
 113
 114     UTrie2             *anyCaseTrie   = NULL;
 115     UTrie2             *lowerCaseTrie = NULL;
 116
 117     anyCaseTrie = utrie2_open(0, 0, &status);
 118     lowerCaseTrie = utrie2_open(0, 0, &status);
 119
 120     UnicodeString pattern(parseExp, -1, US_INV);
 121
 122     // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
 123     //
 124     // Reserved TRIE values:
 125     //   0:  Code point has no whole script confusables.
 126     //   1:  Code point is of script Common or Inherited.
 127     //       These code points do not participate in whole script confusable detection.
 128     //       (This is logically equivalent to saying that they contain confusables in
 129     //        all scripts)
 130     //
 131     // Because Trie values are indexes into the ScriptSets vector, pre-fill
 132     // vector positions 0 and 1 to avoid conflicts with the reserved values.
 133
 134     scriptSets = new UVector(status);
 135     if (scriptSets == NULL) {
 136         status = U_MEMORY_ALLOCATION_ERROR;
 137         goto cleanup;
 138     }
 139     scriptSets->addElement((void *)NULL, status);
 140     scriptSets->addElement((void *)NULL, status);
 141
 142     // Convert the user input data from UTF-8 to UChar (UTF-16)
 143     u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
 144     if (status != U_BUFFER_OVERFLOW_ERROR) {
 145         goto cleanup;
 146     }
 147     status = U_ZERO_ERROR;
 148     input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
 149     if (input == NULL) {
 150         status = U_MEMORY_ALLOCATION_ERROR;
 151         goto cleanup;
 152     }
 153     u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
 154
 155     parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
 156
 157     // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
 158     //   given the syntax of the input.
 159     if (*input == 0xfeff) {
 160         *input = 0x20;
 161     }
 162
 163     // Parse the input, one line per iteration of this loop.
 164     uregex_setText(parseRegexp, input, inputLen, &status);
 165     while (uregex_findNext(parseRegexp, &status)) {
 166         lineNum++;
 167         if (uregex_start(parseRegexp, 1, &status) >= 0) {
 168             // this was a blank or comment line.
 169             continue;
 170         }
 171         if (uregex_start(parseRegexp, 8, &status) >= 0) {
 172             // input file syntax error.
 173             status = U_PARSE_ERROR;
 174             goto cleanup;
 175         }
 176         if (U_FAILURE(status)) {
 177             goto cleanup;
 178         }
 179
 180         // Pick up the start and optional range end code points from the parsed line.
 181         UChar32  startCodePoint = SpoofImpl::ScanHex(
 182             input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
 183         UChar32  endCodePoint = startCodePoint;
 184         if (uregex_start(parseRegexp, 3, &status) >=0) {
 185             endCodePoint = SpoofImpl::ScanHex(
 186                 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
 187         }
 188
 189         // Extract the two script names from the source line.  We need these in an 8 bit
 190         //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
 191         //   to the ICU u_getPropertyValueEnum() function.  Ugh.
 192         char  srcScriptName[20];
 193         char  targScriptName[20];
 194         extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
 195         extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
 196         UScriptCode srcScript  =
 197             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
 198         UScriptCode targScript =
 199             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
 200         if (U_FAILURE(status)) {
 201             goto cleanup;
 202         }
 203         if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
 204             status = U_INVALID_FORMAT_ERROR;
 205             goto cleanup;
 206         }
 207
 208         // select the table - (A) any case or (L) lower case only
 209         UTrie2 *table = anyCaseTrie;
 210         if (uregex_start(parseRegexp, 7, &status) >= 0) {
 211             table = lowerCaseTrie;
 212         }
 213
 214         // Build the set of scripts containing confusable characters for
 215         //   the code point(s) specified in this input line.
 216         // Sanity check that the script of the source code point is the same
 217         //   as the source script indicated in the input file.  Failure of this check is
 218         //   an error in the input file.
 219         // Include the source script in the set (needed for Mixed Script Confusable detection).
 220         //
 221         UChar32 cp;
 222         for (cp=startCodePoint; cp<=endCodePoint; cp++) {
 223             int32_t setIndex = utrie2_get32(table, cp);
 224             BuilderScriptSet *bsset = NULL;
 225             if (setIndex > 0) {
 226                 U_ASSERT(setIndex < scriptSets->size());
 227                 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
 228             } else {
 229                 bsset = new BuilderScriptSet();
 230                 if (bsset == NULL) {
 231                     status = U_MEMORY_ALLOCATION_ERROR;
 232                     goto cleanup;
 233                 }
 234                 bsset->codePoint = cp;
 235                 bsset->trie = table;
 236                 bsset->sset = new ScriptSet();
 237                 setIndex = scriptSets->size();
 238                 bsset->index = setIndex;
 239                 bsset->rindex = 0;
 240                 if (bsset->sset == NULL) {
 241                     status = U_MEMORY_ALLOCATION_ERROR;
 242                     goto cleanup;
 243                 }
 244                 scriptSets->addElement(bsset, status);
 245                 utrie2_set32(table, cp, setIndex, &status);
 246             }
 247             bsset->sset->Union(targScript);
 248             bsset->sset->Union(srcScript);
 249
 250             if (U_FAILURE(status)) {
 251                 goto cleanup;
 252             }
 253             UScriptCode cpScript = uscript_getScript(cp, &status);
 254             if (cpScript != srcScript) {
 255                 status = U_INVALID_FORMAT_ERROR;
 256                 goto cleanup;
 257             }
 258         }
 259     }
 260
 261     // Eliminate duplicate script sets.  At this point we have a separate
 262     // script set for every code point that had data in the input file.
 263     //
 264     // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
 265     //
 266     // printf("Number of scriptSets: %d\n", scriptSets->size());
 267     {
 268         int32_t duplicateCount = 0;
 269         rtScriptSetsCount = 2;
 270         for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
 271             BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
 272             if (outerSet->index != static_cast<uint32_t>(outeri)) {
 273                 // This set was already identified as a duplicate.
 274                 //   It will not be allocated a position in the runtime array of ScriptSets.
 275                 continue;
 276             }
 277             outerSet->rindex = rtScriptSetsCount++;
 278             for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
 279                 BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
 280                 if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
 281                     delete innerSet->sset;
 282                     innerSet->scriptSetOwned = FALSE;
 283                     innerSet->sset = outerSet->sset;
 284                     innerSet->index = outeri;
 285                     innerSet->rindex = outerSet->rindex;
 286                     duplicateCount++;
 287                 }
 288                 // But this doesn't get all.  We need to fix the TRIE.
 289             }
 290         }
 291         // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
 292     }
 293
 294
 295
 296     // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
 297     //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
 298     //     are unused, which is why the loop index starts at 2.)
 299     {
 300         for (int32_t i=2; i<scriptSets->size(); i++) {
 301             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
 302             if (bSet->rindex != (uint32_t)i) {
 303                 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
 304             }
 305         }
 306     }
 307
 308     // For code points with script==Common or script==Inherited,
 309     //   Set the reserved value of 1 into both Tries.  These characters do not participate
 310     //   in Whole Script Confusable detection; this reserved value is the means
 311     //   by which they are detected.
 312     {
 313         UnicodeSet ignoreSet;
 314         ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
 315         UnicodeSet inheritedSet;
 316         inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
 317         ignoreSet.addAll(inheritedSet);
 318         for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
 319             UChar32 rangeStart = ignoreSet.getRangeStart(rn);
 320             UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
 321             utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
 322             utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
 323         }
 324     }
 325
 326     // Serialize the data to the Spoof Detector
 327     {
 328         utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
 329         int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
 330         // printf("Any case Trie size: %d\n", size);
 331         if (status != U_BUFFER_OVERFLOW_ERROR) {
 332             goto cleanup;
 333         }
 334         status = U_ZERO_ERROR;
 335         spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
 336         spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
 337         spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
 338         void *where = spImpl->fSpoofData->reserveSpace(size, status);
 339         utrie2_serialize(anyCaseTrie, where, size, &status);
 340
 341         utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
 342         size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
 343         // printf("Lower case Trie size: %d\n", size);
 344         if (status != U_BUFFER_OVERFLOW_ERROR) {
 345             goto cleanup;
 346         }
 347         status = U_ZERO_ERROR;
 348         spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
 349         spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
 350         spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
 351         where = spImpl->fSpoofData->reserveSpace(size, status);
 352         utrie2_serialize(lowerCaseTrie, where, size, &status);
 353
 354         spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
 355         spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
 356         ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
 357             (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
 358         uint32_t rindex = 2;
 359         for (int32_t i=2; i<scriptSets->size(); i++) {
 360             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
 361             if (bSet->rindex < rindex) {
 362                 // We have already copied this script set to the serialized data.
 363                 continue;
 364             }
 365             U_ASSERT(rindex == bSet->rindex);
 366             rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
 367             rindex++;
 368         }
 369     }
 370
 371     // Open new utrie2s from the serialized data.  We don't want to keep the ones
 372     //   we just built because we would then have two copies of the data, one internal to
 373     //   the utries that we have already constructed, and one in the serialized data area.
 374     //   An alternative would be to not pre-serialize the Trie data, but that makes the
 375     //   spoof detector data different, depending on how the detector was constructed.
 376     //   It's simpler to keep the data always the same.
 377
 378     spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
 379             UTRIE2_16_VALUE_BITS,
 380             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
 381             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
 382             NULL,
 383             &status);
 384
 385     spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
 386             UTRIE2_16_VALUE_BITS,
 387             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
 388             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
 389             NULL,
 390             &status);
 391
 392
 393
 394 cleanup:
 395     if (U_FAILURE(status)) {
 396         pe->line = lineNum;
 397     }
 398     uregex_close(parseRegexp);
 399     uprv_free(input);
 400
 401     int32_t i;
 402     if (scriptSets != NULL) {
 403         for (i=0; i<scriptSets->size(); i++) {
 404             BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
 405             delete bsset;
 406         }
 407         delete scriptSets;
 408     }
 409     utrie2_close(anyCaseTrie);
 410     utrie2_close(lowerCaseTrie);
 411     return;
 412 }
 413
 414 U_NAMESPACE_END
 415
 416
 417
 418 BuilderScriptSet::BuilderScriptSet() {
 419     codePoint = -1;
 420     trie = NULL;
 421     sset = NULL;
 422     index = 0;
 423     rindex = 0;
 424     scriptSetOwned = TRUE;
 425 }
 426
 427 BuilderScriptSet::~BuilderScriptSet() {
 428     if (scriptSetOwned) {
 429         delete sset;
 430     }
 431 }
 432
 433 #endif
 434 #endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS
 435