[apple/icu.git] / icuSources / i18n / uspoof_wsconf.cpp

/*
******************************************************************************
*
*   Copyright (C) 2008-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  uspoof_wsconf.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009Jan05  (refactoring earlier files)
*   created by: Andy Heninger
*
*   Internal functions for compiling Whole Script confusable source data
*   into its binary (runtime) form.  The binary data format is described
*   in uspoof_impl.h
*/

#include "unicode/utypes.h"
#include "unicode/uspoof.h"

#if !UCONFIG_NO_NORMALIZATION

#if !UCONFIG_NO_REGULAR_EXPRESSIONS 

#include "unicode/unorm.h"
#include "unicode/uregex.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "uspoof_impl.h"
#include "uhash.h"
#include "uvector.h"
#include "uassert.h"
#include "uspoof_wsconf.h"

U_NAMESPACE_USE


// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
// Example Lines:
//   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
//   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
//    |               |     |    |
//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
//    |               |     |----------Target script.   We need this.
//    |               |----------------Src script.  Should match the script of the source
//    |                                code points.  Beyond checking that, we don't keep it.
//    |--------------------------------Source code points or range.
//
// The expression will match _all_ lines, including erroneous lines.
// The result of the parse is returned via the contents of the (match) groups.
static const char *parseExp = 
        "(?m)"                                         // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
        "|^(?:"                                        //   OR
        "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                             // Trailing commment
        ")$|"                                          //   OR
        "^(.*?)$";                                     // An error line.      Group 8.
                                                       //    Any line not matching the preceding
                                                       //    parts of the expression.will match
                                                       //    this, and thus be flagged as an error


// Extract a regular expression match group into a char * string.
//    The group must contain only invariant characters.
//    Used for script names
// 
static void extractGroup(
    URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {

    UChar ubuf[50];
    ubuf[0] = 0;
    destBuf[0] = 0;
    int32_t len = uregex_group(e, group, ubuf, 50, &status);
    if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
        return;
    }
    UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
    s.extract(0, len, destBuf, destCapacity, US_INV);
}


U_NAMESPACE_BEGIN

//  Build the Whole Script Confusable data
//
//     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
//                         because everything is local to this one build function anyhow,
//                           OR
//                         break this function into more reasonably sized pieces, with
//                         state in WSConfusableDataBuilder.
//
void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) 
{
    if (U_FAILURE(status)) {
        return;
    }
    URegularExpression *parseRegexp = NULL;
    int32_t             inputLen    = 0;
    UChar              *input       = NULL;
    int32_t             lineNum     = 0;
    
    UVector            *scriptSets        = NULL;
    uint32_t            rtScriptSetsCount = 2;

    UTrie2             *anyCaseTrie   = NULL;
    UTrie2             *lowerCaseTrie = NULL;

    anyCaseTrie = utrie2_open(0, 0, &status);
    lowerCaseTrie = utrie2_open(0, 0, &status);

    UnicodeString pattern(parseExp, -1, US_INV);

    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
    //
    // Reserved TRIE values:
    //   0:  Code point has no whole script confusables.
    //   1:  Code point is of script Common or Inherited.
    //       These code points do not participate in whole script confusable detection.
    //       (This is logically equivalent to saying that they contain confusables in
    //        all scripts)
    //
    // Because Trie values are indexes into the ScriptSets vector, pre-fill
    // vector positions 0 and 1 to avoid conflicts with the reserved values.
    
    scriptSets = new UVector(status);
    if (scriptSets == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    scriptSets->addElement((void *)NULL, status);
    scriptSets->addElement((void *)NULL, status);

    // Convert the user input data from UTF-8 to UChar (UTF-16)
    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        goto cleanup;
    }
    status = U_ZERO_ERROR;
    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
    if (input == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);

    parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);

    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
    //   given the syntax of the input.
    if (*input == 0xfeff) {
        *input = 0x20;
    }

    // Parse the input, one line per iteration of this loop.
    uregex_setText(parseRegexp, input, inputLen, &status);
    while (uregex_findNext(parseRegexp, &status)) {
        lineNum++;
        if (uregex_start(parseRegexp, 1, &status) >= 0) {
            // this was a blank or comment line.
            continue;
        }
        if (uregex_start(parseRegexp, 8, &status) >= 0) {
            // input file syntax error.
            status = U_PARSE_ERROR;
            goto cleanup;
        }
        if (U_FAILURE(status)) {
            goto cleanup;
        }

        // Pick up the start and optional range end code points from the parsed line.
        UChar32  startCodePoint = SpoofImpl::ScanHex(
            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
        UChar32  endCodePoint = startCodePoint;
        if (uregex_start(parseRegexp, 3, &status) >=0) {
            endCodePoint = SpoofImpl::ScanHex(
                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
        }

        // Extract the two script names from the source line.  We need these in an 8 bit
        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
        //   to the ICU u_getPropertyValueEnum() function.  Ugh.
        char  srcScriptName[20];
        char  targScriptName[20];
        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
        UScriptCode srcScript  =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
        UScriptCode targScript =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
        if (U_FAILURE(status)) {
            goto cleanup;
        }
        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
            status = U_INVALID_FORMAT_ERROR;
            goto cleanup;
        }

        // select the table - (A) any case or (L) lower case only
        UTrie2 *table = anyCaseTrie;
        if (uregex_start(parseRegexp, 7, &status) >= 0) {
            table = lowerCaseTrie;
        }

        // Build the set of scripts containing confusable characters for
        //   the code point(s) specified in this input line.
        // Sanity check that the script of the source code point is the same
        //   as the source script indicated in the input file.  Failure of this check is
        //   an error in the input file.
        // Include the source script in the set (needed for Mixed Script Confusable detection).
        //
        UChar32 cp;
        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
            int32_t setIndex = utrie2_get32(table, cp);
            BuilderScriptSet *bsset = NULL;
            if (setIndex > 0) {
                U_ASSERT(setIndex < scriptSets->size());
                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
            } else {
                bsset = new BuilderScriptSet();
                if (bsset == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                bsset->codePoint = cp;
                bsset->trie = table;
                bsset->sset = new ScriptSet();
                setIndex = scriptSets->size();
                bsset->index = setIndex;
                bsset->rindex = 0;
                if (bsset->sset == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                scriptSets->addElement(bsset, status);
                utrie2_set32(table, cp, setIndex, &status);
            }
            bsset->sset->Union(targScript);
            bsset->sset->Union(srcScript);

            if (U_FAILURE(status)) {
                goto cleanup;
            }
            UScriptCode cpScript = uscript_getScript(cp, &status);
            if (cpScript != srcScript) {
                status = U_INVALID_FORMAT_ERROR;
                goto cleanup;
            }
        }
    }

    // Eliminate duplicate script sets.  At this point we have a separate
    // script set for every code point that had data in the input file.
    //
    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
    //
    // printf("Number of scriptSets: %d\n", scriptSets->size());
    {
        int32_t duplicateCount = 0;
        rtScriptSetsCount = 2;
        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
            if (outerSet->index != static_cast<uint32_t>(outeri)) {
                // This set was already identified as a duplicate.
                //   It will not be allocated a position in the runtime array of ScriptSets.
                continue;
            }
            outerSet->rindex = rtScriptSetsCount++;
            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
                    delete innerSet->sset;
                    innerSet->scriptSetOwned = FALSE;
                    innerSet->sset = outerSet->sset;
                    innerSet->index = outeri;
                    innerSet->rindex = outerSet->rindex;
                    duplicateCount++;
                }
                // But this doesn't get all.  We need to fix the TRIE.
            }
        }
        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
    }

    
    // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
    //     are unused, which is why the loop index starts at 2.)
    {
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex != (uint32_t)i) {
                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
            }
        }
    }

    // For code points with script==Common or script==Inherited,
    //   Set the reserved value of 1 into both Tries.  These characters do not participate
    //   in Whole Script Confusable detection; this reserved value is the means
    //   by which they are detected.
    {
        UnicodeSet ignoreSet;
        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
        UnicodeSet inheritedSet;
        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
        ignoreSet.addAll(inheritedSet);
        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
            UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
            utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
        }
    }

    // Serialize the data to the Spoof Detector
    {
        utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
        // printf("Any case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
        void *where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(anyCaseTrie, where, size, &status);
        
        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
        // printf("Lower case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
        where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(lowerCaseTrie, where, size, &status);

        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
        ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
        uint32_t rindex = 2;
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex < rindex) {
                // We have already copied this script set to the serialized data.
                continue;
            }
            U_ASSERT(rindex == bSet->rindex);
            rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
            rindex++;
        }
    }

    // Open new utrie2s from the serialized data.  We don't want to keep the ones
    //   we just built because we would then have two copies of the data, one internal to
    //   the utries that we have already constructed, and one in the serialized data area.
    //   An alternative would be to not pre-serialize the Trie data, but that makes the
    //   spoof detector data different, depending on how the detector was constructed.
    //   It's simpler to keep the data always the same.
    
    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
            UTRIE2_16_VALUE_BITS,
            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
            NULL,
            &status);

    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
            UTRIE2_16_VALUE_BITS,
            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
            NULL,
            &status);

    
cleanup:
    if (U_FAILURE(status)) {
        pe->line = lineNum;
    }
    uregex_close(parseRegexp);
    uprv_free(input);

    int32_t i;
    if (scriptSets != NULL) {
        for (i=0; i<scriptSets->size(); i++) {
            BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            delete bsset;
        }
        delete scriptSets;
    }
    utrie2_close(anyCaseTrie);
    utrie2_close(lowerCaseTrie);
    return;
}

U_NAMESPACE_END


BuilderScriptSet::BuilderScriptSet() {
    codePoint = -1;
    trie = NULL;
    sset = NULL;
    index = 0;
    rindex = 0;
    scriptSetOwned = TRUE;
}

BuilderScriptSet::~BuilderScriptSet() {
    if (scriptSetOwned) {
        delete sset;
    }
}

#endif
#endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS
Commit	Line	Data
729e4ab9 A	1	/*
	2	******************************************************************************
	3	*
4388f060	4	* Copyright (C) 2008-2012, International Business Machines
729e4ab9 A	5	* Corporation and others. All Rights Reserved.
	6	*
	7	******************************************************************************
	8	* file name: uspoof_wsconf.cpp
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2009Jan05 (refactoring earlier files)
	14	* created by: Andy Heninger
	15	*
	16	* Internal functions for compililing Whole Script confusable source data
	17	* into its binary (runtime) form. The binary data format is described
	18	* in uspoof_impl.h
	19	*/
	20
	21	#include "unicode/utypes.h"
	22	#include "unicode/uspoof.h"
	23
	24	#if !UCONFIG_NO_NORMALIZATION
	25
	26	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
	27
	28	#include "unicode/unorm.h"
	29	#include "unicode/uregex.h"
	30	#include "unicode/ustring.h"
	31	#include "cmemory.h"
	32	#include "uspoof_impl.h"
	33	#include "uhash.h"
	34	#include "uvector.h"
	35	#include "uassert.h"
	36	#include "uspoof_wsconf.h"
	37
	38	U_NAMESPACE_USE
	39
	40
	41	// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
	42	// Example Lines:
	43	// 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O
	44	// 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
	45	// \| \| \| \|
	46	// \| \| \| \|---- Which table, Any Case or Lower Case (A or L)
	47	// \| \| \|----------Target script. We need this.
	48	// \| \|----------------Src script. Should match the script of the source
	49	// \| code points. Beyond checking that, we don't keep it.
	50	// \|--------------------------------Source code points or range.
	51	//
	52	// The expression will match _all_ lines, including erroneous lines.
	53	// The result of the parse is returned via the contents of the (match) groups.
	54	static const char *parseExp =
729e4ab9 A	55	"(?m)" // Multi-line mode
	56	"^([ \\t](?:#.?)?)$" // A blank or comment line. Matches Group 1.
	57	"\|^(?:" // OR
	58	"\\s([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s;" // Code point range. Groups 2 and 3.
	59	"\\s([A-Za-z]+)\\s;" // The source script. Group 4.
	60	"\\s([A-Za-z]+)\\s;" // The target script. Group 5.
	61	"\\s*(?:(A)\|(L))" // The table A or L. Group 6 or 7
	62	"[ \\t](?:#.?)?" // Trailing commment
	63	")$\|" // OR
	64	"^(.*?)$"; // An error line. Group 8.
	65	// Any line not matching the preceding
	66	// parts of the expression.will match
	67	// this, and thus be flagged as an error
	68
	69
	70	// Extract a regular expression match group into a char * string.
	71	// The group must contain only invariant characters.
	72	// Used for script names
	73	//
	74	static void extractGroup(
	75	URegularExpression e, int32_t group, char destBuf, int32_t destCapacity, UErrorCode &status) {
	76
	77	UChar ubuf[50];
	78	ubuf[0] = 0;
	79	destBuf[0] = 0;
	80	int32_t len = uregex_group(e, group, ubuf, 50, &status);
	81	if (U_FAILURE(status) \|\| len == -1 \|\| len >= destCapacity) {
	82	return;
	83	}
	84	UnicodeString s(FALSE, ubuf, len); // Aliasing constructor
	85	s.extract(0, len, destBuf, destCapacity, US_INV);
	86	}
	87
	88
	89
4388f060 A	90	U_NAMESPACE_BEGIN
4388f060 A	91
729e4ab9 A	92	// Build the Whole Script Confusable data
	93	//
	94	// TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class,
	95	// because everything is local to this one build function anyhow,
	96	// OR
	97	// break this function into more reasonably sized pieces, with
	98	// state in WSConfusableDataBuilder.
	99	//
	100	void buildWSConfusableData(SpoofImpl spImpl, const char confusablesWS,
	101	int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
	102	{
	103	if (U_FAILURE(status)) {
	104	return;
	105	}
	106	URegularExpression *parseRegexp = NULL;
	107	int32_t inputLen = 0;
	108	UChar *input = NULL;
	109	int32_t lineNum = 0;
	110
	111	UVector *scriptSets = NULL;
	112	uint32_t rtScriptSetsCount = 2;
	113
	114	UTrie2 *anyCaseTrie = NULL;
	115	UTrie2 *lowerCaseTrie = NULL;
	116
	117	anyCaseTrie = utrie2_open(0, 0, &status);
	118	lowerCaseTrie = utrie2_open(0, 0, &status);
4388f060 A	119
4388f060 A	120	UnicodeString pattern(parseExp, -1, US_INV);
729e4ab9 A	121
	122	// The scriptSets vector provides a mapping from TRIE values to the set of scripts.
	123	//
	124	// Reserved TRIE values:
	125	// 0: Code point has no whole script confusables.
	126	// 1: Code point is of script Common or Inherited.
	127	// These code points do not participate in whole script confusable detection.
	128	// (This is logically equivalent to saying that they contain confusables in
	129	// all scripts)
	130	//
	131	// Because Trie values are indexes into the ScriptSets vector, pre-fill
	132	// vector positions 0 and 1 to avoid conflicts with the reserved values.
	133
	134	scriptSets = new UVector(status);
	135	if (scriptSets == NULL) {
	136	status = U_MEMORY_ALLOCATION_ERROR;
	137	goto cleanup;
	138	}
	139	scriptSets->addElement((void *)NULL, status);
	140	scriptSets->addElement((void *)NULL, status);
	141
	142	// Convert the user input data from UTF-8 to UChar (UTF-16)
	143	u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
	144	if (status != U_BUFFER_OVERFLOW_ERROR) {
	145	goto cleanup;
	146	}
	147	status = U_ZERO_ERROR;
	148	input = static_cast<UChar >(uprv_malloc((inputLen+1) sizeof(UChar)));
	149	if (input == NULL) {
	150	status = U_MEMORY_ALLOCATION_ERROR;
	151	goto cleanup;
	152	}
	153	u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
	154
4388f060	155	parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
729e4ab9	156
729e4ab9 A	157	// Zap any Byte Order Mark at the start of input. Changing it to a space is benign
	158	// given the syntax of the input.
	159	if (*input == 0xfeff) {
	160	*input = 0x20;
	161	}
	162
	163	// Parse the input, one line per iteration of this loop.
	164	uregex_setText(parseRegexp, input, inputLen, &status);
	165	while (uregex_findNext(parseRegexp, &status)) {
	166	lineNum++;
729e4ab9 A	167	if (uregex_start(parseRegexp, 1, &status) >= 0) {
	168	// this was a blank or comment line.
	169	continue;
	170	}
	171	if (uregex_start(parseRegexp, 8, &status) >= 0) {
	172	// input file syntax error.
	173	status = U_PARSE_ERROR;
	174	goto cleanup;
	175	}
	176	if (U_FAILURE(status)) {
	177	goto cleanup;
	178	}
	179
	180	// Pick up the start and optional range end code points from the parsed line.
	181	UChar32 startCodePoint = SpoofImpl::ScanHex(
	182	input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
	183	UChar32 endCodePoint = startCodePoint;
	184	if (uregex_start(parseRegexp, 3, &status) >=0) {
	185	endCodePoint = SpoofImpl::ScanHex(
	186	input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
	187	}
	188
	189	// Extract the two script names from the source line. We need these in an 8 bit
	190	// default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
	191	// to the ICU u_getPropertyValueEnum() function. Ugh.
	192	char srcScriptName[20];
	193	char targScriptName[20];
	194	extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
	195	extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
	196	UScriptCode srcScript =
	197	static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
	198	UScriptCode targScript =
	199	static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
	200	if (U_FAILURE(status)) {
	201	goto cleanup;
	202	}
	203	if (srcScript == USCRIPT_INVALID_CODE \|\| targScript == USCRIPT_INVALID_CODE) {
	204	status = U_INVALID_FORMAT_ERROR;
	205	goto cleanup;
	206	}
	207
	208	// select the table - (A) any case or (L) lower case only
	209	UTrie2 *table = anyCaseTrie;
	210	if (uregex_start(parseRegexp, 7, &status) >= 0) {
	211	table = lowerCaseTrie;
	212	}
	213
	214	// Build the set of scripts containing confusable characters for
	215	// the code point(s) specified in this input line.
	216	// Sanity check that the script of the source code point is the same
	217	// as the source script indicated in the input file. Failure of this check is
	218	// an error in the input file.
	219	// Include the source script in the set (needed for Mixed Script Confusable detection).
	220	//
	221	UChar32 cp;
	222	for (cp=startCodePoint; cp<=endCodePoint; cp++) {
	223	int32_t setIndex = utrie2_get32(table, cp);
	224	BuilderScriptSet *bsset = NULL;
	225	if (setIndex > 0) {
	226	U_ASSERT(setIndex < scriptSets->size());
	227	bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
	228	} else {
	229	bsset = new BuilderScriptSet();
	230	if (bsset == NULL) {
231	status = U_MEMORY_ALLOCATION_ERROR;
232	goto cleanup;
233	}
234	bsset->codePoint = cp;
235	bsset->trie = table;
236	bsset->sset = new ScriptSet();
237	setIndex = scriptSets->size();
238	bsset->index = setIndex;
239	bsset->rindex = 0;
240	if (bsset->sset == NULL) {
241	status = U_MEMORY_ALLOCATION_ERROR;
242	goto cleanup;
243	}
244	scriptSets->addElement(bsset, status);
245	utrie2_set32(table, cp, setIndex, &status);
246	}
247	bsset->sset->Union(targScript);
248	bsset->sset->Union(srcScript);
249
250	if (U_FAILURE(status)) {
251	goto cleanup;
252	}
253	UScriptCode cpScript = uscript_getScript(cp, &status);
254	if (cpScript != srcScript) {
255	status = U_INVALID_FORMAT_ERROR;
256	goto cleanup;
257	}
258	}
259	}
260
261	// Eliminate duplicate script sets. At this point we have a separate
262	// script set for every code point that had data in the input file.
263	//
264	// We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
265	//
266	// printf("Number of scriptSets: %d\n", scriptSets->size());
267	{
268	int32_t duplicateCount = 0;
269	rtScriptSetsCount = 2;
270	for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
271	BuilderScriptSet outerSet = static_cast<BuilderScriptSet >(scriptSets->elementAt(outeri));
272	if (outerSet->index != static_cast<uint32_t>(outeri)) {
273	// This set was already identified as a duplicate.
274	// It will not be allocated a position in the runtime array of ScriptSets.
275	continue;
276	}
277	outerSet->rindex = rtScriptSetsCount++;
278	for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
279	BuilderScriptSet innerSet = static_cast<BuilderScriptSet >(scriptSets->elementAt(inneri));
280	if ((outerSet->sset) == (innerSet->sset) && outerSet->sset != innerSet->sset) {
281	delete innerSet->sset;
282	innerSet->scriptSetOwned = FALSE;
283	innerSet->sset = outerSet->sset;
284	innerSet->index = outeri;
285	innerSet->rindex = outerSet->rindex;
286	duplicateCount++;
287	}
288	// But this doesn't get all. We need to fix the TRIE.
289	}
290	}
291	// printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
292	}
293
294
295
296	// Update the Trie values to be reflect the run time script indexes (after duplicate merging).
297	// (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
298	// are unused, which is why the loop index starts at 2.)
299	{
300	for (int32_t i=2; i<scriptSets->size(); i++) {
301	BuilderScriptSet bSet = static_cast<BuilderScriptSet >(scriptSets->elementAt(i));
302	if (bSet->rindex != (uint32_t)i) {
303	utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
304	}
305	}
306	}
307
308	// For code points with script==Common or script==Inherited,
309	// Set the reserved value of 1 into both Tries. These characters do not participate
310	// in Whole Script Confusable detection; this reserved value is the means
311	// by which they are detected.
312	{
313	UnicodeSet ignoreSet;
314	ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
315	UnicodeSet inheritedSet;
316	inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
317	ignoreSet.addAll(inheritedSet);
318	for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
319	UChar32 rangeStart = ignoreSet.getRangeStart(rn);
320	UChar32 rangeEnd = ignoreSet.getRangeEnd(rn);
321	utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
322	utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
323	}
324	}
325
326	// Serialize the data to the Spoof Detector
327	{
328	utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status);
329	int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
330	// printf("Any case Trie size: %d\n", size);
331	if (status != U_BUFFER_OVERFLOW_ERROR) {
332	goto cleanup;
333	}
334	status = U_ZERO_ERROR;
335	spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
336	spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
337	spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
338	void *where = spImpl->fSpoofData->reserveSpace(size, status);
339	utrie2_serialize(anyCaseTrie, where, size, &status);
340
341	utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
342	size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
343	// printf("Lower case Trie size: %d\n", size);
344	if (status != U_BUFFER_OVERFLOW_ERROR) {
345	goto cleanup;
346	}
347	status = U_ZERO_ERROR;
348	spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
349	spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
350	spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
351	where = spImpl->fSpoofData->reserveSpace(size, status);
352	utrie2_serialize(lowerCaseTrie, where, size, &status);
353
354	spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
355	spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
356	ScriptSet rtScriptSets = static_cast<ScriptSet >
357	(spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
358	uint32_t rindex = 2;
359	for (int32_t i=2; i<scriptSets->size(); i++) {
360	BuilderScriptSet bSet = static_cast<BuilderScriptSet >(scriptSets->elementAt(i));
361	if (bSet->rindex < rindex) {
362	// We have already copied this script set to the serialized data.
363	continue;
364	}
365	U_ASSERT(rindex == bSet->rindex);
366	rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits.
367	rindex++;
368	}
369	}
370
371	// Open new utrie2s from the serialized data. We don't want to keep the ones
372	// we just built because we would then have two copies of the data, one internal to
373	// the utries that we have already constructed, and one in the serialized data area.
374	// An alternative would be to not pre-serialize the Trie data, but that makes the
375	// spoof detector data different, depending on how the detector was constructed.
376	// It's simpler to keep the data always the same.
377
378	spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
379	UTRIE2_16_VALUE_BITS,
380	(const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
381	spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
382	NULL,
383	&status);
384
385	spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
386	UTRIE2_16_VALUE_BITS,
387	(const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
388	spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
389	NULL,
390	&status);
391
392
393
394	cleanup:
395	if (U_FAILURE(status)) {
396	pe->line = lineNum;
397	}
398	uregex_close(parseRegexp);
399	uprv_free(input);
400
401	int32_t i;
4388f060 A	402	if (scriptSets != NULL) {
	403	for (i=0; i<scriptSets->size(); i++) {
	404	BuilderScriptSet bsset = static_cast<BuilderScriptSet >(scriptSets->elementAt(i));
	405	delete bsset;
	406	}
	407	delete scriptSets;
729e4ab9	408	}
729e4ab9 A	409	utrie2_close(anyCaseTrie);
	410	utrie2_close(lowerCaseTrie);
	411	return;
	412	}
	413
4388f060	414	U_NAMESPACE_END
729e4ab9 A	415
	416
	417
	418	BuilderScriptSet::BuilderScriptSet() {
	419	codePoint = -1;
	420	trie = NULL;
	421	sset = NULL;
	422	index = 0;
	423	rindex = 0;
	424	scriptSetOwned = TRUE;
	425	}
	426
	427	BuilderScriptSet::~BuilderScriptSet() {
	428	if (scriptSetOwned) {
	429	delete sset;
	430	}
	431	}
	432
	433	#endif
	434	#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
	435