X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/f3c0d7a59d99c2a94c6b8822291f0e42be3773c9..c5116b9f5a666b9d59f443b3770acd6ef64dc6c3:/icuSources/test/intltest/rbbitst.cpp?ds=sidebyside diff --git a/icuSources/test/intltest/rbbitst.cpp b/icuSources/test/intltest/rbbitst.cpp index 92958e72..ebd12364 100644 --- a/icuSources/test/intltest/rbbitst.cpp +++ b/icuSources/test/intltest/rbbitst.cpp @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include "unicode/brkiter.h" #include "unicode/localpointer.h" @@ -39,10 +41,12 @@ #include "cstr.h" #include "intltest.h" #include "rbbitst.h" +#include "rbbidata.h" #include "utypeinfo.h" // for 'typeid' to work #include "uvector.h" #include "uvectr32.h" + #if !UCONFIG_NO_FILTERED_BREAK_ITERATION #include "unicode/filteredbrk.h" #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION @@ -53,7 +57,6 @@ #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} - //--------------------------------------------- // runIndexedTest //--------------------------------------------- @@ -74,10 +77,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha #if !UCONFIG_NO_FILE_IO TESTCASE_AUTO(TestBug4153072); #endif - TESTCASE_AUTO(TestStatusReturn); #if !UCONFIG_NO_FILE_IO TESTCASE_AUTO(TestUnicodeFiles); - TESTCASE_AUTO(TestEmptyString); #endif TESTCASE_AUTO(TestGetAvailableLocales); TESTCASE_AUTO(TestGetDisplayName); @@ -107,151 +108,16 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha TESTCASE_AUTO(TestBug12918); TESTCASE_AUTO(TestBug12932); TESTCASE_AUTO(TestEmoji); + TESTCASE_AUTO(TestBug12519); + TESTCASE_AUTO(TestBug12677); + TESTCASE_AUTO(TestTableRedundancies); + TESTCASE_AUTO(TestBug13447); + TESTCASE_AUTO(TestReverse); + TESTCASE_AUTO(TestBug13692); TESTCASE_AUTO_END; } -//--------------------------------------------------------------------------- -// -// class BITestData Holds a set of Break iterator test data and results -// Includes -// - the string data to be broken -// - a vector of the expected break positions. -// - a vector of source line numbers for the data, -// (to help see where errors occured.) -// - The expected break tag values. -// - Vectors of actual break positions and tag values. -// - Functions for comparing actual with expected and -// reporting errors. -// -//---------------------------------------------------------------------------- -class BITestData { -public: - UnicodeString fDataToBreak; - UVector fExpectedBreakPositions; - UVector fExpectedTags; - UVector fLineNum; - UVector fActualBreakPositions; // Test Results. - UVector fActualTags; - - BITestData(UErrorCode &status); - void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status); - void checkResults(const char *heading, RBBITest *test); - void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx); - void clearResults(); -}; - -// -// Constructor. -// -BITestData::BITestData(UErrorCode &status) -: fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), - fActualTags(status) -{ -} - -// -// addDataChunk. Add a section (non-breaking) piece if data to the test data. -// The macro form collects the line number, which is helpful -// when tracking down failures. -// -// A null data item is inserted at the start of each test's data -// to put the starting zero into the data list. The position saved for -// each non-null item is its ending position. -// -#define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); -void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { - if (U_FAILURE(status)) {return;} - if (data != NULL) { - fDataToBreak.append(CharsToUnicodeString(data)); - } - fExpectedBreakPositions.addElement(fDataToBreak.length(), status); - fExpectedTags.addElement(tag, status); - fLineNum.addElement(lineNum, status); -} - - -// -// checkResults. Compare the actual and expected break positions, report any differences. -// -void BITestData::checkResults(const char *heading, RBBITest *test) { - int32_t expectedIndex = 0; - int32_t actualIndex = 0; - - for (;;) { - // If we've run through both the expected and actual results vectors, we're done. - // break out of the loop. - if (expectedIndex >= fExpectedBreakPositions.size() && - actualIndex >= fActualBreakPositions.size()) { - break; - } - - - if (expectedIndex >= fExpectedBreakPositions.size()) { - err(heading, test, expectedIndex-1, actualIndex); - actualIndex++; - continue; - } - - if (actualIndex >= fActualBreakPositions.size()) { - err(heading, test, expectedIndex, actualIndex-1); - expectedIndex++; - continue; - } - - if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { - err(heading, test, expectedIndex, actualIndex); - // Try to resync the positions of the indices, to avoid a rash of spurious erros. - if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { - actualIndex++; - } else { - expectedIndex++; - } - continue; - } - - if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { - test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d", - heading, fLineNum.elementAt(expectedIndex), - fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); - } - - actualIndex++; - expectedIndex++; - } -} - -// -// err - An error was found. Report it, along with information about where the -// incorrectly broken test data appeared in the source file. -// -void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) -{ - int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); - int32_t actual = fActualBreakPositions.elementAti(actualIdx); - int32_t o = 0; - int32_t line = fLineNum.elementAti(expectedIdx); - if (expectedIdx > 0) { - // The line numbers are off by one because a premature break occurs somewhere - // within the previous item, rather than at the start of the current (expected) item. - // We want to report the offset of the unexpected break from the start of - // this previous item. - o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); - } - if (actual < expected) { - test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); - } else { - test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected); - } -} - - -void BITestData::clearResults() { - fActualBreakPositions.removeAllElements(); - fActualTags.removeAllElements(); -} - - //-------------------------------------------------------------------------------------- // // RBBITest constructor and destructor @@ -266,51 +132,6 @@ RBBITest::RBBITest() { RBBITest::~RBBITest() { } -//----------------------------------------------------------------------------------- -// -// Test for status {tag} return value from break rules. -// TODO: a more thorough test. -// -//----------------------------------------------------------------------------------- -void RBBITest::TestStatusReturn() { - UnicodeString rulesString1("$Letters = [:L:];\n" - "$Numbers = [:N:];\n" - "$Letters+{1};\n" - "$Numbers+{2};\n" - "Help\\ /me\\!{4};\n" - "[^$Letters $Numbers];\n" - "!.*;\n", -1, US_INV); - UnicodeString testString1 = "abc123..abc Help me Help me!"; - // 01234567890123456789012345678 - int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; - int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; - - UErrorCode status=U_ZERO_ERROR; - UParseError parseError; - - LocalPointer bi(new RuleBasedBreakIterator(rulesString1, parseError, status)); - if(U_FAILURE(status)) { - dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__, u_errorName(status)); - return; - } - int32_t pos; - int32_t i = 0; - bi->setText(testString1); - for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { - if (pos != bounds1[i]) { - errln("%s:%d expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos); - break; - } - - int tag = bi->getRuleStatus(); - if (tag != brkStatus[i]) { - errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag); - break; - } - i++; - } -} - static void printStringBreaks(UText *tstr, int expected[], int expectedCount) { UErrorCode status = U_ZERO_ERROR; @@ -318,7 +139,7 @@ static void printStringBreaks(UText *tstr, int expected[], int expectedCount) { printf("code alpha extend alphanum type word sent line name\n"); int nextExpectedIndex = 0; utext_setNativeIndex(tstr, 0); - for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) { + for (int j = 0; j < static_cast(utext_nativeLength(tstr)); j=static_cast(utext_getNativeIndex(tstr))) { if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) { printf("------------------------------------------------ %d\n", j); ++nextExpectedIndex; @@ -391,277 +212,12 @@ void RBBITest::TestBug3818() { delete bi; } -//---------------------------------------------------------------------------- -// -// generalIteratorTest Given a break iterator and a set of test data, -// Run the tests and report the results. -// -//---------------------------------------------------------------------------- -void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) -{ - - bi.setText(td.fDataToBreak); - - testFirstAndNext(bi, td); - - testLastAndPrevious(bi, td); - - testFollowing(bi, td); - testPreceding(bi, td); - testIsBoundary(bi, td); - doMultipleSelectionTest(bi, td); -} - - -// -// testFirstAndNext. Run the iterator forwards in the obvious first(), next() -// kind of loop. -// -void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) -{ - UErrorCode status = U_ZERO_ERROR; - int32_t p; - int32_t lastP = -1; - int32_t tag; - - logln("Test first and next"); - bi.setText(td.fDataToBreak); - td.clearResults(); - - for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { - td.fActualBreakPositions.addElement(p, status); // Save result. - tag = bi.getRuleStatus(); - td.fActualTags.addElement(tag, status); - if (p <= lastP) { - // If the iterator is not making forward progress, stop. - // No need to raise an error here, it'll be detected in the normal check of results. - break; - } - lastP = p; - } - td.checkResults("testFirstAndNext", this); -} - - -// -// TestLastAndPrevious. Run the iterator backwards, starting with last(). -// -void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) -{ - UErrorCode status = U_ZERO_ERROR; - int32_t p; - int32_t lastP = 0x7ffffffe; - int32_t tag; - - logln("Test last and previous"); - bi.setText(td.fDataToBreak); - td.clearResults(); - - for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { - // Save break position. Insert it at start of vector of results, shoving - // already-saved results further towards the end. - td.fActualBreakPositions.insertElementAt(p, 0, status); - // bi.previous(); // TODO: Why does this fix things up???? - // bi.next(); - tag = bi.getRuleStatus(); - td.fActualTags.insertElementAt(tag, 0, status); - if (p >= lastP) { - // If the iterator is not making progress, stop. - // No need to raise an error here, it'll be detected in the normal check of results. - break; - } - lastP = p; - } - td.checkResults("testLastAndPrevious", this); -} - - -void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) -{ - UErrorCode status = U_ZERO_ERROR; - int32_t p; - int32_t tag; - int32_t lastP = -2; // A value that will never be returned as a break position. - // cannot be -1; that is returned for DONE. - int i; - - logln("testFollowing():"); - bi.setText(td.fDataToBreak); - td.clearResults(); - - // Save the starting point, since we won't get that out of following. - p = bi.first(); - td.fActualBreakPositions.addElement(p, status); // Save result. - tag = bi.getRuleStatus(); - td.fActualTags.addElement(tag, status); - - for (i = 0; i <= td.fDataToBreak.length()+1; i++) { - p = bi.following(i); - if (p != lastP) { - if (p == RuleBasedBreakIterator::DONE) { - break; - } - // We've reached a new break position. Save it. - td.fActualBreakPositions.addElement(p, status); // Save result. - tag = bi.getRuleStatus(); - td.fActualTags.addElement(tag, status); - lastP = p; - } - } - // The loop normally exits by means of the break in the middle. - // Make sure that the index was at the correct position for the break iterator to have - // returned DONE. - if (i != td.fDataToBreak.length()) { - errln("testFollowing(): iterator returned DONE prematurely."); - } - - // Full check of all results. - td.checkResults("testFollowing", this); -} - - - -void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { - UErrorCode status = U_ZERO_ERROR; - int32_t p; - int32_t tag; - int32_t lastP = 0x7ffffffe; - int i; - - logln("testPreceding():"); - bi.setText(td.fDataToBreak); - td.clearResults(); - - p = bi.last(); - td.fActualBreakPositions.addElement(p, status); - tag = bi.getRuleStatus(); - td.fActualTags.addElement(tag, status); - - for (i = td.fDataToBreak.length(); i>=-1; i--) { - p = bi.preceding(i); - if (p != lastP) { - if (p == RuleBasedBreakIterator::DONE) { - break; - } - // We've reached a new break position. Save it. - td.fActualBreakPositions.insertElementAt(p, 0, status); - lastP = p; - tag = bi.getRuleStatus(); - td.fActualTags.insertElementAt(tag, 0, status); - } - } - // The loop normally exits by means of the break in the middle. - // Make sure that the index was at the correct position for the break iterator to have - // returned DONE. - if (i != 0) { - errln("testPreceding(): iterator returned DONE prematurely."); - } - - // Full check of all results. - td.checkResults("testPreceding", this); -} - - - -void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { - UErrorCode status = U_ZERO_ERROR; - int i; - int32_t tag; - - logln("testIsBoundary():"); - bi.setText(td.fDataToBreak); - td.clearResults(); - - for (i = 0; i <= td.fDataToBreak.length(); i++) { - if (bi.isBoundary(i)) { - td.fActualBreakPositions.addElement(i, status); // Save result. - tag = bi.getRuleStatus(); - td.fActualTags.addElement(tag, status); - } - } - td.checkResults("testIsBoundary: ", this); -} - - - -void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) -{ - iterator.setText(td.fDataToBreak); - - RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); - int32_t offset = iterator.first(); - int32_t testOffset; - int32_t count = 0; - - logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); - - if (*testIterator != iterator) - errln("clone() or operator!= failed: two clones compared unequal"); - - do { - testOffset = testIterator->first(); - testOffset = testIterator->next(count); - if (offset != testOffset) - errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); - - if (offset != RuleBasedBreakIterator::DONE) { - count++; - offset = iterator.next(); - - if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { - errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); - if (count > 10000 || offset == -1) { - errln("operator== failed too many times. Stopping test."); - if (offset == -1) { - errln("Does (RuleBasedBreakIterator::DONE == -1)?"); - } - return; - } - } - } - } while (offset != RuleBasedBreakIterator::DONE); - - // now do it backwards... - offset = iterator.last(); - count = 0; - - do { - testOffset = testIterator->last(); - testOffset = testIterator->next(count); // next() with a negative arg is same as previous - if (offset != testOffset) - errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); - - if (offset != RuleBasedBreakIterator::DONE) { - count--; - offset = iterator.previous(); - } - } while (offset != RuleBasedBreakIterator::DONE); - - delete testIterator; -} - //--------------------------------------------- // // other tests // //--------------------------------------------- -void RBBITest::TestEmptyString() -{ - UnicodeString text = ""; - UErrorCode status = U_ZERO_ERROR; - - BITestData x(status); - ADD_DATACHUNK(x, "", 0, status); // Break at start of data - RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) - { - errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); - return; - } - generalIteratorTest(*bi, x); - delete bi; -} void RBBITest::TestGetAvailableLocales() { @@ -1019,8 +575,9 @@ void RBBITest::executeTest(TestParams *t, UErrorCode &status) { // // Run the iterator backwards, verify that the same breaks are found. // - prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen. - for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { + prevBP = static_cast(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen. + bp = t->bi->last(); + while (bp != BreakIterator::DONE) { if (prevBP == bp) { // Fail for lack of progress. errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d", @@ -1058,6 +615,7 @@ void RBBITest::executeTest(TestParams *t, UErrorCode &status) { } prevBP = bp; + bp = t->bi->previous(); } // Verify that there were no missed breaks prior to the last one found @@ -1081,10 +639,10 @@ void RBBITest::executeTest(TestParams *t, UErrorCode &status) { } // Check following() - for (i=0; i < utext_nativeLength(t->textToBreak); i++) { + for (i=0; i < static_cast(utext_nativeLength(t->textToBreak)); i++) { int32_t actualBreak = t->bi->following(i); int32_t expectedBreak = BreakIterator::DONE; - for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) { + for (int32_t j=i+1; j <= static_cast(utext_nativeLength(t->textToBreak)); j++) { if (t->getExpectedBreak(j) != 0) { expectedBreak = j; break; @@ -1098,7 +656,7 @@ void RBBITest::executeTest(TestParams *t, UErrorCode &status) { } // Check preceding() - for (i=utext_nativeLength(t->textToBreak); i>=0; i--) { + for (i=static_cast(utext_nativeLength(t->textToBreak)); i>=0; i--) { int32_t actualBreak = t->bi->preceding(i); int32_t expectedBreak = BreakIterator::DONE; @@ -1108,7 +666,7 @@ void RBBITest::executeTest(TestParams *t, UErrorCode &status) { // Therefore, start looking at the expected break data not at i-1, but at // the start of code point index - 1. utext_setNativeIndex(t->textToBreak, i); - int32_t j = utext_getNativeIndex(t->textToBreak) - 1; + int32_t j = static_cast(utext_getNativeIndex(t->textToBreak) - 1); for (; j >= 0; j--) { if (t->getExpectedBreak(j) != 0) { expectedBreak = j; @@ -1131,34 +689,27 @@ void RBBITest::TestExtended() { UErrorCode status = U_ZERO_ERROR; Locale locale(""); - UnicodeString rules; TestParams tp(status); - RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE(""), 0, status); + RegexMatcher localeMatcher(UnicodeString(u""), 0, status); if (U_FAILURE(status)) { dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); } - // // Open and read the test data file. // const char *testDataDirectory = IntlTest::getSourceTestData(status); - char testFileName[1000]; - if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { - errln("Can't open test data. Path too long."); - return; - } - strcpy(testFileName, testDataDirectory); - strcat(testFileName, "rbbitst.txt"); + CharString testFileName(testDataDirectory, -1, status); + testFileName.append("rbbitst.txt", -1, status); int len; - UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); + UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status); if (U_FAILURE(status)) { - return; /* something went wrong, error already output */ + errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status)); + return; } - bool skipTest = false; // Skip this test? // @@ -1170,7 +721,8 @@ void RBBITest::TestExtended() { PARSE_COMMENT, PARSE_TAG, PARSE_DATA, - PARSE_NUM + PARSE_NUM, + PARSE_RULES } parseState = PARSE_TAG; @@ -1181,7 +733,10 @@ void RBBITest::TestExtended() { int32_t column = 0; int32_t charIdx = 0; - int32_t tagValue = 0; // The numeric value of a tag. + int32_t tagValue = 0; // The numeric value of a tag. + + UnicodeString rules; // Holds rules from a ... block + int32_t rulesFirstLine; // Line number of the start of current block for (charIdx = 0; charIdx < len; ) { status = U_ZERO_ERROR; @@ -1215,41 +770,50 @@ void RBBITest::TestExtended() { if (u_isUWhiteSpace(c)) { break; } - if (testString.compare(charIdx-1, 6, "") == 0) { + if (testString.compare(charIdx-1, 6, u"") == 0) { delete tp.bi; tp.bi = BreakIterator::createWordInstance(locale, status); skipTest = false; charIdx += 5; break; } - if (testString.compare(charIdx-1, 6, "") == 0) { + if (testString.compare(charIdx-1, 6, u"") == 0) { delete tp.bi; tp.bi = BreakIterator::createCharacterInstance(locale, status); skipTest = false; charIdx += 5; break; } - if (testString.compare(charIdx-1, 6, "") == 0) { + if (testString.compare(charIdx-1, 6, u"") == 0) { delete tp.bi; tp.bi = BreakIterator::createLineInstance(locale, status); skipTest = false; charIdx += 5; break; } - if (testString.compare(charIdx-1, 6, "") == 0) { + if (testString.compare(charIdx-1, 6, u"") == 0) { delete tp.bi; tp.bi = BreakIterator::createSentenceInstance(locale, status); skipTest = false; charIdx += 5; break; } - if (testString.compare(charIdx-1, 7, "") == 0) { + if (testString.compare(charIdx-1, 7, u"<title>") == 0) { delete tp.bi; tp.bi = BreakIterator::createTitleInstance(locale, status); charIdx += 6; break; } + if (testString.compare(charIdx-1, 7, u"<rules>") == 0 || + testString.compare(charIdx-1, 10, u"<badrules>") == 0) { + charIdx = testString.indexOf(u'>', charIdx) + 1; + parseState = PARSE_RULES; + rules.remove(); + rulesFirstLine = lineNum; + break; + } + // <locale loc_name> localeMatcher.reset(testString); if (localeMatcher.lookingAt(charIdx-1, status)) { @@ -1261,7 +825,7 @@ void RBBITest::TestExtended() { TEST_ASSERT_SUCCESS(status); break; } - if (testString.compare(charIdx-1, 6, "<data>") == 0) { + if (testString.compare(charIdx-1, 6, u"<data>") == 0) { parseState = PARSE_DATA; charIdx += 5; tp.dataToBreak = ""; @@ -1278,6 +842,33 @@ void RBBITest::TestExtended() { } break; + case PARSE_RULES: + if (testString.compare(charIdx-1, 8, u"</rules>") == 0) { + charIdx += 7; + parseState = PARSE_TAG; + delete tp.bi; + UParseError pe; + tp.bi = new RuleBasedBreakIterator(rules, pe, status); + skipTest = U_FAILURE(status); + if (U_FAILURE(status)) { + errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.", + rulesFirstLine + pe.line - 1, u_errorName(status)); + } + } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) { + charIdx += 10; + parseState = PARSE_TAG; + UErrorCode ec = U_ZERO_ERROR; + UParseError pe; + RuleBasedBreakIterator bi(rules, pe, ec); + if (U_SUCCESS(ec)) { + errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.", + rulesFirstLine + pe.line - 1); + } + } else { + rules.append(c); + } + break; + case PARSE_DATA: if (c == u'\u2022') { // u'•' int32_t breakIdx = tp.dataToBreak.length(); @@ -1290,7 +881,7 @@ void RBBITest::TestExtended() { break; } - if (testString.compare(charIdx-1, 7, "</data>") == 0) { + if (testString.compare(charIdx-1, 7, u"</data>") == 0) { // Add final entry to mappings from break location to source file position. // Need one extra because last break position returned is after the // last char in the data, not at the last char. @@ -1316,7 +907,7 @@ void RBBITest::TestExtended() { break; } - if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { + if (testString.compare(charIdx-1, 3, u"\\N{") == 0) { // Named character, e.g. \N{COMBINING GRAVE ACCENT} // Get the code point from the name and insert it into the test data. // (Damn, no API takes names in Unicode !!! @@ -1355,8 +946,7 @@ void RBBITest::TestExtended() { - - if (testString.compare(charIdx-1, 2, "<>") == 0) { + if (testString.compare(charIdx-1, 2, u"<>") == 0) { charIdx++; int32_t breakIdx = tp.dataToBreak.length(); tp.expectedBreaks->setSize(breakIdx+1); @@ -1477,6 +1067,18 @@ void RBBITest::TestExtended() { } + // Reached end of test file. Raise an error if parseState indicates that we are + // within a block that should have been terminated. + + if (parseState == PARSE_RULES) { + errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.", + lineNum, rulesFirstLine); + } + if (parseState == PARSE_DATA) { + errln("rbbitst.txt:%d <data> block not closed.", lineNum); + } + + end_test: delete [] testFile; #endif @@ -1573,7 +1175,7 @@ UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char fileSize = ftell(f); fileBuf = new char[fileSize]; fseek(f, 0, SEEK_SET); - amt_read = fread(fileBuf, 1, fileSize, f); + amt_read = static_cast<int>(fread(fileBuf, 1, fileSize, f)); if (amt_read != fileSize || fileSize <= 0) { errln("Error reading test data file."); goto cleanUpAndReturn; @@ -1681,35 +1283,57 @@ void RBBITest::TestUnicodeFiles() { // Check for test cases from the Unicode test data files that are known to fail -// and should be skipped because ICU is not yet able to fully implement the spec. -// See ticket #7270. +// and should be skipped as known issues because ICU does not fully implement +// the Unicode specifications, or because ICU includes tailorings that differ from +// the Unicode standard. +// +// Test cases are identified by the test data sequence, which tends to be more stable +// across Unicode versions than the test file line numbers. +// +// The test case with ticket "10666" is a dummy, included as an example. UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) { static struct TestCase { + const char *fTicketNum; const char *fFileName; const UChar *fString; - } badTestCases[] = { // Line Numbers from Unicode 7.0.0 file. - {"LineBreakTest.txt", u"\u200B\u0020}"}, // Line 5198 - {"LineBreakTest.txt", u"\u200B\u0020)"}, // Line 5202 - {"LineBreakTest.txt", u"\u200B\u0020!"}, // Line 5214 - {"LineBreakTest.txt", u"\u200B\u0020,"}, // Line 5246 - {"LineBreakTest.txt", u"\u200B\u0020/"}, // Line 5298 - {"LineBreakTest.txt", u"\u200B\u0020\u2060"}, // Line 5302 - // Line Numbers from pre-release verion of GraphemeBreakTest-10.0.0.txt - {"GraphemeBreakTest.txt", u"\u200D\u2640"}, // Line 656, old GB 11 test ZWJ x GAZ - {"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG - {"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier - - // Line Numbers from pre-release verion of WordBreakTest-10.0.0.txt - {"WordBreakTest.txt", u"\u200D\u261D"}, // Line 1356, ZWJ x EmojiNRK - {"WordBreakTest.txt", u"\u200D\U0001F3FB"}, // Line 1358, ZWJ x EmojiNRK + } badTestCases[] = { + {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration. + // Issue 8151, move the Finnish tailoring of the line break of hyphens to root. + // This probably ultimately wants to be resolved by updating UAX-14, but in the mean time + // ICU is out of sync with Unicode. + {"8151", "LineBreakTest.txt", u"-#"}, + {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0023"}, + {"8151", "LineBreakTest.txt", u"\u002d\u00a7"}, + {"8151", "LineBreakTest.txt", u"\u002d\u0308\u00a7"}, + {"8151", "LineBreakTest.txt", u"\u002d\U00050005"}, + {"8151", "LineBreakTest.txt", u"\u002d\u0308\U00050005"}, + {"8151", "LineBreakTest.txt", u"\u002d\u0e01"}, + {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0e01"}, + + // Issue ICU-12017 Improve line break around numbers + {"12017", "LineBreakTest.txt", u"\u002C\u0030"}, // ",0" + {"12017", "LineBreakTest.txt", u"\u002C\u0308\u0030"}, + {"12017", "LineBreakTest.txt", u"find .com"}, + {"12017", "LineBreakTest.txt", u"equals .35 cents"}, + {"12017", "LineBreakTest.txt", u"a.2 "}, + {"12017", "LineBreakTest.txt", u"a.2 \u0915"}, + {"12017", "LineBreakTest.txt", u"a.2 \u672C"}, + {"12017", "LineBreakTest.txt", u"a.2\u3000\u672C"}, + {"12017", "LineBreakTest.txt", u"a.2\u3000\u307E"}, + {"12017", "LineBreakTest.txt", u"a.2\u3000\u0033"}, + {"12017", "LineBreakTest.txt", u"A.1 \uBABB"}, + {"12017", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"}, + {"12017", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"}, + {"12017", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"}, + {"12017", "LineBreakTest.txt", u"a.2\u3000\u300C"}, }; for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) { const TestCase &badCase = badTestCases[n]; if (!strcmp(fileName, badCase.fFileName) && testCase == UnicodeString(badCase.fString)) { - return logKnownIssue("7270"); + return logKnownIssue(badCase.fTicketNum); } } return FALSE; @@ -1957,34 +1581,6 @@ static uint32_t m_rand() } -// -// Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267 -// -static const char16_t *gExtended_Pict = u"[" - "\\U0001F774-\\U0001F77F\\U00002700-\\U00002701\\U00002703-\\U00002704\\U0000270E\\U00002710-\\U00002711\\U00002765-\\U00002767" - "\\U0001F030-\\U0001F093\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5" - "\\U0001F260-\\U0001F265\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F25F" - "\\U0001F266-\\U0001F2FF\\U0001F7D5-\\U0001F7FF\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F" - "\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6" - "\\U0001F4FE\\U0001F53E-\\U0001F548\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586" - "\\U0001F588-\\U0001F589\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7" - "\\U0001F5A9-\\U0001F5B0\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB" - "\\U0001F5DF-\\U0001F5E0\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9" - "\\U00002605\\U00002607-\\U0000260D\\U0000260F-\\U00002610\\U00002612\\U00002616-\\U00002617\\U00002619-\\U0000261C" - "\\U0000261E-\\U0000261F\\U00002621\\U00002624-\\U00002625\\U00002627-\\U00002629\\U0000262B-\\U0000262D\\U00002630-\\U00002637" - "\\U0000263B-\\U00002647\\U00002654-\\U0000265F\\U00002661-\\U00002662\\U00002664\\U00002667\\U00002669-\\U0000267A" - "\\U0000267C-\\U0000267E\\U00002680-\\U00002691\\U00002695\\U00002698\\U0000269A\\U0000269D-\\U0000269F\\U000026A2-\\U000026A9" - "\\U000026AC-\\U000026AF\\U000026B2-\\U000026BC\\U000026BF-\\U000026C3\\U000026C6-\\U000026C7\\U000026C9-\\U000026CD" - "\\U000026D0\\U000026D2\\U000026D5-\\U000026E8\\U000026EB-\\U000026EF\\U000026F6\\U000026FB-\\U000026FC\\U000026FE-\\U000026FF" - "\\U00002388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5" - "\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F" - "\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF\\U0001F900-\\U0001F90B\\U0001F91F\\U0001F928-\\U0001F92F" - "\\U0001F931-\\U0001F932\\U0001F94C\\U0001F95F-\\U0001F96B\\U0001F992-\\U0001F997\\U0001F9D0-\\U0001F9E6\\U0001F90C-\\U0001F90F" - "\\U0001F93F\\U0001F94D-\\U0001F94F\\U0001F96C-\\U0001F97F\\U0001F998-\\U0001F9BF\\U0001F9C1-\\U0001F9CF\\U0001F9E7-\\U0001F9FF" - "\\U0001F6C6-\\U0001F6CA\\U0001F6D3-\\U0001F6D4\\U0001F6E6-\\U0001F6E8\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6F7-\\U0001F6F8" - "\\U0001F6D5-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F9-\\U0001F6FF" - "]"; - //------------------------------------------------------------------------------------------ // // class RBBICharMonkey Character (Grapheme Cluster) specific implementation @@ -2014,11 +1610,7 @@ private: UnicodeSet *fLVSet; UnicodeSet *fLVTSet; UnicodeSet *fHangulSet; - UnicodeSet *fEmojiBaseSet; - UnicodeSet *fEmojiModifierSet; UnicodeSet *fExtendedPictSet; - UnicodeSet *fEBGSet; - UnicodeSet *fEmojiNRKSet; UnicodeSet *fAnySet; const UnicodeString *fText; @@ -2050,12 +1642,7 @@ RBBICharMonkey::RBBICharMonkey() { fHangulSet->addAll(*fLVSet); fHangulSet->addAll(*fLVTSet); - fEmojiBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status); - fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status); - fExtendedPictSet = new UnicodeSet(gExtended_Pict, status); - fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status); - fEmojiNRKSet = new UnicodeSet(UNICODE_STRING_SIMPLE( - "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status); + fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status); fAnySet = new UnicodeSet(0, 0x10ffff); fSets = new UVector(status); @@ -2069,12 +1656,8 @@ RBBICharMonkey::RBBICharMonkey() { fSets->addElement(fSpacingSet, status); fSets->addElement(fHangulSet, status); fSets->addElement(fAnySet, status); - fSets->addElement(fEmojiBaseSet, status); - fSets->addElement(fEmojiModifierSet, status); fSets->addElement(fZWJSet, status); fSets->addElement(fExtendedPictSet, status); - fSets->addElement(fEBGSet, status); - fSets->addElement(fEmojiNRKSet,status); if (U_FAILURE(status)) { deferredStatus = status; } @@ -2194,22 +1777,8 @@ int32_t RBBICharMonkey::next(int32_t prevPos) { continue; } - // Rule (GB10) (Emoji_Base | EBG) Extend * x Emoji_Modifier - if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) { - continue; - } - if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) && - fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) { - continue; - } - - // Rule (GB11) (Glue_After_ZWJ | Emoji) Extend * ZWJ x (Glue_After_ZWJ | Emoji) - if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) && - (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) { - continue; - } - if ((fExtendedPictSet->contains(cBase) || fEmojiNRKSet->contains(cBase)) && fExtendSet->contains(c0) && fZWJSet->contains(c1) && - (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) { + // Rule (GB11) Extended_Pictographic Extend * ZWJ x Extended_Pictographic + if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) { continue; } @@ -2218,7 +1787,7 @@ int32_t RBBICharMonkey::next(int32_t prevPos) { // a break if there are three or more contiguous RIs. If there are // only two, a break following will occur via other rules, and will include // any trailing extend characters, which is needed behavior. - if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1) + if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { break; } @@ -2256,12 +1825,8 @@ RBBICharMonkey::~RBBICharMonkey() { delete fLVTSet; delete fHangulSet; delete fAnySet; - delete fEmojiBaseSet; - delete fEmojiModifierSet; delete fZWJSet; delete fExtendedPictSet; - delete fEBGSet; - delete fEmojiNRKSet; } //------------------------------------------------------------------------------------------ @@ -2297,13 +1862,10 @@ private: UnicodeSet *fOtherSet; UnicodeSet *fExtendSet; UnicodeSet *fExtendNumLetSet; + UnicodeSet *fWSegSpaceSet; UnicodeSet *fDictionarySet; - UnicodeSet *fEBaseSet; - UnicodeSet *fEBGSet; - UnicodeSet *fEModifierSet; UnicodeSet *fZWJSet; UnicodeSet *fExtendedPictSet; - UnicodeSet *fEmojiNRKSet; const UnicodeString *fText; }; @@ -2327,18 +1889,14 @@ RBBIWordMonkey::RBBIWordMonkey() fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status); fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]", status); fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status); - fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status); + fNumericSet = new UnicodeSet(u"[[\\p{Word_Break = Numeric}][\\uff10-\\uff19]]", status); fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status); fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status); fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}]", status); + fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status); - fEBaseSet = new UnicodeSet(u"[[\\p{Word_Break = EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]", status); - fEBGSet = new UnicodeSet(u"[\\p{Word_Break = EBG}]", status); - fEModifierSet = new UnicodeSet(u"[\\p{Word_Break = EM}]", status); fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status); - fExtendedPictSet = new UnicodeSet(gExtended_Pict, status); - fEmojiNRKSet = new UnicodeSet( - u"[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]", status); + fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status); fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status); fDictionarySet->addAll(*fKatakanaSet); @@ -2366,15 +1924,12 @@ RBBIWordMonkey::RBBIWordMonkey() fOtherSet->removeAll(*fMidNumSet); fOtherSet->removeAll(*fNumericSet); fOtherSet->removeAll(*fExtendNumLetSet); + fOtherSet->removeAll(*fWSegSpaceSet); fOtherSet->removeAll(*fFormatSet); fOtherSet->removeAll(*fExtendSet); fOtherSet->removeAll(*fRegionalIndicatorSet); - fOtherSet->removeAll(*fEBaseSet); - fOtherSet->removeAll(*fEBGSet); - fOtherSet->removeAll(*fEModifierSet); fOtherSet->removeAll(*fZWJSet); fOtherSet->removeAll(*fExtendedPictSet); - fOtherSet->removeAll(*fEmojiNRKSet); // Inhibit dictionary characters from being tested at all. fOtherSet->removeAll(*fDictionarySet); @@ -2398,13 +1953,10 @@ RBBIWordMonkey::RBBIWordMonkey() fSets->addElement(fExtendSet, status); fSets->addElement(fOtherSet, status); fSets->addElement(fExtendNumLetSet, status); + fSets->addElement(fWSegSpaceSet, status); - fSets->addElement(fEBaseSet, status); - fSets->addElement(fEBGSet, status); - fSets->addElement(fEModifierSet, status); fSets->addElement(fZWJSet, status); fSets->addElement(fExtendedPictSet, status); - fSets->addElement(fEmojiNRKSet, status); if (U_FAILURE(status)) { deferredStatus = status; @@ -2483,12 +2035,17 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) { break; }; - // Rule (3c) ZWJ x (Glue_after_ZWJ | EmojiNRK). + // Rule (3c) ZWJ x Extended_Pictographic // Not ignoring extend chars, so peek into input text to // get the potential ZWJ, the character immediately preceding c2. // Sloppy UChar32 indexing: p2-1 may reference trail half // but char32At will get the full code point. - if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) { + if (fZWJSet->contains(fText->char32At(p2-1)) && fExtendedPictSet->contains(c2)) { + continue; + } + + // Rule (3d) Keep horizontal whitespace together. + if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) { continue; } @@ -2582,11 +2139,6 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) { continue; } - // WB 14 (E_Base | EBG) x E_Modifier - if ((fEBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) { - continue; - } - // Rule 15 - 17 Group pairs of Regional Indicators. if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) { break; @@ -2626,15 +2178,12 @@ RBBIWordMonkey::~RBBIWordMonkey() { delete fFormatSet; delete fExtendSet; delete fExtendNumLetSet; + delete fWSegSpaceSet; delete fRegionalIndicatorSet; delete fDictionarySet; delete fOtherSet; - delete fEBaseSet; - delete fEBGSet; - delete fEModifierSet; delete fZWJSet; delete fExtendedPictSet; - delete fEmojiNRKSet; } @@ -2996,6 +2545,7 @@ private: UnicodeSet *fB2; UnicodeSet *fBA; UnicodeSet *fBB; + UnicodeSet *fHH; UnicodeSet *fHY; UnicodeSet *fH2; UnicodeSet *fH3; @@ -3023,9 +2573,7 @@ private: UnicodeSet *fXX; UnicodeSet *fEB; UnicodeSet *fEM; - UnicodeSet *fZJ; - UnicodeSet *fExtendedPict; - UnicodeSet *fEmojiNRK; + UnicodeSet *fZWJ; BreakIterator *fCharBI; const UnicodeString *fText; @@ -3062,6 +2610,7 @@ RBBILineMonkey::RBBILineMonkey() : fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); + fHH = new UnicodeSet(); fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); @@ -3090,9 +2639,7 @@ RBBILineMonkey::RBBILineMonkey() : fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status); fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status); - fZJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status); - fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status); - fExtendedPict = new UnicodeSet(gExtended_Pict, status); + fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status); if (U_FAILURE(status)) { deferredStatus = status; @@ -3104,7 +2651,9 @@ RBBILineMonkey::RBBILineMonkey() : fAL->addAll(*fSG); // Default behavior for SG is identical to AL. fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS. - fCM->addAll(*fZJ); // ZWJ behaves as a CM. + fCM->addAll(*fZWJ); // ZWJ behaves as a CM. + + fHH->add(u'\u2010'); // Hyphen, '‐' fSets->addElement(fBK, status); fSets->addElement(fCR, status); @@ -3146,14 +2695,13 @@ RBBILineMonkey::RBBILineMonkey() : fSets->addElement(fSG, status); fSets->addElement(fEB, status); fSets->addElement(fEM, status); - fSets->addElement(fZJ, status); - fSets->addElement(fExtendedPict, status); - fSets->addElement(fEmojiNRK, status); + fSets->addElement(fZWJ, status); const char *rules = "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?" "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?" + "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?" "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*" "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*" "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?" @@ -3332,18 +2880,50 @@ int32_t RBBILineMonkey::next(int32_t startPos) { } // LB 8 Break after zero width space - if (fZW->contains(prevChar)) { + // ZW SP* ÷ + // Scan backwards from prevChar for SP* ZW + tPos = prevPos; + while (tPos>0 && fSP->contains(fText->char32At(tPos))) { + tPos = fText->moveIndex32(tPos, -1); + } + if (fZW->contains(fText->char32At(tPos))) { break; } - // LB 8a ZWJ x (ID | ExtendedPict | Emoji) + // LB 25 Numbers + // Move this test up, before LB8a, because numbers can match a longer sequence that would + // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM) + if (fNumberMatcher->lookingAt(prevPos, status)) { + if (U_FAILURE(status)) { + break; + } + // Matched a number. But could have been just a single digit, which would + // not represent a "no break here" between prevChar and thisChar + int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num + if (numEndIdx > pos) { + // Number match includes at least our two chars being checked + if (numEndIdx > nextPos) { + // Number match includes additional chars. Update pos and nextPos + // so that next loop iteration will continue at the end of the number, + // checking for breaks between last char in number & whatever follows. + pos = nextPos = numEndIdx; + do { + pos = fText->moveIndex32(pos, -1); + thisChar = fText->char32At(pos); + } while (fCM->contains(thisChar)); + } + continue; + } + } + + // LB 8a ZWJ x // The monkey test's way of ignoring combining characters doesn't work // for this rule. ZJ is also a CM. Need to get the actual character // preceding "thisChar", not ignoring combining marks, possibly ZJ. { int32_t prevIdx = fText->moveIndex32(pos, -1); UChar32 prevC = fText->char32At(prevIdx); - if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) { + if (fZWJ->contains(prevC)) { continue; } } @@ -3374,17 +2954,12 @@ int32_t RBBILineMonkey::next(int32_t startPos) { continue; } - - // LB 13 Don't break before closings. - // NU x CL, NU x CP and NU x IS are not matched here so that they will - // fall into LB 17 and the more general number regular expression. // - if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) || - (!fNU->contains(prevChar) && fCP->contains(thisChar)) || - fEX->contains(thisChar) || - (!fNU->contains(prevChar) && fIS->contains(thisChar)) || - (!fNU->contains(prevChar) && fSY->contains(thisChar))) { + if (fCL->contains(thisChar) || + fCP->contains(thisChar) || + fEX->contains(thisChar) || + fSY->contains(thisChar)) { continue; } @@ -3392,7 +2967,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) { // Scan backwards, checking for this sequence. // The OP char could include combining marks, so we actually check for // OP CM* SP* - // Another Twist: The Rule 67 fixes may have changed a SP CM + // Another Twist: The Rule 9 fixes may have changed a SP CM // sequence into a ID char, so before scanning back through spaces, // verify that prevChar is indeed a space. The prevChar variable // may differ from fText[prevPos] @@ -3410,6 +2985,21 @@ int32_t RBBILineMonkey::next(int32_t startPos) { } + // LB 14a Break before an IS that begins a number and follows a space + if (nextPos < fText->length()) { + // note: UnicodeString::char32At(length) returns ffff, not distinguishable + // from a legit ffff character. So test length separately. + UChar32 nextChar = fText->char32At(nextPos); + if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) { + break; + } + } + + // LB14b Do not break before numeric separators, even after spaces. + if (fIS->contains(thisChar)) { + continue; + } + // LB 15 QU SP* x OP if (fOP->contains(thisChar)) { // Scan backwards from prevChar to see if it is preceded by QU CM* SP* @@ -3478,6 +3068,15 @@ int32_t RBBILineMonkey::next(int32_t startPos) { break; } + // LB 20.09 Don't break between Hyphens and letters if a break precedes the hyphen. + // Formerly this was a Finnish tailoring. + // Moved to root in ICU 63. This is an ICU customization, not in UAX-14. + // ^($HY | $HH) $AL; + if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) && + prevPosX2 == -1) { + continue; + } + // LB 21 if (fBA->contains(thisChar) || fHY->contains(thisChar) || @@ -3522,11 +3121,11 @@ int32_t RBBILineMonkey::next(int32_t startPos) { // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes. // PR x (ID | EB | EM) // (ID | EB | EM) x PO - if (fPR->contains(prevChar) && + if (fPR->contains(prevChar) && (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) { continue; } - if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && + if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fPO->contains(thisChar)) { continue; } @@ -3543,32 +3142,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) { continue; } - - - // LB 25 Numbers - if (fNumberMatcher->lookingAt(prevPos, status)) { - if (U_FAILURE(status)) { - break; - } - // Matched a number. But could have been just a single digit, which would - // not represent a "no break here" between prevChar and thisChar - int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num - if (numEndIdx > pos) { - // Number match includes at least our two chars being checked - if (numEndIdx > nextPos) { - // Number match includes additional chars. Update pos and nextPos - // so that next loop iteration will continue at the end of the number, - // checking for breaks between last char in number & whatever follows. - pos = nextPos = numEndIdx; - do { - pos = fText->moveIndex32(pos, -1); - thisChar = fText->char32At(pos); - } while (fCM->contains(thisChar)); - } - continue; - } - } - + // LB 25 numbers match, moved up, before LB 8a, // LB 26 Do not break a Korean syllable. if (fJL->contains(prevChar) && (fJL->contains(thisChar) || @@ -3626,12 +3200,16 @@ int32_t RBBILineMonkey::next(int32_t startPos) { continue; } - // LB30a RI RI <break> RI - // RI x RI + // LB30a RI RI ÷ RI + // RI x RI if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) { break; } if (fRI->contains(prevChar) && fRI->contains(thisChar)) { + // Two Regional Indicators have been paired. + // Over-write the trailing one (thisChar) to prevent it from forming another pair with a + // following RI. This is a hack. + thisChar = -1; continue; } @@ -3670,6 +3248,7 @@ RBBILineMonkey::~RBBILineMonkey() { delete fB2; delete fBA; delete fBB; + delete fHH; delete fHY; delete fH2; delete fH3; @@ -3698,9 +3277,7 @@ RBBILineMonkey::~RBBILineMonkey() { delete fXX; delete fEB; delete fEM; - delete fZJ; - delete fExtendedPict; - delete fEmojiNRK; + delete fZWJ; delete fCharBI; delete fNumberMatcher; @@ -3762,16 +3339,16 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { forward[count] = i; if (count < expectedcount && expected[count] != i) { - test->errln("break forward test failed: expected %d but got %d", - expected[count], i); + test->errln("%s:%d break forward test failed: expected %d but got %d", + __FILE__, __LINE__, expected[count], i); break; } count ++; } if (count != expectedcount) { printStringBreaks(ustr, expected, expectedcount); - test->errln("break forward test failed: missed %d match", - expectedcount - count); + test->errln("%s:%d break forward test failed: missed %d match", + __FILE__, __LINE__, expectedcount - count); return; } // testing boundaries @@ -3779,13 +3356,15 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, int j = expected[i - 1]; if (!bi->isBoundary(j)) { printStringBreaks(ustr, expected, expectedcount); - test->errln("isBoundary() failed. Expected boundary at position %d", j); + test->errln("%s:%d isBoundary() failed. Expected boundary at position %d", + __FILE__, __LINE__, j); return; } for (j = expected[i - 1] + 1; j < expected[i]; j ++) { if (bi->isBoundary(j)) { printStringBreaks(ustr, expected, expectedcount); - test->errln("isBoundary() failed. Not expecting boundary at position %d", j); + test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d", + __FILE__, __LINE__, j); return; } } @@ -3795,8 +3374,8 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, count --; if (forward[count] != i) { printStringBreaks(ustr, expected, expectedcount); - test->errln("happy break test previous() failed: expected %d but got %d", - forward[count], i); + test->errln("%s:%d happy break test previous() failed: expected %d but got %d", + __FILE__, __LINE__, forward[count], i); break; } } @@ -3811,9 +3390,12 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, // int j = expected[i] + 1; int j = ustr.moveIndex32(expected[i], 1); for (; j <= expected[i + 1]; j ++) { - if (bi->preceding(j) != expected[i]) { + int32_t expectedPreceding = expected[i]; + int32_t actualPreceding = bi->preceding(j); + if (actualPreceding != expectedPreceding) { printStringBreaks(ustr, expected, expectedcount); - test->errln("preceding(): Not expecting boundary at position %d", j); + test->errln("%s:%d preceding(%d): expected %d, got %d", + __FILE__, __LINE__, j, expectedPreceding, actualPreceding); return; } } @@ -3905,7 +3487,12 @@ void RBBITest::TestWordBoundary(void) Locale locale("en"); UErrorCode status = U_ZERO_ERROR; // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); - BreakIterator *bi = BreakIterator::createWordInstance(locale, status); + LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status); + if (U_FAILURE(status)) { + errcheckln(status, "%s:%d Creation of break iterator failed %s", + __FILE__, __LINE__, u_errorName(status)); + return; + } UChar str[50]; static const char *strlist[] = { @@ -3940,43 +3527,44 @@ void RBBITest::TestWordBoundary(void) "\\u003b\\u0027\\u00b7\\u47a3", }; int loop; - if (U_FAILURE(status)) { - errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); - return; - } for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) { - // printf("looping %d\n", loop); - u_unescape(strlist[loop], str, 20); + u_unescape(strlist[loop], str, UPRV_LENGTHOF(str)); UnicodeString ustr(str); int forward[50]; int count = 0; bi->setText(ustr); - int prev = 0; - int i; - for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { - forward[count ++] = i; - if (i > prev) { - int j; - for (j = prev + 1; j < i; j ++) { - if (bi->isBoundary(j)) { - printStringBreaks(ustr, forward, count); - errln("happy boundary test failed: expected %d not a boundary", - j); - return; - } + int prev = -1; + for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) { + ++count; + if (count >= UPRV_LENGTHOF(forward)) { + errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)", + __FILE__, __LINE__, loop, count, boundary); + return; + } + forward[count] = boundary; + if (boundary <= prev) { + errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n", + __FILE__, __LINE__, loop, prev, boundary); + break; + } + for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) { + if (bi->isBoundary(nonBoundary)) { + printStringBreaks(ustr, forward, count); + errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)", + __FILE__, __LINE__, loop, prev, nonBoundary, boundary); + return; } } - if (!bi->isBoundary(i)) { + if (!bi->isBoundary(boundary)) { printStringBreaks(ustr, forward, count); - errln("happy boundary test failed: expected %d a boundary", - i); + errln("%s:%d happy boundary test failed: expected %d a boundary", + __FILE__, __LINE__, boundary); return; } - prev = i; + prev = boundary; } } - delete bi; } void RBBITest::TestLineBreaks(void) @@ -4709,6 +4297,7 @@ void RBBITest::TestBug12932() { // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt // remain undevided by ICU char, word and line break. void RBBITest::TestEmoji() { +#if !UCONFIG_NO_REGULAR_EXPRESSIONS UErrorCode status = U_ZERO_ERROR; CharString testFileName; @@ -4789,15 +4378,255 @@ void RBBITest::TestEmoji() { } } } +#endif +} + + +// TestBug12519 - Correct handling of Locales by assignment / copy / clone + +void RBBITest::TestBug12519() { + UErrorCode status = U_ZERO_ERROR; + LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status)); + LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status)); + if (!assertSuccess(WHERE, status)) { + dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status)); + return; + } + assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status)); + + assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status)); + assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr); + + LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone()); + assertTrue(WHERE, *biEn == *cloneEn); + assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status)); + + LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone()); + assertTrue(WHERE, *biFr == *cloneFr); + assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status)); + + LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status)); + UnicodeString text("Hallo Welt"); + biDe->setText(text); + assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe); + *biDe = *biFr; + assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe); +} + +void RBBITest::TestBug12677() { + // Check that stripping of comments from rules for getRules() is not confused by + // the presence of '#' characters in the rules that do not introduce comments. + UnicodeString rules(u"!!forward; \n" + "$x = [ab#]; # a set with a # literal. \n" + " # .; # a comment that looks sort of like a rule. \n" + " '#' '?'; # a rule with a quoted # \n" + ); + + UErrorCode status = U_ZERO_ERROR; + UParseError pe; + RuleBasedBreakIterator bi(rules, pe, status); + assertSuccess(WHERE, status); + UnicodeString rtRules = bi.getRules(); + assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules); +} + + +void RBBITest::TestTableRedundancies() { + UErrorCode status = U_ZERO_ERROR; + + LocalPointer<RuleBasedBreakIterator> bi ( + (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status)); + assertSuccess(WHERE, status); + if (U_FAILURE(status)) return; + + RBBIDataWrapper *dw = bi->fData; + const RBBIStateTable *fwtbl = dw->fForwardTable; + int32_t numCharClasses = dw->fHeader->fCatCount; + // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates); + + // Check for duplicate columns (character categories) + + std::vector<UnicodeString> columns; + for (int32_t column = 0; column < numCharClasses; column++) { + UnicodeString s; + for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) { + RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r)); + s.append(row->fNextState[column]); + } + columns.push_back(s); + } + // Ignore column (char class) 0 while checking; it's special, and may have duplicates. + for (int c1=1; c1<numCharClasses; c1++) { + for (int c2 = c1+1; c2 < numCharClasses; c2++) { + if (columns.at(c1) == columns.at(c2)) { + errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2); + goto out; + } + } + } + out: + + // Check for duplicate states + std::vector<UnicodeString> rows; + for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) { + UnicodeString s; + RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r)); + assertTrue(WHERE, row->fAccepting >= -1); + s.append(row->fAccepting + 1); // values of -1 are expected. + s.append(row->fLookAhead); + s.append(row->fTagIdx); + for (int32_t column = 0; column < numCharClasses; column++) { + s.append(row->fNextState[column]); + } + rows.push_back(s); + } + for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) { + for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) { + if (rows.at(r1) == rows.at(r2)) { + errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2); + return; + } + } + } +} + +// Bug 13447: verify that getRuleStatus() returns the value corresponding to current(), +// even after next() has returned DONE. + +void RBBITest::TestBug13447() { + UErrorCode status = U_ZERO_ERROR; + LocalPointer<RuleBasedBreakIterator> bi( + (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status)); + assertSuccess(WHERE, status); + if (U_FAILURE(status)) return; + UnicodeString data(u"1234"); + bi->setText(data); + assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus()); + assertEquals(WHERE, 4, bi->next()); + assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus()); + assertEquals(WHERE, UBRK_DONE, bi->next()); + assertEquals(WHERE, 4, bi->current()); + assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus()); +} + +// TestReverse exercises both the synthesized safe reverse rules and the logic +// for filling the break iterator cache when starting from random positions +// in the text. +// +// It's a monkey test, working on random data, with the expected data obtained +// from forward iteration (no safe rules involved), comparing with results +// when indexing into the interior of the string (safe rules needed). + +void RBBITest::TestReverse() { + UErrorCode status = U_ZERO_ERROR; + + TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *) + BreakIterator::createCharacterInstance(Locale::getEnglish(), status))); + assertSuccess(WHERE, status, true); + status = U_ZERO_ERROR; + TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *) + BreakIterator::createWordInstance(Locale::getEnglish(), status))); + assertSuccess(WHERE, status, true); + status = U_ZERO_ERROR; + TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *) + BreakIterator::createLineInstance(Locale::getEnglish(), status))); + assertSuccess(WHERE, status, true); + status = U_ZERO_ERROR; + TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *) + BreakIterator::createSentenceInstance(Locale::getEnglish(), status))); + assertSuccess(WHERE, status, true); +} + +void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) { + if (!bi) { + return; + } + + // From the mapping trie in the break iterator's internal data, create a + // vector of UnicodeStrings, one for each character category, containing + // all of the code points that map to that category. Unicode planes 0 and 1 only, + // to avoid an execess of unassigned code points. + + RBBIDataWrapper *data = bi->fData; + int32_t categoryCount = data->fHeader->fCatCount; + UTrie2 *trie = data->fTrie; + + std::vector<UnicodeString> strings(categoryCount, UnicodeString()); + for (int cp=0; cp<0x1fff0; ++cp) { + int cat = utrie2_get32(trie, cp); + cat &= ~0x4000; // And off the dictionary bit from the category. + assertTrue(WHERE, cat < categoryCount && cat >= 0); + if (cat < 0 || cat >= categoryCount) return; + strings[cat].append(cp); + } + + icu_rand randomGen; + const int testStringLength = 10000; + UnicodeString testString; + + for (int i=0; i<testStringLength; ++i) { + int charClass = randomGen() % categoryCount; + if (strings[charClass].length() > 0) { + int cp = strings[charClass].char32At(randomGen() % strings[charClass].length()); + testString.append(cp); + } + } + + typedef std::pair<UBool, int32_t> Result; + std::vector<Result> expectedResults; + bi->setText(testString); + for (int i=0; i<testString.length(); ++i) { + bool isboundary = bi->isBoundary(i); + int ruleStatus = bi->getRuleStatus(); + expectedResults.push_back(std::make_pair(isboundary, ruleStatus)); + } + + for (int i=testString.length()-1; i>=0; --i) { + bi->setText(testString); // clears the internal break cache + Result expected = expectedResults[i]; + assertEquals(WHERE, expected.first, bi->isBoundary(i)); + assertEquals(WHERE, expected.second, bi->getRuleStatus()); + } } +// Ticket 13692 - finding word boundaries in very large numbers or words could +// be very time consuming. When the problem was present, this void test +// would run more than fifteen minutes, which is to say, the failure was noticeale. + +void RBBITest::TestBug13692() { + UErrorCode status = U_ZERO_ERROR; + LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *) + BreakIterator::createWordInstance(Locale::getEnglish(), status), status); + if (!assertSuccess(WHERE, status, true)) { + return; + } + constexpr int32_t LENGTH = 1000000; + UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH); + for (int i=0; i<20; i+=2) { + longNumber.setCharAt(i, u' '); + } + bi->setText(longNumber); + assertFalse(WHERE, bi->isBoundary(LENGTH-5)); + assertSuccess(WHERE, status); +} + // // TestDebug - A place-holder test for debugging purposes. // For putting in fragments of other tests that can be invoked // for tracing without a lot of unwanted extra stuff happening. // void RBBITest::TestDebug(void) { + UErrorCode status = U_ZERO_ERROR; + LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *) + BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status); + if (!assertSuccess(WHERE, status, true)) { + return; + } + const UnicodeString &rules = bi->getRules(); + UParseError pe; + LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status)); + assertSuccess(WHERE, status); } void RBBITest::TestProperties() { @@ -4811,4 +4640,4 @@ void RBBITest::TestProperties() { } } -#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ +#endif // #if !UCONFIG_NO_BREAK_ITERATION