X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b75a7d8f3b4adbae880cab104ce2c6a50eee4db2..b25be06635768807f8f693286fa73bb2297bb06c:/icuSources/test/intltest/rbbitst.cpp diff --git a/icuSources/test/intltest/rbbitst.cpp b/icuSources/test/intltest/rbbitst.cpp index a0deb0cb..df5caf8e 100644 --- a/icuSources/test/intltest/rbbitst.cpp +++ b/icuSources/test/intltest/rbbitst.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1999-2003, International Business Machines Corporation and + * Copyright (c) 1999-2011, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ /************************************************************************ @@ -9,6 +9,8 @@ * 01/12/2000 Madhu Updated for changed API and added new tests ************************************************************************/ +#include // for 'typeid' to work + #include "unicode/utypes.h" #if !UCONFIG_NO_BREAK_ITERATION @@ -22,16 +24,131 @@ #include "unicode/schriter.h" #include "unicode/uniset.h" #include "unicode/regex.h" // TODO: make conditional on regexp being built. 
- +#include "unicode/ustring.h" +#include "unicode/utext.h" #include "intltest.h" #include "rbbitst.h" #include #include "uvector.h" #include "uvectr32.h" +#include "triedict.h" #include #include #include +#define TEST_ASSERT(x) {if (!(x)) { \ + errln("Failure in file %s, line %d", __FILE__, __LINE__);}} + +#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ + errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} + + +//--------------------------------------------- +// runIndexedTest +//--------------------------------------------- + +void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) +{ + if (exec) logln("TestSuite RuleBasedBreakIterator: "); + + switch (index) { +#if !UCONFIG_NO_FILE_IO + case 0: name = "TestBug4153072"; + if(exec) TestBug4153072(); break; +#else + case 0: name = "skip"; + break; +#endif + + case 1: name = "TestJapaneseLineBreak"; + if(exec) TestJapaneseLineBreak(); break; + case 2: name = "TestStatusReturn"; + if(exec) TestStatusReturn(); break; + +#if !UCONFIG_NO_FILE_IO + case 3: name = "TestUnicodeFiles"; + if(exec) TestUnicodeFiles(); break; + case 4: name = "TestEmptyString"; + if(exec) TestEmptyString(); break; +#else + case 3: case 4: name = "skip"; + break; +#endif + + case 5: name = "TestGetAvailableLocales"; + if(exec) TestGetAvailableLocales(); break; + + case 6: name = "TestGetDisplayName"; + if(exec) TestGetDisplayName(); break; + +#if !UCONFIG_NO_FILE_IO + case 7: name = "TestEndBehaviour"; + if(exec) TestEndBehaviour(); break; + case 8: name = "TestMixedThaiLineBreak"; + if(exec) TestMixedThaiLineBreak(); break; + case 9: name = "TestThaiLineBreak"; + if(exec) TestThaiLineBreak(); break; + case 10: name = "TestMaiyamok"; + if(exec) TestMaiyamok(); break; + case 11: name = "TestWordBreaks"; + if(exec) TestWordBreaks(); break; + case 12: name = "TestWordBoundary"; + if(exec) TestWordBoundary(); break; + case 13: name = 
"TestLineBreaks"; + if(exec) TestLineBreaks(); break; + case 14: name = "TestSentBreaks"; + if(exec) TestSentBreaks(); break; + case 15: name = "TestExtended"; + if(exec) TestExtended(); break; +#else + case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip"; + break; +#endif + + case 16: + if(exec) { + #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO + name = "TestMonkey"; + TestMonkey(params); + #else + name = "skip"; + #endif + } + break; + +#if !UCONFIG_NO_FILE_IO + case 17: name = "TestBug3818"; + if(exec) TestBug3818(); break; + case 18: name = "TestJapaneseWordBreak"; + if(exec) TestJapaneseWordBreak(); break; +#else + case 17: case 18: name = "skip"; + break; +#endif + + case 19: name = "TestDebug"; + if(exec) TestDebug(); break; + case 20: name = "TestTrieDict"; + if(exec) TestTrieDict(); break; + +#if !UCONFIG_NO_FILE_IO + case 21: name = "TestBug5775"; + if (exec) TestBug5775(); break; + case 22: name = "TestThaiBreaks"; + if (exec) TestThaiBreaks(); break; + case 23: name = "TestTailoredBreaks"; + if (exec) TestTailoredBreaks(); break; +#else + case 21: case 22: case 23: name = "skip"; + break; +#endif + case 24: name = "TestDictRules"; + if (exec) TestDictRules(); break; + case 25: name = "TestBug5532"; + if (exec) TestBug5532(); break; + default: name = ""; break; //needed to end loop + } +} //--------------------------------------------------------------------------- @@ -71,7 +188,7 @@ BITestData::BITestData(UErrorCode &status) : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), fActualTags(status) { -}; +} // // addDataChunk. Add a section (non-breaking) piece if data to the test data. 
@@ -91,7 +208,7 @@ void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UE fExpectedBreakPositions.addElement(fDataToBreak.length(), status); fExpectedTags.addElement(tag, status); fLineNum.addElement(lineNum, status); -}; +} // @@ -162,9 +279,9 @@ void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); } if (actual < expected) { - test->errln("%s unexpected break at offset %d in test item from line %d", heading, o, line); + test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); } else { - test->errln("%s Failed to find break at end of item from line %d", heading, line); + test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected); } } @@ -252,13 +369,13 @@ static const int T_IDEO = 400; // //----------------------------------------------------------------------------------- void RBBITest::TestStatusReturn() { - UnicodeString rulesString1 = "$Letters = [:L:];\n" + UnicodeString rulesString1("$Letters = [:L:];\n" "$Numbers = [:N:];\n" "$Letters+{1};\n" "$Numbers+{2};\n" "Help\\ {4}/me\\!;\n" "[^$Letters $Numbers];\n" - "!.*;\n"; + "!.*;\n", -1, US_INV); UnicodeString testString1 = "abc123..abc Help me Help me!"; // 01234567890123456789012345678 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; @@ -269,7 +386,7 @@ void RBBITest::TestStatusReturn() { RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); if(U_FAILURE(status)) { - errln("FAIL : in construction"); + dataerrln("FAIL : in construction - %s", u_errorName(status)); } else { int32_t pos; int32_t i = 0; @@ -292,6 +409,51 @@ void RBBITest::TestStatusReturn() { } +static void printStringBreaks(UnicodeString ustr, int expected[], + int expectedcount) +{ + UErrorCode status 
= U_ZERO_ERROR; + char name[100]; + printf("code alpha extend alphanum type word sent line name\n"); + int j; + for (j = 0; j < ustr.length(); j ++) { + if (expectedcount > 0) { + int k; + for (k = 0; k < expectedcount; k ++) { + if (j == expected[k]) { + printf("------------------------------------------------ %d\n", + j); + } + } + } + UChar32 c = ustr.char32At(j); + if (c > 0xffff) { + j ++; + } + u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); + printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c, + u_isUAlphabetic(c), + u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), + u_isalnum(c), + u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, + u_charType(c), + U_SHORT_PROPERTY_NAME), + u_getPropertyValueName(UCHAR_WORD_BREAK, + u_getIntPropertyValue(c, + UCHAR_WORD_BREAK), + U_SHORT_PROPERTY_NAME), + u_getPropertyValueName(UCHAR_SENTENCE_BREAK, + u_getIntPropertyValue(c, + UCHAR_SENTENCE_BREAK), + U_SHORT_PROPERTY_NAME), + u_getPropertyValueName(UCHAR_LINE_BREAK, + u_getIntPropertyValue(c, + UCHAR_LINE_BREAK), + U_SHORT_PROPERTY_NAME), + name); + } +} + void RBBITest::TestThaiLineBreak() { UErrorCode status = U_ZERO_ERROR; BITestData thaiLineSelection(status); @@ -328,7 +490,7 @@ void RBBITest::TestThaiLineBreak() { Locale("th"), status); if (U_FAILURE(status)) { - errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n"); + errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. 
- %s", u_errorName(status)); return; } @@ -345,66 +507,34 @@ void RBBITest::TestMixedThaiLineBreak() ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data - // Arabic numerals should always be separated from surrounding Thai text -/* - ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status); - thaiLineSelection->addElement("39"); - ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status); - - // words in non-Thai scripts should always be separated from surrounding Thai text - ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e14", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e2d\\u0e1a", 0, status); - thaiLineSelection->addElement("Java"); - ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e19", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e04\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21 ", 0, status); - - // Thai numerals should always be separated from the text surrounding them - ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e53\\u0e59", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status); - - // Thai text should interact correctly with punctuation and symbols - ADD_DATACHUNK(thaiLineSelection, 
"\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21", 0, status); -// ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28", 0, status); -// ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e17\\u0e22)", 0, status); -ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u0e44\\u0e17\\u0e22)", 0, status); -// I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary - ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e33\\u0e01\\u0e31\\u0e14", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e1b\\u0e34\\u0e14", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e15\\u0e31\\u0e27\"", 0, status); -*/ - - // The Unicode Linebreak TR says do not break before or after quotes. - // So this test is changed ot not break around the quote. - // TODO: should Thai break around the around the quotes, like the original behavior here? -// ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\"", 0, status); -// ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\"" - "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34.", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e22.", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e35\\u0e49", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e32\\u0e04\\u0e32", 0, status); - ADD_DATACHUNK(thaiLineSelection, "$200", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e48\\u0e32", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19 ", 0, status); - ADD_DATACHUNK(thaiLineSelection, 
"(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\").", 0, status); + // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters + // start + + ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status); + ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status); + ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status); + ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status); + + // @suwit - end of changes + RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status); if (U_FAILURE(status)) { - errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n"); + errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. 
- %s", u_errorName(status)); return; } @@ -425,9 +555,11 @@ void RBBITest::TestMaiyamok() ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status); ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status); ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status); ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status); - ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status); ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status); RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance( @@ -435,139 +567,286 @@ void RBBITest::TestMaiyamok() if (U_FAILURE(status)) { - errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n"); + errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. 
- %s", u_errorName(status)); return; } generalIteratorTest(*e, thaiLineSelection); delete e; } -void RBBITest::TestThaiWordBreak() { - UErrorCode status = U_ZERO_ERROR; - BITestData thaiWordSelection(status); - - ADD_DATACHUNK(thaiWordSelection, NULL, 0, status); // Break at start of data - ADD_DATACHUNK(thaiWordSelection, "\\u0E1A\\u0E17", 0, status); //2 - ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E35\\u0E48", 0, status); //5 - ADD_DATACHUNK(thaiWordSelection, "\\u0E51", 0, status); //6 - ADD_DATACHUNK(thaiWordSelection, "\\u0E1E\\u0E32\\u0E22\\u0E38", 0, status); //10 - ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E0B\\u0E42\\u0E04\\u0E25\\u0E19", 0, status); //16 - ADD_DATACHUNK(thaiWordSelection, "\\u000D\\u000A", 0, status); //18 - - // This is the correct result - //ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14\\u0E42\\u0E23\\u0E18\\u0E35", 0, status); //24 - //ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29 - - // and this is what the dictionary does... - ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14", 0, status); // 20 - ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E23\\u0E18\\u0E35\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29 - - ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E22\\u0E39\\u0E48", 0, status); //33 - // This is the correct result - //ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21", 0, status); //37 - //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41 - // and this is what the dictionary does - ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41 +void RBBITest::TestBug3818() { + UErrorCode status = U_ZERO_ERROR; - ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E38\\u0E48\\u0E07", 0, status); //45 - ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E2B\\u0E0D\\u0E48", 0, status); //49 - ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E19", 0, status); //51 + // Four Thai words... 
+ static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, + 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; + UnicodeString thaiStr(thaiWordData); - // This is the correct result - //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19\\u0E0B\\u0E31\\u0E2A", 0, status); //57 - //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E31\\u0E1A", 0, status); //60 + RuleBasedBreakIterator* bi = + (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status); + if (U_FAILURE(status) || bi == NULL) { + errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); + return; + } + bi->setText(thaiStr); - // and this is what the dictionary does - ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19", 0, status); // 54 - ADD_DATACHUNK(thaiWordSelection, "\\u0E0B\\u0E31\\u0E2A\\u0E01\\u0E31\\u0E1A", 0, status); //60 + int32_t startOfSecondWord = bi->following(1); + if (startOfSecondWord != 4) { + errln("Fail at file %s, line %d expected start of word at 4, got %d", + __FILE__, __LINE__, startOfSecondWord); + } + startOfSecondWord = bi->following(0); + if (startOfSecondWord != 4) { + errln("Fail at file %s, line %d expected start of word at 4, got %d", + __FILE__, __LINE__, startOfSecondWord); + } + delete bi; +} - ADD_DATACHUNK(thaiWordSelection, "\\u0E25\\u0E38\\u0E07", 0, status); //63 - // This is the correct result - //ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E\\u0E19\\u0E23\\u0E35", 0, status); //68 - //ADD_DATACHUNK(thaiWordSelection, "\\u0E0A\\u0E32\\u0E27", 0, status); //71 - //ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E23\\u0E48", 0, status); //74 - //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E25\\u0E30", 0, status); //77 +void RBBITest::TestJapaneseWordBreak() { + UErrorCode status = U_ZERO_ERROR; + BITestData japaneseWordSelection(status); - // and this is what the dictionary does - ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E", 0, 
status); // 65 - ADD_DATACHUNK(thaiWordSelection, "\\u0E19\\u0E23\\u0E35\\u0E0A\\u0E32\\u0E27\\u0E44\\u0E23\\u0E48\\u0E41\\u0E25\\u0E30", 0, status); //77 + ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data + ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2 + ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5 + ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7 + ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10 + ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11 + ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance( - Locale("th"), status); + Locale("ja"), status); if (U_FAILURE(status)) { - errln("Failed to create the BreakIterator for Thai locale in TestThaiWordBreak.\n"); + errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n"); return; } - generalIteratorTest(*e, thaiWordSelection); + generalIteratorTest(*e, japaneseWordSelection); delete e; } +void RBBITest::TestTrieDict() { + UErrorCode status = U_ZERO_ERROR; -//--------------------------------------------- -// runIndexedTest -//--------------------------------------------- + // + // Open and read the test data file. + // + const char *testDataDirectory = IntlTest::getSourceTestData(status); + char testFileName[1000]; + if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) { + errln("Can't open test data. 
Path too long."); + return; + } + strcpy(testFileName, testDataDirectory); + strcat(testFileName, "riwords.txt"); + + // Items needing deleting at the end + MutableTrieDictionary *mutableDict = NULL; + CompactTrieDictionary *compactDict = NULL; + UnicodeSet *breaks = NULL; + UChar *testFile = NULL; + StringEnumeration *enumer1 = NULL; + StringEnumeration *enumer2 = NULL; + MutableTrieDictionary *mutable2 = NULL; + StringEnumeration *cloneEnum = NULL; + CompactTrieDictionary *compact2 = NULL; + + + const UnicodeString *originalWord = NULL; + const UnicodeString *cloneWord = NULL; + UChar *current; + UChar *word; + UChar uc; + int32_t wordLen; + int32_t wordCount; + int32_t testCount; -void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) -{ - if (exec) logln("TestSuite RuleBasedBreakIterator: "); - switch (index) { + int len; + testFile = ReadAndConvertFile(testFileName, len, NULL, status); + if (U_FAILURE(status)) { + goto cleanup; /* something went wrong, error already output */ + } - case 0: name = "TestExtended"; - if(exec) TestExtended(); break; - case 1: name = "TestJapaneseLineBrea"; - if(exec) TestJapaneseLineBreak(); break; - case 2: name = "TestStatusReturn"; - if(exec) TestStatusReturn(); break; + mutableDict = new MutableTrieDictionary(0x0E1C, status); + if (U_FAILURE(status)) { + errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status)); + goto cleanup; + } - case 3: name = "TestLineBreakData"; - if(exec) TestLineBreakData(); break; - case 4: name = "TestSentenceInvariants"; - if(exec) TestSentenceInvariants(); break; - case 5: name = "TestCharacterInvariants"; - if(exec) TestCharacterInvariants(); break; - case 6: name = "TestWordInvariants"; - if(exec) TestWordInvariants(); break; + breaks = new UnicodeSet; + breaks->add(0x000A); // Line Feed + breaks->add(0x000D); // Carriage Return + breaks->add(0x2028); // Line Separator + breaks->add(0x2029); // Paragraph Separator + + // Now add each 
non-comment line of the file as a word. + current = testFile; + word = current; + uc = *current++; + wordLen = 0; + wordCount = 0; + + while (uc) { + if (uc == 0x0023) { // #comment line, skip + while (uc && !breaks->contains(uc)) { + uc = *current++; + } + } + else while (uc && !breaks->contains(uc)) { + ++wordLen; + uc = *current++; + } + if (wordLen > 0) { + mutableDict->addWord(word, wordLen, status); + if (U_FAILURE(status)) { + errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status)); + goto cleanup; + } + wordCount += 1; + } - case 7: name = "TestEmptyString"; - if(exec) TestEmptyString(); break; + // Find beginning of next line + while (uc && breaks->contains(uc)) { + uc = *current++; + } + word = current-1; + wordLen = 0; + } - case 8: name = "TestGetAvailableLocales"; - if(exec) TestGetAvailableLocales(); break; + if (wordCount < 50) { + errln("Word count (%d) unreasonably small\n", wordCount); + goto cleanup; + } - case 9: name = "TestGetDisplayName"; - if(exec) TestGetDisplayName(); break; + enumer1 = mutableDict->openWords(status); + if (U_FAILURE(status)) { + errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status)); + goto cleanup; + } - case 10: name = "TestEndBehaviour"; - if(exec) TestEndBehaviour(); break; - case 11: name = "TestBug4153072"; - if(exec) TestBug4153072(); break; - case 12: name = "TestMonkey"; - if(exec) { -#if !UCONFIG_NO_REGULAR_EXPRESSIONS - TestMonkey(params); -#else - logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)"); -#endif - } - break; + testCount = 0; + if (wordCount != (testCount = enumer1->count(status))) { + errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", + testCount, wordCount, u_errorName(status)); + goto cleanup; + } - case 13: name = "TestThaiLineBreak"; - if(exec) TestThaiLineBreak(); break; - case 14: name = "TestMixedThaiLineBreak"; - if(exec) TestMixedThaiLineBreak(); break; - case 15: name = 
"TestMaiyamok"; - if(exec) TestMaiyamok(); break; - case 16: name = "TestThaiWordBreak"; - if(exec) TestThaiWordBreak(); break; + // Now compact it + compactDict = new CompactTrieDictionary(*mutableDict, status); + if (U_FAILURE(status)) { + errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status)); + goto cleanup; + } + enumer2 = compactDict->openWords(status); + if (U_FAILURE(status)) { + errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status)); + goto cleanup; + } - default: name = ""; break; //needed to end loop + if (wordCount != (testCount = enumer2->count(status))) { + errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", + testCount, wordCount, u_errorName(status)); + goto cleanup; + } + + if (typeid(*enumer1) == typeid(*enumer2)) { + errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same"); + } + delete enumer1; + enumer1 = NULL; + delete enumer2; + enumer2 = NULL; + + // Now un-compact it + mutable2 = compactDict->cloneMutable(status); + if (U_FAILURE(status)) { + errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status)); + goto cleanup; + } + + cloneEnum = mutable2->openWords(status); + if (U_FAILURE(status)) { + errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status)); + goto cleanup; + } + + if (wordCount != (testCount = cloneEnum->count(status))) { + errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", + testCount, wordCount, u_errorName(status)); + goto cleanup; + } + + // Compact original dictionary to clone. 
Note that we can only compare the same kind of + // dictionary as the order of the enumerators is not guaranteed to be the same between + // different kinds + enumer1 = mutableDict->openWords(status); + if (U_FAILURE(status)) { + errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status)); + goto cleanup; + } + + originalWord = enumer1->snext(status); + cloneWord = cloneEnum->snext(status); + while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { + if (*originalWord != *cloneWord) { + errln("Original and cloned MutableTrieDictionary word mismatch\n"); + goto cleanup; + } + originalWord = enumer1->snext(status); + cloneWord = cloneEnum->snext(status); + } + + if (U_FAILURE(status)) { + errln("Enumeration failed: %s\n", u_errorName(status)); + goto cleanup; + } + + if (originalWord != cloneWord) { + errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n"); + goto cleanup; + } + + // Test the data copying constructor for CompactTrieDict, and the data access APIs. + compact2 = new CompactTrieDictionary(compactDict->data(), status); + if (U_FAILURE(status)) { + errln("CompactTrieDictionary(const void *,...) 
failed\n"); + goto cleanup; + } + + if (compact2->dataSize() == 0) { + errln("CompactTrieDictionary->dataSize() == 0\n"); + goto cleanup; + } + + // Now count the words via the second dictionary + delete enumer1; + enumer1 = compact2->openWords(status); + if (U_FAILURE(status)) { + errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status)); + goto cleanup; + } + + if (wordCount != (testCount = enumer1->count(status))) { + errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n", + testCount, wordCount, u_errorName(status)); + goto cleanup; } + +cleanup: + delete compactDict; + delete mutableDict; + delete breaks; + delete[] testFile; + delete enumer1; + delete mutable2; + delete cloneEnum; + delete compact2; } @@ -633,7 +912,7 @@ void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) int32_t lastP = 0x7ffffffe; int32_t tag; - logln("Test first and next"); + logln("Test last and previous"); bi.setText(td.fDataToBreak); td.clearResults(); @@ -788,8 +1067,16 @@ void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestD count++; offset = iterator.next(); - if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) - errln("operator== failed: Two unequal iterators compared equal."); + if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { + errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); + if (count > 10000 || offset == -1) { + errln("operator== failed too many times. 
Stopping test."); + if (offset == -1) { + errln("Does (RuleBasedBreakIterator::DONE == -1)?"); + } + return; + } + } } } while (offset != RuleBasedBreakIterator::DONE); @@ -813,184 +1100,6 @@ void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestD } - -//-------------------------------------------------------------------------------------------- -// -// Break Iterator Invariants Tests -// -//-------------------------------------------------------------------------------------------- - -void RBBITest::TestCharacterInvariants() -{ - UErrorCode status = U_ZERO_ERROR; - BreakIterator *e = BreakIterator::createCharacterInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for default locale in TestCharacterInvariants.\n"); - return; - } - UnicodeString s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa"); - doBreakInvariantTest(*e, s); - s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa"); - doOtherInvariantTest(*e, s); - delete e; -} - - -void RBBITest::TestWordInvariants() -{ - UErrorCode status = U_ZERO_ERROR; - BreakIterator *e = BreakIterator::createWordInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for default locale in TestWordInvariants.\n"); - return; - } - UnicodeString s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02"); - doBreakInvariantTest(*e, s); - s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02"); - doOtherInvariantTest(*e, s); - delete e; -} - - -void RBBITest::TestSentenceInvariants() -{ - UErrorCode status = U_ZERO_ERROR; - BreakIterator *e = BreakIterator::createSentenceInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) 
- { - errln("Failed to create the BreakIterator for default locale in TestSentenceInvariant.\n"); - return; - } - UnicodeString s = *cannedTestChars + CharsToUnicodeString(".,\\u3001\\u3002\\u3041\\u3042\\u3043\\ufeff"); - doOtherInvariantTest(*e, s); - delete e; -} - - - - -void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars) -{ - UnicodeString work("aaa"); - int32_t errCount = 0, testCharsLen = testChars.length(), breaksLen; - - // a break should always occur after CR (unless followed by LF), LF, PS, and LS - UnicodeString breaks = CharsToUnicodeString("\r\n\\u2029\\u2028"); - int32_t i, j; - - breaksLen = breaks.length(); - for (i = 0; i < breaksLen; i++) { - UChar c1 = breaks[i]; - work.setCharAt(1, c1); - for (j = 0; j < testCharsLen; j++) { - UChar c0 = testChars[j]; - work.setCharAt(0, c0); - for (int k = 0; k < testCharsLen; k++) { - UChar c2 = testChars[k]; - work.setCharAt(2, c2); - - // if a cr is followed by lf, ps, ls or etx, don't do the check (that's - // not supposed to work) - if (c1 == '\r' && (c2 == '\n' || c2 == 0x2029 - || c2 == 0x2028 || c2 == 0x0003)) - continue; - - if (u_charType(c1) == U_CONTROL_CHAR && - (u_charType(c2) == U_NON_SPACING_MARK || - u_charType(c2) == U_ENCLOSING_MARK || - u_charType(c2) == U_COMBINING_SPACING_MARK) - ) { - // Combining marks don't combine with controls. - // TODO: enhance test to verify that the break actually occurs, - // not just ignore the case. 
- continue; - } - - - tb.setText(work); - UBool seen2 = FALSE; - for (int l = tb.first(); l != BreakIterator::DONE; l = tb.next()) { - if (l == 2) { - seen2 = TRUE; - break; - } - } - if (!seen2) { - errln("No Break between \\U%04x and \\U%04x", c1, c2); - errCount++; - if (errCount >= 75) - return; - } - } - } - } -} - - - -void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars) -{ - UnicodeString work("a\r\na"); - int32_t errCount = 0, testCharsLen = testChars.length(); - int32_t i, j; - int8_t type; - - // a break should never occur between CR and LF - for (i = 0; i < testCharsLen; i++) { - work.setCharAt(0, testChars[i]); - for (j = 0; j < testCharsLen; j++) { - work.setCharAt(3, testChars[j]); - tb.setText(work); - for (int32_t k = tb.first(); k != BreakIterator::DONE; k = tb.next()) - if (k == 2) { - errln("Break between CR and LF in string U\\%04x U\\%04x U\\%04x U\\%04x", - work[0], work[1], work[2], work[3]); - errCount++; - if (errCount >= 75) - return; - } - } - } - - // a break should never occur before a non-spacing mark, unless the preceding - // character is CR, LF, PS, or LS - // Or the general category == Control. 
- work.remove(); - work += "aaaa"; - for (i = 0; i < testCharsLen; i++) { - UChar c1 = testChars[i]; - if (c1 == '\n' || c1 == '\r' || c1 == 0x2029 || c1 == 0x2028 || c1 == 0x0003 || - u_charType(c1) == U_CONTROL_CHAR || u_charType(c1) == U_FORMAT_CHAR) { - continue; - } - work.setCharAt(1, c1); - for (j = 0; j < testCharsLen; j++) { - UChar c2 = testChars[j]; - type = u_charType(c2); - if ((type != U_NON_SPACING_MARK) && - (type != U_ENCLOSING_MARK)) { - continue; - } - work.setCharAt(2, c2); - tb.setText(work); - for (int k = tb.first(); k != BreakIterator::DONE; k = tb.next()) - if (k == 2) { - //errln("Break between U+" + UCharToUnicodeString(work[1]) - // + " and U+" + UCharToUnicodeString(work[2])); - errln("Unexpected Break between %6x and %6x", c1, c2); - errCount++; - if (errCount >= 75) - return; - } - } - } -} - - - - //--------------------------------------------- // // other tests @@ -1006,7 +1115,7 @@ void RBBITest::TestEmptyString() RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); if (U_FAILURE(status)) { - errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n"); + errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); return; } generalIteratorTest(*bi, x); @@ -1019,9 +1128,10 @@ void RBBITest::TestGetAvailableLocales() const Locale* locList = BreakIterator::getAvailableLocales(locCount); if (locCount == 0) - errln("getAvailableLocales() returned an empty list!"); + dataerrln("getAvailableLocales() returned an empty list!"); // Just make sure that it's returning good memory. 
- for (int32_t i = 0; i < locCount; ++i) { + int32_t i; + for (i = 0; i < locCount; ++i) { logln(locList[i].getName()); } } @@ -1033,12 +1143,12 @@ void RBBITest::TestGetDisplayName() BreakIterator::getDisplayName(Locale::getUS(), result); if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") - errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" + dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" + result); BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); if (result != "French (France)") - errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" + dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" + result); } /** @@ -1052,7 +1162,7 @@ void RBBITest::TestEndBehaviour() BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); if (U_FAILURE(status)) { - errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n"); + errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. 
- %s", u_errorName(status)); return; } wb->setText(testString); @@ -1073,20 +1183,25 @@ void RBBITest::TestBug4153072() { BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); if (U_FAILURE(status)) { - errln("Failed to create the BreakIterator for default locale in TestBug4153072\n"); + errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); return; } UnicodeString str("...Hello, World!..."); int32_t begin = 3; int32_t end = str.length() - 3; - UBool dummy; + UBool onBoundary; StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); iter->adoptText(textIterator); - for (int index = -1; index < begin + 1; ++index) { - dummy = iter->isBoundary(index); - if (index < begin && dummy == TRUE) { - errln((UnicodeString)"Didn't handle preceeding correctly with offset = " + index + + int index; + // Note: with the switch to UText, there is no way to restrict the + // iteration range to begin at an index other than zero. + // String character iterators created with a non-zero bound are + // treated by RBBI as being empty. + for (index = -1; index < begin + 1; ++index) { + onBoundary = iter->isBoundary(index); + if (index == 0? !onBoundary : onBoundary) { + errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + " and begin index = " + begin); } } @@ -1094,13 +1209,53 @@ void RBBITest::TestBug4153072() { } -/** - * Test Japanese Line Break - * @bug 4095322 - */ -void RBBITest::TestJapaneseLineBreak() -{ - // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count +// +// Test for problem reported by Ashok Matoria on 9 July 2007 +// One.Two. +// +// Sentence break at start (0) and then on calling next() it breaks at +// 'T' of "Two". Now, at this point if I do next() and +// then previous(), it breaks at instead of 'T' of "Two". 
+// +void RBBITest::TestBug5775() { + UErrorCode status = U_ZERO_ERROR; + BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); + TEST_ASSERT_SUCCESS(status); + if (U_FAILURE(status)) { + return; + } +// Check for status first for better handling of no data errors. + TEST_ASSERT(bi != NULL); + if (bi == NULL) { + return; + } + + UnicodeString s("One.\\u00ad Two.", -1, US_INV); + // 01234 56789 + s = s.unescape(); + bi->setText(s); + int pos = bi->next(); + TEST_ASSERT(pos == 6); + pos = bi->next(); + TEST_ASSERT(pos == 10); + pos = bi->previous(); + TEST_ASSERT(pos == 6); + delete bi; +} + + + +/** + * Test Japanese Line Break + * @bug 4095322 + */ +void RBBITest::TestJapaneseLineBreak() +{ +#if 0 + // Test needs updating some more... Dump it for now. + + + // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count // as opening and closing punctuation for line breaking. // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars // from these tests. 6-13-2002 @@ -1157,6 +1312,7 @@ void RBBITest::TestJapaneseLineBreak() + "' (" + ((int)(followingChars[i])) + ")"); } delete iter; +#endif } @@ -1179,6 +1335,10 @@ void RBBITest::executeTest(TestParams *t) { int32_t prevBP; int32_t i; + if (t->bi == NULL) { + return; + } + t->bi->setText(t->dataToBreak); // // Run the iterator forward @@ -1196,14 +1356,18 @@ void RBBITest::executeTest(TestParams *t) { // and this one. for (i=prevBP+1; iexpectedBreaks->elementAti(i) != 0) { - errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", + int expected[] = {0, i}; + printStringBreaks(t->dataToBreak, expected, 2); + errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); } } // Check that the break we did find was expected if (t->expectedBreaks->elementAti(bp) == 0) { - errln("Forward Itertion, break found, but not expected. 
Pos=%4d File line,col= %4d,%4d", + int expected[] = {0, bp}; + printStringBreaks(t->dataToBreak, expected, 2); + errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp)); } else { // The break was expected. @@ -1212,11 +1376,12 @@ void RBBITest::executeTest(TestParams *t) { if (expectedTagVal == -1) { expectedTagVal = 0; } + int32_t line = t->srcLine->elementAti(bp); int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); if (rs != expectedTagVal) { - errln("Incorrect status for break. Pos=%4d File line,col= %4d,%4d.\n" + errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" " Actual, Expected status = %4d, %4d", - bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal); + bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); } } @@ -1227,7 +1392,7 @@ void RBBITest::executeTest(TestParams *t) { // Verify that there were no missed expected breaks after the last one found for (i=prevBP+1; iexpectedBreaks->size(); i++) { if (t->expectedBreaks->elementAti(i) != 0) { - errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", + errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", i, t->srcLine->elementAti(i), t->srcCol->elementAti(i)); } } @@ -1264,11 +1429,12 @@ void RBBITest::executeTest(TestParams *t) { if (expectedTagVal == -1) { expectedTagVal = 0; } + int line = t->srcLine->elementAti(bp); int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus(); if (rs != expectedTagVal) { - errln("Incorrect status for break. Pos=%4d File line,col= %4d,%4d.\n" + errln("Incorrect status for reverse break. 
Pos=%4d File line,col= %4d,%4d.\n" " Actual, Expected status = %4d, %4d", - bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal); + bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal); } } @@ -1286,8 +1452,9 @@ void RBBITest::executeTest(TestParams *t) { void RBBITest::TestExtended() { +#if !UCONFIG_NO_REGULAR_EXPRESSIONS UErrorCode status = U_ZERO_ERROR; - Locale locale = Locale::getDefault(); + Locale locale(""); UnicodeString rules; TestParams tp; @@ -1296,29 +1463,30 @@ void RBBITest::TestExtended() { tp.srcLine = new UVector32(status); tp.srcCol = new UVector32(status); + RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE(""), 0, status); + if (U_FAILURE(status)) { + dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); + } + // // Open and read the test data file. // - const char *testDataDirectory = loadTestData(status); + const char *testDataDirectory = IntlTest::getSourceTestData(status); char testFileName[1000]; - if (strlen(testDataDirectory) >= sizeof(testFileName)) { + if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { errln("Can't open test data. Path too long."); return; } strcpy(testFileName, testDataDirectory); - char *p = strstr(testFileName, "/out/testdata"); - if (p == NULL) { - p = strstr(testFileName, "\\out\\testdata"); - if (p == NULL) { - errln("Can't open test data. 
Bad test data directory path.."); - return; - } - } - strcpy(p+1, "rbbitst.txt"); + strcat(testFileName, "rbbitst.txt"); int len; - UChar *testFile = ReadAndConvertFile(testFileName, len, status); + UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); + if (U_FAILURE(status)) { + return; /* something went wrong, error already output */ + } + @@ -1330,7 +1498,6 @@ void RBBITest::TestExtended() { enum EParseState{ PARSE_COMMENT, PARSE_TAG, - PARSE_RULE, PARSE_DATA, PARSE_NUM } @@ -1338,14 +1505,14 @@ void RBBITest::TestExtended() { EParseState savedState = PARSE_TAG; - const UChar CH_LF = 0x0a; - const UChar CH_CR = 0x0d; - const UChar CH_HASH = 0x23; - const UChar CH_PERIOD = 0x2e; - const UChar CH_LT = 0x3c; - const UChar CH_GT = 0x3e; - const UChar CH_BACKSLASH = 0x5c; - const UChar CH_BULLET = 0x2022; + static const UChar CH_LF = 0x0a; + static const UChar CH_CR = 0x0d; + static const UChar CH_HASH = 0x23; + /*static const UChar CH_PERIOD = 0x2e;*/ + static const UChar CH_LT = 0x3c; + static const UChar CH_GT = 0x3e; + static const UChar CH_BACKSLASH = 0x5c; + static const UChar CH_BULLET = 0x2022; int32_t lineNum = 1; int32_t colStart = 0; @@ -1355,6 +1522,7 @@ void RBBITest::TestExtended() { int32_t tagValue = 0; // The numeric value of a tag. 
for (charIdx = 0; charIdx < len; ) { + status = U_ZERO_ERROR; UChar c = testString.charAt(charIdx); charIdx++; if (c == CH_CR && charIdx") == 0) { delete tp.bi; + tp.bi = NULL; tp.bi = BreakIterator::createSentenceInstance(locale, status); charIdx += 5; break; @@ -1415,6 +1584,18 @@ void RBBITest::TestExtended() { charIdx += 6; break; } + + // + localeMatcher.reset(testString); + if (localeMatcher.lookingAt(charIdx-1, status)) { + UnicodeString localeName = localeMatcher.group(1, status); + char localeName8[100]; + localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); + locale = Locale::createFromName(localeName8); + charIdx += localeMatcher.group(0, status).length(); + TEST_ASSERT_SUCCESS(status); + break; + } if (testString.compare(charIdx-1, 6, "") == 0) { parseState = PARSE_DATA; charIdx += 5; @@ -1426,9 +1607,9 @@ void RBBITest::TestExtended() { } errln("line %d: Tag expected in test file.", lineNum); - goto end_test; parseState = PARSE_COMMENT; savedState = PARSE_DATA; + goto end_test; // Stop the test. } break; @@ -1452,14 +1633,14 @@ void RBBITest::TestExtended() { tp.srcCol ->addElement(column, status); parseState = PARSE_TAG; - charIdx += 7; + charIdx += 6; // RUN THE TEST! executeTest(&tp); break; } - if (testString.compare(charIdx-1, 3, "\\N{") == 0) { + if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { // Named character, e.g. \N{COMBINING GRAVE ACCENT} // Get the code point from the name and insert it into the test data. // (Damn, no API takes names in Unicode !!! 
@@ -1491,6 +1672,7 @@ void RBBITest::TestExtended() { } if (nameEndIdx > charIdx) { charIdx = nameEndIdx+1; + } break; } @@ -1604,8 +1786,8 @@ void RBBITest::TestExtended() { errln("Syntax Error in test file at line %d, col %d", lineNum, column); - goto end_test; parseState = PARSE_COMMENT; + goto end_test; // Stop the test break; } @@ -1613,8 +1795,8 @@ void RBBITest::TestExtended() { if (U_FAILURE(status)) { errln("ICU Error %s while parsing test file at line %d.", u_errorName(status), lineNum); - goto end_test; status = U_ZERO_ERROR; + goto end_test; // Stop the test } } @@ -1625,19 +1807,257 @@ end_test: delete tp.srcLine; delete tp.srcCol; delete [] testFile; +#endif +} + +void RBBITest::TestThaiBreaks() { + UErrorCode status=U_ZERO_ERROR; + BreakIterator* b; + Locale locale = Locale("th"); + int32_t p, index; + UChar c[]= { + 0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B, + 0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19, + 0x0E16, 0x0E49, 0x0E33, 0x0000 + }; + int32_t expectedWordResult[] = { + 2, 3, 6, 10, 11, 15, 17, 20, 22 + }; + int32_t expectedLineResult[] = { + 3, 6, 11, 15, 17, 20, 22 + }; + + int32_t size = u_strlen(c); + UnicodeString text=UnicodeString(c); + + b = BreakIterator::createWordInstance(locale, status); + if (U_FAILURE(status)) { + errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status)); + return; + } + b->setText(text); + p = index = 0; + while ((p=b->next())!=BreakIterator::DONE && p < size) { + if (p != expectedWordResult[index++]) { + errln("Incorrect break given by thai word break iterator. 
Expected: %d Got: %d", expectedWordResult[index-1], p); + } + } + delete b; + + b = BreakIterator::createLineInstance(locale, status); + if (U_FAILURE(status)) { + printf("Unable to create thai line break iterator.\n"); + return; + } + b->setText(text); + p = index = 0; + while ((p=b->next())!=BreakIterator::DONE && p < size) { + if (p != expectedLineResult[index++]) { + errln("Incorrect break given by thai line break iterator. Expected: %d Got: %d", expectedLineResult[index-1], p); + } + } + + delete b; +} + +// UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX" +// Words don't include colon or period (cldrbug #1969). +static const char posxWordText[] = "Can't have breaks in xx:yy or struct.field for CS-types."; +static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 }; +static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 }; + +// UBreakIteratorType UBRK_WORD, Locale "ja" +// Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009). +static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF" + "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002"; +static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 }; +static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 }; + +// UBreakIteratorType UBRK_SENTENCE, Locale "el" +// Add break after Greek question mark (cldrbug #2069). +static const char elSentText[] = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. " + "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? 
\\u03A3"; +static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 }; +static const int32_t elSentROffsets[] = { 20, 27, 35, 36 }; + +// UBreakIteratorType UBRK_CHARACTER, Locale "th" +// Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161). +static const char thCharText[] = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 " + "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) " + "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 "; +static const int32_t thCharTOffsets[] = { 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 32, 33, 35, 37, 38, 39, 40, 41 }; +static const int32_t thCharROffsets[] = { 1, 3, 5, 6, 7, 8, 9, 11, + 12, 13, 15, 17, 19, 20, 22, 24, 26, 27, 28, + 29, 32, 33, 35, 37, 38, 40, 41 }; + +typedef struct { + UBreakIteratorType type; + const char * locale; + const char * escapedText; + const int32_t * tailoredOffsets; + int32_t tailoredOffsetsCount; + const int32_t * rootOffsets; + int32_t rootOffsetsCount; +} TailoredBreakItem; + +#define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0])) + +static const TailoredBreakItem tbItems[] = { + { UBRK_WORD, "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) }, + { UBRK_WORD, "ja", jaWordText, ARRAY_PTR_LEN(jaWordTOffsets), ARRAY_PTR_LEN(jaWordROffsets) }, + { UBRK_SENTENCE, "el", elSentText, ARRAY_PTR_LEN(elSentTOffsets), ARRAY_PTR_LEN(elSentROffsets) }, + { UBRK_CHARACTER, "th", thCharText, ARRAY_PTR_LEN(thCharTOffsets), ARRAY_PTR_LEN(thCharROffsets) }, + { UBRK_CHARACTER, NULL, NULL, NULL,0, NULL,0 } // terminator +}; + +static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) { + while (count-- > 0) { + int writeCount; + sprintf(buffer, /* buflen, */ " %d%n", *offsets++, &writeCount); /* wants to 
be snprintf */ + buffer += writeCount; + buflen -= writeCount; + } +} + +enum { kMaxOffsetCount = 128 }; + +void RBBITest::TBTest(BreakIterator* brkitr, int type, const char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) { + brkitr->setText( CharsToUnicodeString(escapedText) ); + int32_t foundOffsets[kMaxOffsetCount]; + int32_t offset, foundOffsetsCount = 0; + // do forwards iteration test + while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) { + foundOffsets[foundOffsetsCount++] = offset; + } + if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) { + // log error for forwards test + char formatExpect[512], formatFound[512]; + formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets); + formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets); + errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n", + type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound); + } else { + // do backwards iteration test + --foundOffsetsCount; // back off one from the end offset + while ( foundOffsetsCount > 0 ) { + offset = brkitr->previous(); + if ( offset != foundOffsets[--foundOffsetsCount] ) { + // log error for backwards test + char formatExpect[512]; + formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets); + errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n", + type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]); + break; + } + } + } +} + +void RBBITest::TestTailoredBreaks() { + const TailoredBreakItem * tbItemPtr; + Locale rootLocale = Locale("root"); + for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) { + Locale testLocale = 
Locale(tbItemPtr->locale); + BreakIterator * tailoredBrkiter = NULL; + BreakIterator * rootBrkiter = NULL; + UErrorCode status = U_ZERO_ERROR; + switch (tbItemPtr->type) { + case UBRK_CHARACTER: + tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status); + rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status); + break; + case UBRK_WORD: + tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status); + rootBrkiter = BreakIterator::createWordInstance(rootLocale, status); + break; + case UBRK_LINE: + tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status); + rootBrkiter = BreakIterator::createLineInstance(rootLocale, status); + break; + case UBRK_SENTENCE: + tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status); + rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status); + break; + default: + status = U_UNSUPPORTED_ERROR; + break; + } + if (U_FAILURE(status)) { + errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status)); + continue; + } + TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount); + TBTest(rootBrkiter, (int)(tbItemPtr->type), "root", tbItemPtr->escapedText, tbItemPtr->rootOffsets, tbItemPtr->rootOffsetsCount); + + delete rootBrkiter; + delete tailoredBrkiter; + } +} + + +//------------------------------------------------------------------------------- +// +// TestDictRules create a break iterator from source rules that includes a +// dictionary range. Regression for bug #7130. Source rules +// do not declare a break iterator type (word, line, sentence, etc. +// but the dictionary code, without a type, would loop. 
+// +//------------------------------------------------------------------------------- +void RBBITest::TestDictRules() { + const char *rules = "$dictionary = [a-z]; \n" + "!!forward; \n" + "$dictionary $dictionary; \n" + "!!reverse; \n" + "$dictionary $dictionary; \n"; + const char *text = "aa"; + UErrorCode status = U_ZERO_ERROR; + UParseError parseError; + + RuleBasedBreakIterator bi(rules, parseError, status); + if (U_SUCCESS(status)) { + UnicodeString utext = text; + bi.setText(utext); + int32_t position; + int32_t loops; + for (loops = 0; loops<10; loops++) { + position = bi.next(); + if (position == RuleBasedBreakIterator::DONE) { + break; + } + } + TEST_ASSERT(loops == 1); + } else { + dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status)); + } } + //------------------------------------------------------------------------------- // // ReadAndConvertFile Read a text data file, convert it to UChars, and // return the datain one big UChar * buffer, which the caller must delete. // +// parameters: +// fileName: the name of the file, with no directory part. The test data directory +// is assumed. +// ulen an out parameter, receives the actual length (in UChars) of the file data. +// encoding The file encoding. If the file contains a BOM, that will override the encoding +// specified here. The BOM, if it exists, will be stripped from the returned data. +// Pass NULL for the system default encoding. +// status +// returns: +// The file data, converted to UChar. +// The caller must delete this when done with +// delete [] theBuffer; +// // TODO: This is a clone of RegexTest::ReadAndConvertFile. // Move this function to some common place. 
// //-------------------------------------------------------------------------------- -UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status) { +UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { UChar *retPtr = NULL; char *fileBuf = NULL; UConverter* conv = NULL; @@ -1653,8 +2073,9 @@ UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode // f = fopen(fileName, "rb"); if (f == 0) { - errln("Error opening test data file %s\n", fileName); - goto cleanUpAndReturn; + dataerrln("Error opening test data file %s\n", fileName); + status = U_FILE_ACCESS_ERROR; + return NULL; } // // Read it in @@ -1677,14 +2098,15 @@ UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode // int32_t signatureLength; const char * fileBufC; - const char* encoding; + const char* bomEncoding; fileBufC = fileBuf; - encoding = ucnv_detectUnicodeSignature( + bomEncoding = ucnv_detectUnicodeSignature( fileBuf, fileSize, &signatureLength, &status); - if(encoding!=NULL ){ + if(bomEncoding!=NULL ){ fileBufC += signatureLength; fileSize -= signatureLength; + encoding = bomEncoding; } // @@ -1720,7 +2142,7 @@ UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode cleanUpAndReturn: fclose(f); - delete fileBuf; + delete []fileBuf; ucnv_close(conv); if (U_FAILURE(status)) { errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); @@ -1732,220 +2154,235 @@ cleanUpAndReturn: } + //-------------------------------------------------------------------------------------------- // -// Exhaustive Tests, using Unicode Data Files. -// -//-------------------------------------------------------------------------------------------- - +// Run tests from each of the boundary test data files distributed by the Unicode Consortium // -// Token level scanner for the Unicode Line Break Test Data file. 
-// Return the next token, as follows: -// >= 0: a UChar32 character, scanned from hex in the file. -// -1: a break position, a division sign in the file. -// -2: end of rule. A new line in the file. -// -3: end of file. No more rules. -// -4: Error -// -// The scanner -// strips comments, ('#' to end of line) -// Recognizes CR, CR/LF and LF as new lines. -// Skips over spaces and Xs (don't break here) in the data. -// -struct ScanState { - int32_t fPeekChar; - UBool fPeeked; - int32_t fLineNum; - FILE *fFile; - ScanState() :fPeeked(FALSE), fLineNum(0), fFile(NULL) {}; -}; - -// Literal characters that are of interest. In hex to keep EBCDIC based machines happy. -// The data itself is latin-1 on all platforms. -static const int32_t chSpace = 0x20; -static const int32_t chTab = 0x09; -static const int32_t chCR = 0x0D; -static const int32_t chLF = 0x0A; -static const int32_t chHash = 0x23; -static const int32_t chMult = 0xD7; -static const int32_t chDivide = 0xF7; - -static int32_t nextLBDToken(ScanState *s) { - int32_t c; - - // Read characters from the input file until we get something interesting - // to return. The file is in latin-1 encoding. - for (;;) { - // Get the next character to look at, - if (s->fPeeked) { - c = s->fPeekChar; - s->fPeeked = FALSE; - } else { - c = getc(s->fFile); - } - - // EOF. Return immediately. - if (c == EOF) { - return -3; - } - - // Spaces. Treat the multiply sign as a space - it indicates a no-break position - // in the data, and the test program doesn't want to see them. - // Continue the next char loop, looking for something significant. - if (c == chSpace || c == chTab || c == chMult) { - continue; - } - - // Divide sign. Indicates an expected break position. - if (c == chDivide) { - return -1; - } - - // New Line Handling. Keep track of line number in the file, which in turn - // requires keeping track of CR/LF as a single new line. 
- if (c == chCR) { - s->fLineNum++; - s->fPeekChar = getc(s->fFile); - if (s->fPeekChar != chLF) {s->fPeeked = TRUE;}; - return -2; - } - if (c == chLF) { - s->fLineNum++; - return -2; - } +//------------------------------------------------------------------------------------------- +void RBBITest::TestUnicodeFiles() { + RuleBasedBreakIterator *bi; + UErrorCode status = U_ZERO_ERROR; + + bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status); + TEST_ASSERT_SUCCESS(status); + if (U_SUCCESS(status)) { + runUnicodeTestData("GraphemeBreakTest.txt", bi); + } + delete bi; - // Comments. Consume everything up to the next new line. - if (c == chHash) { - do { - c = getc(s->fFile); - } while (!(c == EOF || c == chCR || c == chLF)); - s->fPeekChar = c; - s->fPeeked = TRUE; - return nextLBDToken(s); - } + bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); + TEST_ASSERT_SUCCESS(status); + if (U_SUCCESS(status)) { + runUnicodeTestData("WordBreakTest.txt", bi); + } + delete bi; - // Scan a hex character (UChar32) value. - if (u_digit(c, 16) >= 0) { - int32_t v = u_digit(c, 16); - for (;;) { - c = getc(s->fFile); - if (u_digit(c, 16) < 0) {break;}; - v <<= 4; - v += u_digit(c, 16); - } - s->fPeekChar = c; - s->fPeeked = TRUE; - return v; - } + bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status); + TEST_ASSERT_SUCCESS(status); + if (U_SUCCESS(status)) { + runUnicodeTestData("SentenceBreakTest.txt", bi); + } + delete bi; - // Error. Character was something unexpected. 
- return -4; + bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); + TEST_ASSERT_SUCCESS(status); + if (U_SUCCESS(status)) { + runUnicodeTestData("LineBreakTest.txt", bi); } + delete bi; } +//-------------------------------------------------------------------------------------------- +// +// Run tests from one of the boundary test data files distributed by the Unicode Consortium +// +//------------------------------------------------------------------------------------------- +void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { +#if !UCONFIG_NO_REGULAR_EXPRESSIONS +// TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. + UVersionInfo icu4700 = { 4, 7, 0, 0 }; +UBool isICUVersionPast46 = isICUVersionAtLeast(icu4700); +UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt"); + UErrorCode status = U_ZERO_ERROR; -void RBBITest::TestLineBreakData() { - - UErrorCode status = U_ZERO_ERROR; - UnicodeString testString; - UVector expectedBreaks(status); - ScanState ss; - int32_t tok; - - BreakIterator *bi = BreakIterator::createLineInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) { - errln("Failure creating break iterator"); + // + // Open and read the test data file, put it into a UnicodeString. + // + const char *testDataDirectory = IntlTest::getSourceTestData(status); + char testFileName[1000]; + if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { + dataerrln("Can't open test data. 
Path too long."); return; } + strcpy(testFileName, testDataDirectory); + strcat(testFileName, fileName); + + logln("Opening data file %s\n", fileName); - const char * lbdfName = "LBTest.txt"; + int len; + UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); + if (status != U_FILE_ACCESS_ERROR) { + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(testFile != NULL); + } + if (U_FAILURE(status) || testFile == NULL) { + return; /* something went wrong, error already output */ + } + UnicodeString testFileAsString(TRUE, testFile, len); - // Open the test data file. - // TODO: a proper way to handle this data. - ss.fFile = fopen(lbdfName, "rb"); - if (ss.fFile == NULL) { - logln("Unable to open Line Break Test Data file. Skipping test."); - delete bi; + // + // Parse the test data file using a regular expression. + // Each kind of token is recognized in its own capture group; what type of item was scanned + // is identified by which group had a match. + // + // Caputure Group # 1 2 3 4 5 + // Parses this item: divide x hex digits comment \n unrecognized \n + // + UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); + RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); + UnicodeString testString; + UVector32 breakPositions(status); + int lineNumber = 1; + TEST_ASSERT_SUCCESS(status); + if (U_FAILURE(status)) { return; } - // Loop once per line from the test data file. - for (;;) { - // Zero out test data from previous line. - testString.truncate(0); - expectedBreaks.removeAllElements(); - - // Read one test's (line's) worth of data from the file. - // Loop once per token on the input file line. - for(;;) { - tok = nextLBDToken(&ss); - - // If we scanned a character number in the file. - // save it in the test data array. 
- if (tok >= 0) { - testString.append((UChar32)tok); - continue; + // + // Scan through each test case, building up the string to be broken in testString, + // and the positions that should be boundaries in the breakPositions vector. + // + int spin = 0; + while (tokenMatcher.find()) { + if(tokenMatcher.hitEnd()) { + /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for. + This occurred when the text file was corrupt (wasn't marked as UTF-8) + and caused an infinite loop here on EBCDIC systems! + */ + fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin); + // return; + } + if (tokenMatcher.start(1, status) >= 0) { + // Scanned a divide sign, indicating a break position in the test data. + if (testString.length()>0) { + breakPositions.addElement(testString.length(), status); } - - // If we scanned a break position in the data, record it. - if (tok == -1) { - expectedBreaks.addElement(testString.length(), status); - continue; + } + else if (tokenMatcher.start(2, status) >= 0) { + // Scanned an 'x', meaning no break at this position in the test data + // Nothing to be done here. + } + else if (tokenMatcher.start(3, status) >= 0) { + // Scanned Hex digits. Convert them to binary, append to the character data string. + const UnicodeString &hexNumber = tokenMatcher.group(3, status); + int length = hexNumber.length(); + if (length<=8) { + char buf[10]; + hexNumber.extract (0, length, buf, sizeof(buf), US_INV); + UChar32 c = (UChar32)strtol(buf, NULL, 16); + if (c<=0x10ffff) { + testString.append(c); + } else { + errln("Error: Unicode Character value out of range. \'%s\', line %d.\n", + fileName, lineNumber); + } + } else { + errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", + fileName, lineNumber); + } + } + else if (tokenMatcher.start(4, status) >= 0) { + // Scanned to end of a line, possibly skipping over a comment in the process. 
+ // If the line from the file contained test data, run the test now. + // + if (testString.length() > 0) { +// TODO(andy): Remove this time bomb code. +if (!isLineBreak || isICUVersionPast46 || !(4658 <= lineNumber && lineNumber <= 4758)) { + checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); +} } - // If we scanned a new line, or EOF - // drop out of scan loop and run the test case. - if (tok == -2 || tok == -3) {break;}; - - // None of above. Error. - errln("Failure: Unrecognized data format, test file line %d", ss.fLineNum); + // Clear out this test case. + // The string and breakPositions vector will be refilled as the next + // test case is parsed. + testString.remove(); + breakPositions.removeAllElements(); + lineNumber++; + } else { + // Scanner catchall. Something unrecognized appeared on the line. + char token[16]; + UnicodeString uToken = tokenMatcher.group(0, status); + uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); + token[sizeof(token)-1] = 0; + errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token); + + // Clean up, in preparation for continuing with the next line. + testString.remove(); + breakPositions.removeAllElements(); + lineNumber++; + } + TEST_ASSERT_SUCCESS(status); + if (U_FAILURE(status)) { break; } + } - // If this line from the test data file actually contained test data, - // run the test. - if (testString.length() > 0) { - int32_t pos; // Break Position in the test string - int32_t expectedI = 0; // Index of expected break position in vector of same. - int32_t expectedPos; // Expected break position (index into test string) - - bi->setText(testString); - pos = bi->first(); // TODO: break iterators always return a match at pos 0. - pos = bi->next(); // Line Break TR says no match at position 0. - // Resolve. 
+ delete [] testFile; + #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS +} - for (; pos != BreakIterator::DONE; ) { - expectedPos = expectedBreaks.elementAti(expectedI); - if (pos < expectedPos) { - errln("Failure: Test file line %d, unexpected break found at position %d", - ss.fLineNum, pos); - break; - } - if (pos > expectedPos) { - errln("Failure: Test file line %d, failed to find break at position %d", - ss.fLineNum, expectedPos); - break; - } - pos = bi->next(); - expectedI++; - } +//-------------------------------------------------------------------------------------------- +// +// checkUnicodeTestCase() Run one test case from one of the Unicode Consortium +// test data files. Do only a simple, forward-only check - +// this test is mostly to check that ICU and the Unicode +// data agree with each other. +// +//-------------------------------------------------------------------------------------------- +void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, + const UnicodeString &testString, // Text data to be broken + UVector32 *breakPositions, // Positions where breaks should be found. + RuleBasedBreakIterator *bi) { + int32_t pos; // Break Position in the test string + int32_t expectedI = 0; // Index of expected break position in the vector of expected results. + int32_t expectedPos; // Expected break position (index into test string) + + bi->setText(testString); + pos = bi->first(); + pos = bi->next(); + + while (pos != BreakIterator::DONE) { + if (expectedI >= breakPositions->size()) { + errln("Test file \"%s\", line %d, unexpected break found at position %d", + testFileName, lineNumber, pos); + break; } - - // If we've hit EOF on the input file, we're done. 
- if (tok == -3) { + expectedPos = breakPositions->elementAti(expectedI); + if (pos < expectedPos) { + errln("Test file \"%s\", line %d, unexpected break found at position %d", + testFileName, lineNumber, pos); break; } + if (pos > expectedPos) { + errln("Test file \"%s\", line %d, failed to find expected break at position %d", + testFileName, lineNumber, expectedPos); + break; + } + pos = bi->next(); + expectedI++; + } + if (pos==BreakIterator::DONE && expectedI < breakPositions->size()) { + errln("Test file \"%s\", line %d, failed to find expected break at position %d", + testFileName, lineNumber, breakPositions->elementAti(expectedI)); } +} - fclose(ss.fFile); - delete bi; -} #if !UCONFIG_NO_REGULAR_EXPRESSIONS - //--------------------------------------------------------------------------------------- // // classs RBBIMonkeyKind @@ -2025,10 +2462,16 @@ private: UnicodeSet *fCRLFSet; UnicodeSet *fControlSet; UnicodeSet *fExtendSet; + UnicodeSet *fPrependSet; + UnicodeSet *fSpacingSet; + UnicodeSet *fLSet; + UnicodeSet *fVSet; + UnicodeSet *fTSet; + UnicodeSet *fLVSet; + UnicodeSet *fLVTSet; UnicodeSet *fHangulSet; UnicodeSet *fAnySet; - RegexMatcher *fMatcher; const UnicodeString *fText; }; @@ -2037,48 +2480,156 @@ RBBICharMonkey::RBBICharMonkey() { UErrorCode status = U_ZERO_ERROR; fText = NULL; - fMatcher = new RegexMatcher("\\X", 0, status); // Pattern to match a grampheme cluster - - fCRLFSet = new UnicodeSet("[\\r\\n]", status); - fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]]", status); - fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status); - fHangulSet = new UnicodeSet( - "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=T}" - "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]", status); - fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]", status); + fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status); + fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break
= Control}]"), status); + fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status); + fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status); + fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status); + fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status); + fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status); + fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status); + fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status); + fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status); + fHangulSet = new UnicodeSet(); + fHangulSet->addAll(*fLSet); + fHangulSet->addAll(*fVSet); + fHangulSet->addAll(*fTSet); + fHangulSet->addAll(*fLVSet); + fHangulSet->addAll(*fLVTSet); + fAnySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status); + fSets = new UVector(status); fSets->addElement(fCRLFSet, status); fSets->addElement(fControlSet, status); fSets->addElement(fExtendSet, status); + fSets->addElement(fPrependSet, status); + fSets->addElement(fSpacingSet, status); fSets->addElement(fHangulSet, status); fSets->addElement(fAnySet, status); if (U_FAILURE(status)) { deferredStatus = status; } -}; +} void RBBICharMonkey::setText(const UnicodeString &s) { fText = &s; - fMatcher->reset(s); } -int32_t RBBICharMonkey::next(int32_t i) { - UErrorCode status = U_ZERO_ERROR; - int32_t retVal = -1; - if (fMatcher->find(i, status)) { - retVal = fMatcher->end(status); +int32_t RBBICharMonkey::next(int32_t prevPos) { + int p0, p1, p2, p3; // Indices of the significant code points around the + // break position being tested. The candidate break + // location is before p2. 
+ + int breakPos = -1; + + UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. + + if (U_FAILURE(deferredStatus)) { + return -1; + } + + // Previous break at end of string. return DONE. + if (prevPos >= fText->length()) { + return -1; } - if (U_FAILURE(status)){ - retVal = -1; + p0 = p1 = p2 = p3 = prevPos; + c3 = fText->char32At(prevPos); + c0 = c1 = c2 = 0; + + // Loop runs once per "significant" character position in the input text. + for (;;) { + // Move all of the positions forward in the input string. + p0 = p1; c0 = c1; + p1 = p2; c1 = c2; + p2 = p3; c2 = c3; + + // Advance p3 by one codepoint + p3 = fText->moveIndex32(p3, 1); + c3 = fText->char32At(p3); + + if (p1 == p2) { + // Still warming up the loop. (won't work with zero length strings, but we don't care) + continue; + } + if (p2 == fText->length()) { + // Reached end of string. Always a break position. + break; + } + + // Rule GB3 CR x LF + // No Extend or Format characters may appear between the CR and LF, + // which requires the additional check for p2 immediately following p1. + // + if (c1==0x0D && c2==0x0A && p1==(p2-1)) { + continue; + } + + // Rule (GB4).
( Control | CR | LF ) ÷ + if (fControlSet->contains(c1) || + c1 == 0x0D || + c1 == 0x0A) { + break; + } + + // Rule (GB5) ÷ ( Control | CR | LF ) + // + if (fControlSet->contains(c2) || + c2 == 0x0D || + c2 == 0x0A) { + break; + } + + + // Rule (GB6) L x ( L | V | LV | LVT ) + if (fLSet->contains(c1) && + (fLSet->contains(c2) || + fVSet->contains(c2) || + fLVSet->contains(c2) || + fLVTSet->contains(c2))) { + continue; + } + + // Rule (GB7) ( LV | V ) x ( V | T ) + if ((fLVSet->contains(c1) || fVSet->contains(c1)) && + (fVSet->contains(c2) || fTSet->contains(c2))) { + continue; + } + + // Rule (GB8) ( LVT | T) x T + if ((fLVTSet->contains(c1) || fTSet->contains(c1)) && + fTSet->contains(c2)) { + continue; + } + + // Rule (GB9) x Extend + if (fExtendSet->contains(c2)) { + continue; + } + + // Rule (GB9a) x SpacingMark + if (fSpacingSet->contains(c2)) { + continue; + } + + // Rule (GB9b) Prepend x + if (fPrependSet->contains(c1)) { + continue; + } + + // Rule (GB10) Any ÷ Any + break; } - return retVal; + + breakPos = p2; + return breakPos; } + UVector *RBBICharMonkey::charClasses() { return fSets; } @@ -2089,10 +2640,15 @@ RBBICharMonkey::~RBBICharMonkey() { delete fCRLFSet; delete fControlSet; delete fExtendSet; + delete fPrependSet; + delete fSpacingSet; + delete fLSet; + delete fVSet; + delete fTSet; + delete fLVSet; + delete fLVTSet; delete fHangulSet; delete fAnySet; - - delete fMatcher; } //------------------------------------------------------------------------------------------ @@ -2111,273 +2667,1231 @@ public: private: UVector *fSets; + UnicodeSet *fCRSet; + UnicodeSet *fLFSet; + UnicodeSet *fNewlineSet; UnicodeSet *fKatakanaSet; UnicodeSet *fALetterSet; - UnicodeSet *fMidLetterSet; UnicodeSet *fMidNumLetSet; + UnicodeSet *fMidLetterSet; UnicodeSet *fMidNumSet; UnicodeSet *fNumericSet; UnicodeSet *fFormatSet; UnicodeSet *fOtherSet; UnicodeSet *fExtendSet; + UnicodeSet *fExtendNumLetSet; RegexMatcher *fMatcher; const UnicodeString *fText; - UChar32
*fMungedText; - int32_t fMungedLen; - int32_t *fMungedPositions; - int32_t *fOrigPositions; - - RegexMatcher *fGCFMatcher; - RegexMatcher *fGCMatcher; - }; -RBBIWordMonkey::RBBIWordMonkey() : fMungedText(0), - fMungedPositions(0), - fOrigPositions(0), - fGCFMatcher(0), - fGCMatcher(0) +RBBIWordMonkey::RBBIWordMonkey() { UErrorCode status = U_ZERO_ERROR; - fSets = new UVector(status); - - fKatakanaSet = new UnicodeSet("[\\p{script=KATAKANA}\\u30fc\\uff70\\ufe9e\\ff9f]", status); - - const UnicodeString ALetterStr( "[[\\p{Alphabetic}\\u05f3]-[\\p{Ideographic}]-[\\p{Script=Thai}]" - "-[\\p{Script=Lao}]-[\\p{Script=Hiragana}]-" - "[\\p{script=KATAKANA}\\u30fc\\uff70\\ufe9e\\ff9f]]"); - - fALetterSet = new UnicodeSet(ALetterStr, status); - fMidLetterSet = new UnicodeSet("[\\u0027\\u00b7\\u05f4\\u2019\\u2027]", status); - fMidNumLetSet = new UnicodeSet("[\\u002e\\u003a]", status); - fMidNumSet = new UnicodeSet("[\\p{Line_Break=Infix_Numeric}]", status); - fNumericSet = new UnicodeSet("[\\p{Line_Break=Numeric}]", status); - fFormatSet = new UnicodeSet("[\\p{Format}]", status); - fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status); - fOtherSet = new UnicodeSet(); + fSets = new UVector(status); + + fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); + fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); + fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); + fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); + fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); + fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); + fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); + fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); + fNumericSet = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); + fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); + fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); + fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); + + fOtherSet = new UnicodeSet(); if(U_FAILURE(status)) { deferredStatus = status; return; } fOtherSet->complement(); + fOtherSet->removeAll(*fCRSet); + fOtherSet->removeAll(*fLFSet); + fOtherSet->removeAll(*fNewlineSet); fOtherSet->removeAll(*fKatakanaSet); fOtherSet->removeAll(*fALetterSet); fOtherSet->removeAll(*fMidLetterSet); - fOtherSet->removeAll(*fMidNumLetSet); fOtherSet->removeAll(*fMidNumSet); fOtherSet->removeAll(*fNumericSet); - + fOtherSet->removeAll(*fExtendNumLetSet); + fOtherSet->removeAll(*fFormatSet); + fOtherSet->removeAll(*fExtendSet); + // Inhibit dictionary characters from being tested at all. + fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); + + fSets->addElement(fCRSet, status); + fSets->addElement(fLFSet, status); + fSets->addElement(fNewlineSet, status); fSets->addElement(fALetterSet, status); + fSets->addElement(fKatakanaSet, status); fSets->addElement(fMidLetterSet, status); fSets->addElement(fMidNumLetSet, status); fSets->addElement(fMidNumSet, status); fSets->addElement(fNumericSet, status); fSets->addElement(fFormatSet, status); + fSets->addElement(fExtendSet, status); fSets->addElement(fOtherSet, status); + fSets->addElement(fExtendNumLetSet, status); + + if (U_FAILURE(status)) { + deferredStatus = status; + } +} + +void RBBIWordMonkey::setText(const UnicodeString &s) { + fText = &s; +} + + +int32_t RBBIWordMonkey::next(int32_t prevPos) { + int p0, p1, p2, p3; // Indices of the significant code points around the + // break position being tested. The candidate break + // location is before p2. 
+ + int breakPos = -1; + + UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. + + if (U_FAILURE(deferredStatus)) { + return -1; + } + + // Prev break at end of string. return DONE. + if (prevPos >= fText->length()) { + return -1; + } + p0 = p1 = p2 = p3 = prevPos; + c3 = fText->char32At(prevPos); + c0 = c1 = c2 = 0; + + // Loop runs once per "significant" character position in the input text. + for (;;) { + // Move all of the positions forward in the input string. + p0 = p1; c0 = c1; + p1 = p2; c1 = c2; + p2 = p3; c2 = c3; + + // Advancd p3 by X(Extend | Format)* Rule 4 + // But do not advance over Extend & Format following a new line. (Unicode 5.1 change) + do { + p3 = fText->moveIndex32(p3, 1); + c3 = fText->char32At(p3); + if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { + break; + }; + } + while (fFormatSet->contains(c3) || fExtendSet->contains(c3)); + + + if (p1 == p2) { + // Still warming up the loop. (won't work with zero length strings, but we don't care) + continue; + } + if (p2 == fText->length()) { + // Reached end of string. Always a break position. + break; + } + + // Rule (3) CR x LF + // No Extend or Format characters may appear between the CR and LF, + // which requires the additional check for p2 immediately following p1. + // + if (c1==0x0D && c2==0x0A) { + continue; + } + + // Rule (3a) Break before and after newlines (including CR and LF) + // + if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { + break; + }; + if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { + break; + }; + + // Rule (5). 
ALetter x ALetter + if (fALetterSet->contains(c1) && + fALetterSet->contains(c2)) { + continue; + } + + // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter + // + if ( fALetterSet->contains(c1) && + (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) && + fALetterSet->contains(c3)) { + continue; + } + + + // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter + if (fALetterSet->contains(c0) && + (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1)) && + fALetterSet->contains(c2)) { + continue; + } + + // Rule (8) Numeric x Numeric + if (fNumericSet->contains(c1) && + fNumericSet->contains(c2)) { + continue; + } + + // Rule (9) ALetter x Numeric + if (fALetterSet->contains(c1) && + fNumericSet->contains(c2)) { + continue; + } + + // Rule (10) Numeric x ALetter + if (fNumericSet->contains(c1) && + fALetterSet->contains(c2)) { + continue; + } + + // Rule (11) Numeric (MidNum | MidNumLet) x Numeric + if (fNumericSet->contains(c0) && + (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1)) && + fNumericSet->contains(c2)) { + continue; + } + + // Rule (12) Numeric x (MidNum | MidNumLet) Numeric + if (fNumericSet->contains(c1) && + (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2)) && + fNumericSet->contains(c3)) { + continue; + } + + // Rule (13) Katakana x Katakana + if (fKatakanaSet->contains(c1) && + fKatakanaSet->contains(c2)) { + continue; + } + + // Rule 13a + if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) || + fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && + fExtendNumLetSet->contains(c2)) { + continue; + } + + // Rule 13b + if (fExtendNumLetSet->contains(c1) && + (fALetterSet->contains(c2) || fNumericSet->contains(c2) || + fKatakanaSet->contains(c2))) { + continue; + } + + // Rule 14. Break found here. 
+ break; + } + + breakPos = p2; + return breakPos; +} + + +UVector *RBBIWordMonkey::charClasses() { + return fSets; +} + + +RBBIWordMonkey::~RBBIWordMonkey() { + delete fSets; + delete fCRSet; + delete fLFSet; + delete fNewlineSet; + delete fKatakanaSet; + delete fALetterSet; + delete fMidNumLetSet; + delete fMidLetterSet; + delete fMidNumSet; + delete fNumericSet; + delete fFormatSet; + delete fExtendSet; + delete fExtendNumLetSet; + delete fOtherSet; +} + - fMungedText = NULL; - fMungedLen = 0; - fMungedPositions = NULL; - fOrigPositions = NULL; - fGCFMatcher = new RegexMatcher("\\X(?:\\p{Format}\\p{Grapheme_Extend}*)*", 0, status); - fGCMatcher = new RegexMatcher("\\X", 0, status); + +//------------------------------------------------------------------------------------------ +// +// class RBBISentMonkey Sentence Break specific implementation +// of RBBIMonkeyKind. +// +//------------------------------------------------------------------------------------------ +class RBBISentMonkey: public RBBIMonkeyKind { +public: + RBBISentMonkey(); + virtual ~RBBISentMonkey(); + virtual UVector *charClasses(); + virtual void setText(const UnicodeString &s); + virtual int32_t next(int32_t i); +private: + int moveBack(int posFrom); + int moveForward(int posFrom); + UChar32 cAt(int pos); + + UVector *fSets; + + UnicodeSet *fSepSet; + UnicodeSet *fFormatSet; + UnicodeSet *fSpSet; + UnicodeSet *fLowerSet; + UnicodeSet *fUpperSet; + UnicodeSet *fOLetterSet; + UnicodeSet *fNumericSet; + UnicodeSet *fATermSet; + UnicodeSet *fSContinueSet; + UnicodeSet *fSTermSet; + UnicodeSet *fCloseSet; + UnicodeSet *fOtherSet; + UnicodeSet *fExtendSet; + + const UnicodeString *fText; + +}; + +RBBISentMonkey::RBBISentMonkey() +{ + UErrorCode status = U_ZERO_ERROR; + + fSets = new UVector(status); + + // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator + // set and made into character classes of their own. 
For the monkey impl, + // they remain in SEP, since Sep always appears with CR and LF in the rules. + fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); + fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); + fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); + fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); + fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); + fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); + fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); + fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); + fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); + fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); + fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); + fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); + fOtherSet = new UnicodeSet(); + + if(U_FAILURE(status)) { + deferredStatus = status; + return; + } + + fOtherSet->complement(); + fOtherSet->removeAll(*fSepSet); + fOtherSet->removeAll(*fFormatSet); + fOtherSet->removeAll(*fSpSet); + fOtherSet->removeAll(*fLowerSet); + fOtherSet->removeAll(*fUpperSet); + fOtherSet->removeAll(*fOLetterSet); + fOtherSet->removeAll(*fNumericSet); + fOtherSet->removeAll(*fATermSet); + fOtherSet->removeAll(*fSContinueSet); + fOtherSet->removeAll(*fSTermSet); + fOtherSet->removeAll(*fCloseSet); + fOtherSet->removeAll(*fExtendSet); + + fSets->addElement(fSepSet, status); + fSets->addElement(fFormatSet, status); + fSets->addElement(fSpSet, status); + fSets->addElement(fLowerSet, status); + 
fSets->addElement(fUpperSet, status); + fSets->addElement(fOLetterSet, status); + fSets->addElement(fNumericSet, status); + fSets->addElement(fATermSet, status); + fSets->addElement(fSContinueSet, status); + fSets->addElement(fSTermSet, status); + fSets->addElement(fCloseSet, status); + fSets->addElement(fOtherSet, status); + fSets->addElement(fExtendSet, status); if (U_FAILURE(status)) { deferredStatus = status; } +} + + + +void RBBISentMonkey::setText(const UnicodeString &s) { + fText = &s; +} + +UVector *RBBISentMonkey::charClasses() { + return fSets; +} + + +// moveBack() Find the "significant" code point preceding the index i. +// Skips over ($Extend | $Format)* . +// +int RBBISentMonkey::moveBack(int i) { + if (i <= 0) { + return -1; + } + UChar32 c; + int32_t j = i; + do { + j = fText->moveIndex32(j, -1); + c = fText->char32At(j); + } + while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); + return j; + + } + + +int RBBISentMonkey::moveForward(int i) { + if (i>=fText->length()) { + return fText->length(); + } + UChar32 c; + int32_t j = i; + do { + j = fText->moveIndex32(j, 1); + c = cAt(j); + } + while (fFormatSet->contains(c) || fExtendSet->contains(c)); + return j; +} + +UChar32 RBBISentMonkey::cAt(int pos) { + if (pos<0 || pos>=fText->length()) { + return -1; + } else { + return fText->char32At(pos); + } +} + +int32_t RBBISentMonkey::next(int32_t prevPos) { + int p0, p1, p2, p3; // Indices of the significant code points around the + // break position being tested. The candidate break + // location is before p2. + + int breakPos = -1; + + UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. + UChar32 c; + + if (U_FAILURE(deferredStatus)) { + return -1; + } + + // Prev break at end of string. return DONE. + if (prevPos >= fText->length()) { + return -1; + } + p0 = p1 = p2 = p3 = prevPos; + c3 = fText->char32At(prevPos); + c0 = c1 = c2 = 0; + + // Loop runs once per "significant" character position in the input text. 
+ for (;;) { + // Move all of the positions forward in the input string. + p0 = p1; c0 = c1; + p1 = p2; c1 = c2; + p2 = p3; c2 = c3; + + // Advance p3 by X(Extend | Format)* Rule 4 + p3 = moveForward(p3); + c3 = cAt(p3); + + // Rule (3) CR x LF + if (c1==0x0d && c2==0x0a && p2==(p1+1)) { + continue; + } + + // Rule (4). Sep ÷ + if (fSepSet->contains(c1)) { + p2 = p1+1; // Separators don't combine with Extend or Format. + break; + } + + if (p2 >= fText->length()) { + // Reached end of string. Always a break position. + break; + } + + if (p2 == prevPos) { + // Still warming up the loop. (won't work with zero length strings, but we don't care) + continue; + } + + // Rule (6). ATerm x Numeric + if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { + continue; + } + + // Rule (7). Upper ATerm x Upper + if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) { + continue; + } + + // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower + // Note: STerm | ATerm are added to the negated part of the expression by a + // note to the Unicode 5.0 documents.
+ int p8 = p1; + while (fSpSet->contains(cAt(p8))) { + p8 = moveBack(p8); + } + while (fCloseSet->contains(cAt(p8))) { + p8 = moveBack(p8); + } + if (fATermSet->contains(cAt(p8))) { + p8=p2; + for (;;) { + c = cAt(p8); + if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || + fLowerSet->contains(c) || fSepSet->contains(c) || + fATermSet->contains(c) || fSTermSet->contains(c)) { + break; + } + p8 = moveForward(p8); + } + if (fLowerSet->contains(cAt(p8))) { + continue; + } + } + + // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); + if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { + p8 = p1; + while (fSpSet->contains(cAt(p8))) { + p8 = moveBack(p8); + } + while (fCloseSet->contains(cAt(p8))) { + p8 = moveBack(p8); + } + c = cAt(p8); + if (fSTermSet->contains(c) || fATermSet->contains(c)) { + continue; + } + } + + // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) + int p9 = p1; + while (fCloseSet->contains(cAt(p9))) { + p9 = moveBack(p9); + } + c = cAt(p9); + if ((fSTermSet->contains(c) || fATermSet->contains(c))) { + if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { + continue; + } + } + + // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) + int p10 = p1; + while (fSpSet->contains(cAt(p10))) { + p10 = moveBack(p10); + } + while (fCloseSet->contains(cAt(p10))) { + p10 = moveBack(p10); + } + if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { + if (fSpSet->contains(c2) || fSepSet->contains(c2)) { + continue; + } + } + + // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? 
+ int p11 = p1; + if (fSepSet->contains(cAt(p11))) { + p11 = moveBack(p11); + } + while (fSpSet->contains(cAt(p11))) { + p11 = moveBack(p11); + } + while (fCloseSet->contains(cAt(p11))) { + p11 = moveBack(p11); + } + if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { + break; + } + + // Rule (12) Any x Any + continue; + } + breakPos = p2; + return breakPos; +} + +RBBISentMonkey::~RBBISentMonkey() { + delete fSets; + delete fSepSet; + delete fFormatSet; + delete fSpSet; + delete fLowerSet; + delete fUpperSet; + delete fOLetterSet; + delete fNumericSet; + delete fATermSet; + delete fSContinueSet; + delete fSTermSet; + delete fCloseSet; + delete fOtherSet; + delete fExtendSet; +} + + + +//------------------------------------------------------------------------------------------- +// +// RBBILineMonkey +// +//------------------------------------------------------------------------------------------- + +class RBBILineMonkey: public RBBIMonkeyKind { +public: + RBBILineMonkey(); + virtual ~RBBILineMonkey(); + virtual UVector *charClasses(); + virtual void setText(const UnicodeString &s); + virtual int32_t next(int32_t i); + virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar); +private: + UVector *fSets; + + UnicodeSet *fBK; + UnicodeSet *fCR; + UnicodeSet *fLF; + UnicodeSet *fCM; + UnicodeSet *fNL; + UnicodeSet *fSG; + UnicodeSet *fWJ; + UnicodeSet *fZW; + UnicodeSet *fGL; + UnicodeSet *fCB; + UnicodeSet *fSP; + UnicodeSet *fB2; + UnicodeSet *fBA; + UnicodeSet *fBB; + UnicodeSet *fHY; + UnicodeSet *fH2; + UnicodeSet *fH3; + UnicodeSet *fCL; + UnicodeSet *fCP; + UnicodeSet *fEX; + UnicodeSet *fIN; + UnicodeSet *fJL; + UnicodeSet *fJV; + UnicodeSet *fJT; + UnicodeSet *fNS; + UnicodeSet *fOP; + UnicodeSet *fQU; + UnicodeSet *fIS; + UnicodeSet *fNU; + UnicodeSet *fPO; + UnicodeSet *fPR; + UnicodeSet *fSY; + UnicodeSet *fAI; + UnicodeSet *fAL; + UnicodeSet *fID; + UnicodeSet *fSA; + UnicodeSet *fXX; + + BreakIterator 
*fCharBI; + + const UnicodeString *fText; + int32_t *fOrigPositions; + + RegexMatcher *fNumberMatcher; + RegexMatcher *fLB11Matcher; }; -void RBBIWordMonkey::setText(const UnicodeString &s) { + +RBBILineMonkey::RBBILineMonkey() +{ + UErrorCode status = U_ZERO_ERROR; + + fSets = new UVector(status); + + fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); + fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); + fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); + fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); + fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); + fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); + fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); + fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); + fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); + fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); + fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); + fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); + fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); + fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); + fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); + fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); + fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status); + fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status); + fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); + fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); + fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); + fJV = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); + fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); + fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); + fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); + fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); + fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); + fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); + fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); + fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); + fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); + fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); + fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); + fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); + fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status); + fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); + fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); + + if (U_FAILURE(status)) { + deferredStatus = status; + fCharBI = NULL; + fNumberMatcher = NULL; + return; + } + + fAL->addAll(*fXX); // Default behavior for XX is identical to AL + fAL->addAll(*fAI); // Default behavior for AI is identical to AL + fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL + fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 
+ + fSets->addElement(fBK, status); + fSets->addElement(fCR, status); + fSets->addElement(fLF, status); + fSets->addElement(fCM, status); + fSets->addElement(fNL, status); + fSets->addElement(fWJ, status); + fSets->addElement(fZW, status); + fSets->addElement(fGL, status); + fSets->addElement(fCB, status); + fSets->addElement(fSP, status); + fSets->addElement(fB2, status); + fSets->addElement(fBA, status); + fSets->addElement(fBB, status); + fSets->addElement(fHY, status); + fSets->addElement(fH2, status); + fSets->addElement(fH3, status); + fSets->addElement(fCL, status); + fSets->addElement(fCP, status); + fSets->addElement(fEX, status); + fSets->addElement(fIN, status); + fSets->addElement(fJL, status); + fSets->addElement(fJT, status); + fSets->addElement(fJV, status); + fSets->addElement(fNS, status); + fSets->addElement(fOP, status); + fSets->addElement(fQU, status); + fSets->addElement(fIS, status); + fSets->addElement(fNU, status); + fSets->addElement(fPO, status); + fSets->addElement(fPR, status); + fSets->addElement(fSY, status); + fSets->addElement(fAI, status); + fSets->addElement(fAL, status); + fSets->addElement(fID, status); + fSets->addElement(fWJ, status); + fSets->addElement(fSA, status); + fSets->addElement(fSG, status); + + const char *rules = + "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?" + "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?" + "\\p{Line_Break=NU}\\p{Line_Break=CM}*" + "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*" + "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?" 
+ "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"; + + fNumberMatcher = new RegexMatcher( + UnicodeString(rules, -1, US_INV), 0, status); + + fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); + + if (U_FAILURE(status)) { + deferredStatus = status; + } +} + + +void RBBILineMonkey::setText(const UnicodeString &s) { fText = &s; + fCharBI->setText(s); + fNumberMatcher->reset(s); +} + +// +// rule9Adjust +// Line Break TR rules 9 and 10 implementation. +// This deals with combining marks and other sequences +// that must be treated as if they were something other than what they actually are. +// +// This is factored out into a separate function because it must be applied twice for +// each potential break, once to the chars before the position being checked, then +// again to the text following the possible break. +// +void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) { + if (pos == -1) { + // Invalid initial position. Happens during the warmup iteration of the + // main loop in next(). + return; + } + + int32_t nPos = *nextPos; + + // LB 9 Keep combining sequences together. + // advance over any CM class chars. Note that Line Break CM is different + // from the normal Grapheme Extend property. + if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d || + *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) { + for (;;) { + *nextChar = fText->char32At(nPos); + if (!fCM->contains(*nextChar)) { + break; + } + nPos = fText->moveIndex32(nPos, 1); + } + } + - delete [] fMungedText; - fMungedText = new UChar32[s.length()]; - fMungedLen = 0; - delete [] fMungedPositions; - fMungedPositions = new int32_t[s.length()]; - delete [] fOrigPositions; - fOrigPositions = new int32_t[s.length()]; - memset(fOrigPositions, -1, s.length()*4); + // LB 9 Treat X CM* as if it were x. + // No explicit action required. 
+ + // LB 10 Treat any remaining combining mark as AL + if (fCM->contains(*posChar)) { + *posChar = 0x41; // thisChar = 'A'; + } + + // Push the updated nextPos and nextChar back to our caller. + // This only makes a difference if posChar got bigger by consuming a + // combining sequence. + *nextPos = nPos; + *nextChar = fText->char32At(nPos); +} + + + +int32_t RBBILineMonkey::next(int32_t startPos) { + UErrorCode status = U_ZERO_ERROR; + int32_t pos; // Index of the char following a potential break position + UChar32 thisChar; // Character at above position "pos" + + int32_t prevPos; // Index of the char preceding a potential break position + UChar32 prevChar; // Character at above position. Note that prevChar + // and thisChar may not be adjacent because combining + // characters between them will be ignored. + + int32_t nextPos; // Index of the next character following pos. + // Usually skips over combining marks. + int32_t nextCPPos; // Index of the code point following "pos." + // May point to a combining mark. + int32_t tPos; // temp value. + UChar32 c; + + if (U_FAILURE(deferredStatus)) { + return -1; + } + + if (startPos >= fText->length()) { + return -1; + } + + + // Initial values for loop. Loop will run the first time without finding breaks, + // while the invalid values shift out and the "this" and + // "prev" positions are filled in with good values. + pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration. + thisChar = prevChar = 0; + nextPos = nextCPPos = startPos; + + + // Loop runs once per position in the test text, until a break position + // is found. + for (;;) { + prevPos = pos; + prevChar = thisChar; + + pos = nextPos; + thisChar = fText->char32At(pos); + + nextCPPos = fText->moveIndex32(pos, 1); + nextPos = nextCPPos; + + // Rule LB2 - Break at end of text. + if (pos >= fText->length()) { + break; + } + + // Rule LB 9 - adjust for combining sequences. 
+ // We do this one out-of-order because the adjustment does not change anything + // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to + // be applied. + rule9Adjust(prevPos, &prevChar, &pos, &thisChar); + nextCPPos = nextPos = fText->moveIndex32(pos, 1); + c = fText->char32At(nextPos); + rule9Adjust(pos, &thisChar, &nextPos, &c); + + // If the loop is still warming up - if we haven't shifted the initial + // -1 positions out of prevPos yet - loop back to advance the + // position in the input without any further looking for breaks. + if (prevPos == -1) { + continue; + } + + // LB 4 Always break after hard line breaks, + if (fBK->contains(prevChar)) { + break; + } + + // LB 5 Break after CR, LF, NL, but not inside CR LF + if (prevChar == 0x0d && thisChar == 0x0a) { + continue; + } + if (prevChar == 0x0d || + prevChar == 0x0a || + prevChar == 0x85) { + break; + } + + // LB 6 Don't break before hard line breaks + if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || + fBK->contains(thisChar)) { + continue; + } + + + // LB 7 Don't break before spaces or zero-width space. + if (fSP->contains(thisChar)) { + continue; + } + + if (fZW->contains(thisChar)) { + continue; + } + + // LB 8 Break after zero width space + if (fZW->contains(prevChar)) { + break; + } + + // LB 9, 10 Already done, at top of loop. + // + + + // LB 11 Do not break before or after WORD JOINER and related characters. + // x WJ + // WJ x + // + if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) { + continue; + } + + // LB 12 + // GL x + if (fGL->contains(prevChar)) { + continue; + } + + // LB 12a + // [^SP BA HY] x GL + if (!(fSP->contains(prevChar) || + fBA->contains(prevChar) || + fHY->contains(prevChar) ) && fGL->contains(thisChar)) { + continue; + } + + + + // LB 13 Don't break before closings. + // NU x CL, NU x CP and NU x IS are not matched here so that they will + // fall into LB 25 and the more general number regular expression. 
+ // + if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) || + (!fNU->contains(prevChar) && fCP->contains(thisChar)) || + fEX->contains(thisChar) || + (!fNU->contains(prevChar) && fIS->contains(thisChar)) || + (!fNU->contains(prevChar) && fSY->contains(thisChar))) { + continue; + } + + // LB 14 Don't break after OP SP* + // Scan backwards, checking for this sequence. + // The OP char could include combining marks, so we actually check for + // OP CM* SP* + // Another Twist: The Rule 67 fixes may have changed a SP CM + // sequence into a ID char, so before scanning back through spaces, + // verify that prevChar is indeed a space. The prevChar variable + // may differ from fText[prevPos] + tPos = prevPos; + if (fSP->contains(prevChar)) { + while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { + tPos=fText->moveIndex32(tPos, -1); + } + } + while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { + tPos=fText->moveIndex32(tPos, -1); + } + if (fOP->contains(fText->char32At(tPos))) { + continue; + } - // Precompute the "Munged Text", which is the test text, - // converted to an array of UChar32 for easier indexing, - // and with all but the first char of each Graphem Cluster removed (rule 3) - // and with format chars removed (rule 4) - fGCFMatcher->reset(s); - fGCMatcher ->reset(s); - int32_t pos=0; - while (fGCFMatcher->find()) { - pos = fGCFMatcher->start(deferredStatus); - UChar32 c = s.char32At(pos); - fMungedPositions[fMungedLen] = pos; - fOrigPositions[pos] = fMungedLen; - fMungedText[fMungedLen++] = c; - } -} + // LB 15 QU SP* x OP + if (fOP->contains(thisChar)) { + // Scan backwards from prevChar to see if it is preceded by QU CM* SP* + int tPos = prevPos; + while (tPos>0 && fSP->contains(fText->char32At(tPos))) { + tPos = fText->moveIndex32(tPos, -1); + } + while (tPos>0 && fCM->contains(fText->char32At(tPos))) { + tPos = fText->moveIndex32(tPos, -1); + } + if (fQU->contains(fText->char32At(tPos))) { + continue; + } + } -int32_t 
RBBIWordMonkey::next(int32_t prevPos) { - UErrorCode status = U_ZERO_ERROR; - if (prevPos >= fText->length()) { - return -1; - } - // If the previous position doesn't map to a position in the munged text, - // it means that the prev position was pointing to a trailing format char - // Advance, looking for additional format chars while doing so. - if (fOrigPositions[prevPos] == -1) { - // Advance by one grapheme cluster (could include combining marks) - fGCMatcher->reset(); - fGCMatcher->find(prevPos, status); - int32_t pos = fGCMatcher->end(status); - if (U_FAILURE(status)) { - pos = -1; + // LB 16 (CL | CP) SP* x NS + // Scan backwards for SP* CM* (CL | CP) + if (fNS->contains(thisChar)) { + int tPos = prevPos; + while (tPos>0 && fSP->contains(fText->char32At(tPos))) { + tPos = fText->moveIndex32(tPos, -1); + } + while (tPos>0 && fCM->contains(fText->char32At(tPos))) { + tPos = fText->moveIndex32(tPos, -1); + } + if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) { + continue; + } } - // TODO: Don't return extend chars here!!! - return pos; - } - // Loop runs once per position in the munged test text, until a break position - // is found. - int32_t mpos = fOrigPositions[prevPos]; - for (; ; mpos++) { - UChar32 letter = fMungedText[mpos]; + // LB 17 B2 SP* x B2 + if (fB2->contains(thisChar)) { + // Scan backwards, checking for the B2 CM* SP* sequence. + tPos = prevPos; + if (fSP->contains(prevChar)) { + while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { + tPos=fText->moveIndex32(tPos, -1); + } + } + while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { + tPos=fText->moveIndex32(tPos, -1); + } + if (fB2->contains(fText->char32At(tPos))) { + continue; + } + } + - // Break at end of text. - if (mpos >= fMungedLen-1) { - mpos = fMungedLen; + // LB 18 break after space + if (fSP->contains(prevChar)) { break; } - // Rule (5). 
ALetter x ALetter - if (fALetterSet->contains(fMungedText[mpos]) && - fALetterSet->contains(fMungedText[mpos+1])) { + // LB 19 + // x QU + // QU x + if (fQU->contains(thisChar) || fQU->contains(prevChar)) { continue; } - // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter - if ((mpos+2) < fMungedLen && - fALetterSet->contains(fMungedText[mpos]) && - (fMidLetterSet->contains(fMungedText[mpos+1]) || - fMidNumLetSet->contains(fMungedText[mpos+1]) ) && - fALetterSet->contains(fMungedText[mpos+2])) - continue; + // LB 20 Break around a CB + if (fCB->contains(thisChar) || fCB->contains(prevChar)) { + break; + } - // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter - if (mpos >= 1 && - fALetterSet->contains(fMungedText[mpos-1]) && - (fMidLetterSet->contains(fMungedText[mpos]) || - fMidNumLetSet->contains(fMungedText[mpos]) ) && - fALetterSet->contains(fMungedText[mpos+1])) + // LB 21 + if (fBA->contains(thisChar) || + fHY->contains(thisChar) || + fNS->contains(thisChar) || + fBB->contains(prevChar) ) { continue; + } - // Rule (8) Numeric x Numeric - if (fNumericSet->contains(fMungedText[mpos]) && - fNumericSet->contains(fMungedText[mpos+1])) { + // LB 22 + if ((fAL->contains(prevChar) && fIN->contains(thisChar)) || + (fID->contains(prevChar) && fIN->contains(thisChar)) || + (fIN->contains(prevChar) && fIN->contains(thisChar)) || + (fNU->contains(prevChar) && fIN->contains(thisChar)) ) { continue; } - // Rule (9) ALetter x Numeric - if (fALetterSet->contains(fMungedText[mpos]) && - fNumericSet->contains(fMungedText[mpos+1])) { + + // LB 23 ID x PO + // AL x NU + // NU x AL + if ((fID->contains(prevChar) && fPO->contains(thisChar)) || + (fAL->contains(prevChar) && fNU->contains(thisChar)) || + (fNU->contains(prevChar) && fAL->contains(thisChar)) ) { continue; } - // Rule (10) Numeric x ALetter - if (fNumericSet->contains(fMungedText[mpos]) && - fALetterSet->contains(fMungedText[mpos+1])) { + // LB 24 Do not break between prefix and letters or ideographs. 
+ // PR x ID + // PR x AL + // PO x AL + if ((fPR->contains(prevChar) && fID->contains(thisChar)) || + (fPR->contains(prevChar) && fAL->contains(thisChar)) || + (fPO->contains(prevChar) && fAL->contains(thisChar)) ) { continue; } - // Rule (11) Numeric (MidNum | MidNumLet) x Numeric - if (mpos >= 1 && - fNumericSet->contains(fMungedText[mpos-1]) && - (fMidNumSet->contains(fMungedText[mpos]) || - fMidNumLetSet->contains(fMungedText[mpos]) ) && - fNumericSet->contains(fMungedText[mpos+1])) - continue; - // Rule (12) Numeric x (MidNum | MidNumLet) Numeric - if ((mpos+2) < fMungedLen && - fNumericSet->contains(fMungedText[mpos]) && - (fMidNumSet->contains(fMungedText[mpos+1]) || - fMidNumLetSet->contains(fMungedText[mpos+1]) ) && - fNumericSet->contains(fMungedText[mpos+2])) - continue; - // Rule (13) Katakana x Katakana - if (fKatakanaSet->contains(fMungedText[mpos]) && - fKatakanaSet->contains(fMungedText[mpos+1])) { - continue; + // LB 25 Numbers + if (fNumberMatcher->lookingAt(prevPos, status)) { + if (U_FAILURE(status)) { + break; + } + // Matched a number. But could have been just a single digit, which would + // not represent a "no break here" between prevChar and thisChar + int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num + if (numEndIdx > pos) { + // Number match includes at least our two chars being checked + if (numEndIdx > nextPos) { + // Number match includes additional chars. Update pos and nextPos + // so that next loop iteration will continue at the end of the number, + // checking for breaks between last char in number & whatever follows. + pos = nextPos = numEndIdx; + do { + pos = fText->moveIndex32(pos, -1); + thisChar = fText->char32At(pos); + } while (fCM->contains(thisChar)); + } + continue; + } } - // Rule 14. Break found here. - mpos++; - break; - } - // We have a break position in terms of an index in the munged data. - // Get the corresponding index in the original test text. 
- int32_t breakPos; - if (mpos == fMungedLen) { - breakPos = fText->length(); - } else { - breakPos = fMungedPositions[mpos]; - } + // LB 26 Do not break a Korean syllable. + if (fJL->contains(prevChar) && (fJL->contains(thisChar) || + fJV->contains(thisChar) || + fH2->contains(thisChar) || + fH3->contains(thisChar))) { + continue; + } - // Rule 4 fixup, back up before any trailing - // format characters at the end of the word. - int32_t t = breakPos; - for (;;) { - t = fText->moveIndex32(t, -1); - if (t <= prevPos) { - break; + if ((fJV->contains(prevChar) || fH2->contains(prevChar)) && + (fJV->contains(thisChar) || fJT->contains(thisChar))) { + continue; + } + + if ((fJT->contains(prevChar) || fH3->contains(prevChar)) && + fJT->contains(thisChar)) { + continue; + } + + // LB 27 Treat a Korean Syllable Block the same as ID. + if ((fJL->contains(prevChar) || fJV->contains(prevChar) || + fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && + fIN->contains(thisChar)) { + continue; + } + if ((fJL->contains(prevChar) || fJV->contains(prevChar) || + fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && + fPO->contains(thisChar)) { + continue; + } + if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) || + fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) { + continue; + } + + + + // LB 28 Do not break between alphabetics ("at"). + if (fAL->contains(prevChar) && fAL->contains(thisChar)) { + continue; + } + + // LB 29 Do not break between numeric punctuation and alphabetics ("e.g."). + if (fIS->contains(prevChar) && fAL->contains(thisChar)) { + continue; } - UChar32 prevC = fText->char32At(t); - if (fExtendSet->contains(prevC)) { + + // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. 
+ // (AL | NU) x OP + // CP x (AL | NU) + if ((fAL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) { continue; } - if (fFormatSet->contains(prevC) == FALSE) { - break; + if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fNU->contains(thisChar))) { + continue; } - breakPos = t; + + // LB 31 Break everywhere else + break; + } - return breakPos; + return pos; } -UVector *RBBIWordMonkey::charClasses() { +UVector *RBBILineMonkey::charClasses() { return fSets; } -RBBIWordMonkey::~RBBIWordMonkey() { +RBBILineMonkey::~RBBILineMonkey() { delete fSets; - delete fKatakanaSet; - delete fALetterSet; - delete fMidLetterSet; - delete fMidNumLetSet; - delete fMidNumSet; - delete fNumericSet; - delete fFormatSet; - delete fExtendSet; - delete fOtherSet; - delete [] fMungedText; - delete [] fMungedPositions; - delete [] fOrigPositions; - - delete fGCFMatcher; - delete fGCMatcher; + delete fBK; + delete fCR; + delete fLF; + delete fCM; + delete fNL; + delete fWJ; + delete fZW; + delete fGL; + delete fCB; + delete fSP; + delete fB2; + delete fBA; + delete fBB; + delete fHY; + delete fH2; + delete fH3; + delete fCL; + delete fCP; + delete fEX; + delete fIN; + delete fJL; + delete fJV; + delete fJT; + delete fNS; + delete fOP; + delete fQU; + delete fIS; + delete fNU; + delete fPO; + delete fPR; + delete fSY; + delete fAI; + delete fAL; + delete fID; + delete fSA; + delete fSG; + delete fXX; + + delete fCharBI; + delete fNumberMatcher; } @@ -2392,7 +3906,7 @@ RBBIWordMonkey::~RBBIWordMonkey() { // -1: run forever. // 0 or greater: run length. // -// type = char | work | line | sent | title +// type = char | word | line | sent | title // //------------------------------------------------------------------------------------------- @@ -2405,7 +3919,9 @@ static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t d // The param exists. Convert the string to an int. 
char valString[100]; int32_t paramLength = m.end(1, status) - m.start(1, status); - if (paramLength >= sizeof(valString)-1) {paramLength = sizeof(valString)-2;}; + if (paramLength >= (int32_t)(sizeof(valString)-1)) { + paramLength = (int32_t)(sizeof(valString)-2); + } params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); val = strtol(valString, NULL, 10); @@ -2418,6 +3934,393 @@ static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t d } #endif +static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, + BreakIterator *bi, + int expected[], + int expectedcount) +{ + int count = 0; + int i = 0; + int forward[50]; + bi->setText(ustr); + for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { + forward[count] = i; + if (count < expectedcount && expected[count] != i) { + test->errln("break forward test failed: expected %d but got %d", + expected[count], i); + break; + } + count ++; + } + if (count != expectedcount) { + printStringBreaks(ustr, expected, expectedcount); + test->errln("break forward test failed: missed %d match", + expectedcount - count); + return; + } + // testing boundaries + for (i = 1; i < expectedcount; i ++) { + int j = expected[i - 1]; + if (!bi->isBoundary(j)) { + printStringBreaks(ustr, expected, expectedcount); + test->errln("isBoundary() failed. Expected boundary at position %d", j); + return; + } + for (j = expected[i - 1] + 1; j < expected[i]; j ++) { + if (bi->isBoundary(j)) { + printStringBreaks(ustr, expected, expectedcount); + test->errln("isBoundary() failed. 
Not expecting boundary at position %d", j); + return; + } + } + } + + for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { + count --; + if (forward[count] != i) { + test->errln("happy break test previous() failed: expected %d but got %d", + forward[count], i); + break; + } + } + if (count != 0) { + printStringBreaks(ustr, expected, expectedcount); + test->errln("break test previous() failed: missed a match"); + return; + } + + // testing preceding + for (i = 0; i < expectedcount - 1; i ++) { + // int j = expected[i] + 1; + int j = ustr.moveIndex32(expected[i], 1); + for (; j <= expected[i + 1]; j ++) { + if (bi->preceding(j) != expected[i]) { + printStringBreaks(ustr, expected, expectedcount); + test->errln("preceding(): Not expecting boundary at position %d", j); + return; + } + } + } +} + +void RBBITest::TestWordBreaks(void) +{ +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + + Locale locale("en"); + UErrorCode status = U_ZERO_ERROR; + // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); + BreakIterator *bi = BreakIterator::createWordInstance(locale, status); + static const char *strlist[] = + { + "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", + "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b", + "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", + "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", + "\\u90ca\\u3588\\u009c\\u0953\\u194b", + "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", + "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", + "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e", + "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", + "\\u003b\\u024a\\u102e\\U000e0071\\u0600", + "\\u2027\\U000e0067\\u0a47\\u00b7", + "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", + "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", + 
"\\u0589\\U000e006e\\u0a42\\U000104a5", + "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", + "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", + "\\u0027\\u11af\\U000e0057\\u0602", + "\\U0001d7f2\\U000e007\\u0004\\u0589", + "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", + "\\U0001d7f2\\U000e007d\\u0004\\u0589", + "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", + "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", + "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", + "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", + "\\u0233\\U000e0020\\u0a69\\u0d6a", + "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", + "\\u58f4\\U000e0049\\u20e7\\u2027", + "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", + "\\ua183\\u102d\\u0bec\\u003a", + "\\u17e8\\u06e7\\u002e\\u096d\\u003b", + "\\u003a\\u0e57\\u0fad\\u002e", + "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", + "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", + "\\U000e005d\\u2044\\u0731\\u0650\\u0061", + "\\u003a\\u0664\\u00b7\\u1fba", + "\\u003b\\u0027\\u00b7\\u47a3", + "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b", + "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", + "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", + }; + int loop; + if (U_FAILURE(status)) { + errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); + return; + } + for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { + // printf("looping %d\n", loop); + UnicodeString ustr = CharsToUnicodeString(strlist[loop]); + // RBBICharMonkey monkey; + RBBIWordMonkey monkey; + + int expected[50]; + int expectedcount = 0; + + monkey.setText(ustr); + int i; + for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { + expected[expectedcount ++] = i; + } + + testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); + } + delete bi; +#endif +} + +void 
RBBITest::TestWordBoundary(void) +{ + // <>\u1d4a\u206e\u0603\U0001d7ff<>\u2019<> + Locale locale("en"); + UErrorCode status = U_ZERO_ERROR; + // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); + BreakIterator *bi = BreakIterator::createWordInstance(locale, status); + UChar str[50]; + static const char *strlist[] = + { + "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", + "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", + "\\u003b\\u024a\\u102e\\U000e0071\\u0600", + "\\u2027\\U000e0067\\u0a47\\u00b7", + "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", + "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", + "\\u0589\\U000e006e\\u0a42\\U000104a5", + "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", + "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", + "\\u0027\\u11af\\U000e0057\\u0602", + "\\U0001d7f2\\U000e007\\u0004\\u0589", + "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", + "\\U0001d7f2\\U000e007d\\u0004\\u0589", + "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", + "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", + "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", + "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", + "\\u0233\\U000e0020\\u0a69\\u0d6a", + "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", + "\\u58f4\\U000e0049\\u20e7\\u2027", + "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", + "\\ua183\\u102d\\u0bec\\u003a", + "\\u17e8\\u06e7\\u002e\\u096d\\u003b", + "\\u003a\\u0e57\\u0fad\\u002e", + "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", + "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", + "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", + "\\u003a\\u0664\\u00b7\\u1fba", + "\\u003b\\u0027\\u00b7\\u47a3", + }; + int loop; + if (U_FAILURE(status)) { + errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); + return; + } + for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { + 
// printf("looping %d\n", loop); + u_unescape(strlist[loop], str, 20); + UnicodeString ustr(str); + int forward[50]; + int count = 0; + + bi->setText(ustr); + int prev = 0; + int i; + for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { + forward[count ++] = i; + if (i > prev) { + int j; + for (j = prev + 1; j < i; j ++) { + if (bi->isBoundary(j)) { + printStringBreaks(ustr, forward, count); + errln("happy boundary test failed: expected %d not a boundary", + j); + return; + } + } + } + if (!bi->isBoundary(i)) { + printStringBreaks(ustr, forward, count); + errln("happy boundary test failed: expected %d a boundary", + i); + return; + } + prev = i; + } + } + delete bi; +} + +void RBBITest::TestLineBreaks(void) +{ +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + Locale locale("en"); + UErrorCode status = U_ZERO_ERROR; + BreakIterator *bi = BreakIterator::createLineInstance(locale, status); + const int32_t STRSIZE = 50; + UChar str[STRSIZE]; + static const char *strlist[] = + { + "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", + "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" + "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", + "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" + "u2014\\U000e0105\\u118c\\u000a\\u07f8", + "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", + "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", + "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", + "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", + "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", + "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", + "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", + 
"\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", + "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", + "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", + "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", + "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", + "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", + "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", + "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", + "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", + "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", + "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", + "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", + "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", + "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", + "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc", + "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", + "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", + "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", + "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", + "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", + "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", + "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", + 
"\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", + "\\u2014\\u0020\\u000a\\u17c5\\u24fc", + "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", + "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", + "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", + "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", + "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", + "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", + "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d" + "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5" + "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b", + "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", + "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", + }; + int loop; + TEST_ASSERT_SUCCESS(status); + if (U_FAILURE(status)) { + return; + } + for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { + // printf("looping %d\n", loop); + int32_t t = u_unescape(strlist[loop], str, STRSIZE); + if (t >= STRSIZE) { + TEST_ASSERT(FALSE); + continue; + } + + + UnicodeString ustr(str); + RBBILineMonkey monkey; + if (U_FAILURE(monkey.deferredStatus)) { + continue; + } + + const int EXPECTEDSIZE = 50; + int expected[EXPECTEDSIZE]; + int expectedcount = 0; + + monkey.setText(ustr); + int i; + for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { + if (expectedcount >= EXPECTEDSIZE) { + TEST_ASSERT(expectedcount < EXPECTEDSIZE); + return; + } + expected[expectedcount ++] = i; + } + + testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); + } + delete bi; +#endif +} + +void RBBITest::TestSentBreaks(void) +{ +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + Locale locale("en"); + UErrorCode status = U_ZERO_ERROR; + BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); + UChar str[200]; + static const char *strlist[] = + { + "Now\ris\nthe\r\ntime\n\rfor\r\r", + 
"This\n", + "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", + "\"Sentence ending with a quote.\" Bye.", + " (This is it). Testing the sentence iterator. \"This isn't it.\"", + "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", + "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", + "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", + "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", + "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!", + "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" + "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" + "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" + "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", + "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" + "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" + "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" + "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" + "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" + "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" + }; + int loop; + if (U_FAILURE(status)) { + errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); + return; + } + for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { + u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0]))); + UnicodeString ustr(str); + + RBBISentMonkey monkey; + if (U_FAILURE(monkey.deferredStatus)) { + continue; + } + + const int EXPECTEDSIZE = 50; + int expected[EXPECTEDSIZE]; + int expectedcount = 0; + + monkey.setText(ustr); + int i; + for (i = 0; i != BreakIterator::DONE; i = 
monkey.next(i)) { + if (expectedcount >= EXPECTEDSIZE) { + TEST_ASSERT(expectedcount < EXPECTEDSIZE); + return; + } + expected[expectedcount ++] = i; + } + + testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); + } + delete bi; +#endif +} + void RBBITest::TestMonkey(char *params) { #if !UCONFIG_NO_REGULAR_EXPRESSIONS @@ -2426,6 +4329,7 @@ void RBBITest::TestMonkey(char *params) { int32_t seed = 1; UnicodeString breakType = "all"; Locale locale("en"); + UBool useUText = FALSE; if (quick == FALSE) { loopCount = 10000; @@ -2443,8 +4347,16 @@ void RBBITest::TestMonkey(char *params) { p = m.replaceFirst("", status); } - m.reset(p); - if (RegexMatcher("\\S", p, 0, status).find()) { + RegexMatcher u(" *utext", p, 0, status); + if (u.find()) { + useUText = TRUE; + u.reset(); + p = u.replaceFirst("", status); + } + + + // m.reset(p); + if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { // Each option is stripped out of the option string as it is processed. // All options have been checked. The option string should have been completely emptied.. char buf[100]; @@ -2459,35 +4371,93 @@ void RBBITest::TestMonkey(char *params) { if (breakType == "char" || breakType == "all") { RBBICharMonkey m; BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); - RunMonkey(bi, m, seed, loopCount); + if (U_SUCCESS(status)) { + RunMonkey(bi, m, "char", seed, loopCount, useUText); + if (breakType == "all" && useUText==FALSE) { + // Also run a quick test with UText when "all" is specified + RunMonkey(bi, m, "char", seed, loopCount, TRUE); + } + } + else { + errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); + } delete bi; } if (breakType == "word" || breakType == "all") { + logln("Word Break Monkey Test"); RBBIWordMonkey m; BreakIterator *bi = BreakIterator::createWordInstance(locale, status); - if (params == NULL) { - // TODO: Resolve rule ambiguities, unpin loop count. 
- loopCount = 2; + if (U_SUCCESS(status)) { + RunMonkey(bi, m, "word", seed, loopCount, useUText); + } + else { + errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status)); + } + delete bi; + } + + if (breakType == "line" || breakType == "all") { + logln("Line Break Monkey Test"); + RBBILineMonkey m; + BreakIterator *bi = BreakIterator::createLineInstance(locale, status); + if (loopCount >= 10) { + loopCount = loopCount / 5; // Line break runs slower than the others. + } + if (U_SUCCESS(status)) { + RunMonkey(bi, m, "line", seed, loopCount, useUText); + } + else { + errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); + } + delete bi; + } + + if (breakType == "sent" || breakType == "all" ) { + logln("Sentence Break Monkey Test"); + RBBISentMonkey m; + BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); + if (loopCount >= 10) { + loopCount = loopCount / 10; // Sentence runs slower than the other break types + } + if (U_SUCCESS(status)) { + RunMonkey(bi, m, "sentence", seed, loopCount, useUText); + } + else { + errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); } - RunMonkey(bi, m, seed, loopCount); delete bi; } #endif } +// +// Run a RBBI monkey test. Common routine, for all break iterator types. +// Parameters: +// bi - the break iterator to use +// mk - MonkeyKind, abstraction for obtaining expected results +// name - Name of test (char, word, etc.) 
for use in error messages +// seed - Seed for starting random number generator (parameter from user) +// numIterations +// +void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, + int32_t numIterations, UBool useUText) { -void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, uint32_t seed, int32_t numIterations) { #if !UCONFIG_NO_REGULAR_EXPRESSIONS const int32_t TESTSTRINGLEN = 500; UnicodeString testText; int32_t numCharClasses; UVector *chClasses; + int expected[TESTSTRINGLEN*2 + 1]; + int expectedCount = 0; char expectedBreaks[TESTSTRINGLEN*2 + 1]; char forwardBreaks[TESTSTRINGLEN*2 + 1]; char reverseBreaks[TESTSTRINGLEN*2+1]; + char isBoundaryBreaks[TESTSTRINGLEN*2+1]; + char followingBreaks[TESTSTRINGLEN*2+1]; + char precedingBreaks[TESTSTRINGLEN*2+1]; int i; int loopCount = 0; @@ -2513,7 +4483,12 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, uint32_t seed, } } - while (loopCount <= numIterations || numIterations == -1) { + while (loopCount < numIterations || numIterations == -1) { + if (numIterations == -1 && loopCount % 10 == 0) { + // If test is running in an infinite loop, display a periodic tic so + // we can tell that it is making progress. + fprintf(stderr, "."); + } // Save current random number seed, so that we can recreate the random numbers // for this loop iteration in event of an error. seed = m_seed; @@ -2525,7 +4500,10 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, uint32_t seed, UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); int32_t charIdx = m_rand() % classSet->size(); UChar32 c = classSet->charAt(charIdx); - assert(c >= 0); // TODO: deal with sets containing strings. + if (c < 0) { // TODO: deal with sets containing strings. 
+ errln("c < 0"); + break; + } testText.append(c); } @@ -2534,21 +4512,38 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, uint32_t seed, memset(expectedBreaks, 0, sizeof(expectedBreaks)); expectedBreaks[0] = 1; int32_t breakPos = 0; + expectedCount = 0; for (;;) { breakPos = mk.next(breakPos); if (breakPos == -1) { break; } - assert(breakPos <= testText.length()); + if (breakPos > testText.length()) { + errln("breakPos > testText.length()"); + } expectedBreaks[breakPos] = 1; + U_ASSERT(expectedCountsetText(testText); + if (useUText) { + UErrorCode status = U_ZERO_ERROR; + UText *testUText = utext_openReplaceable(NULL, &testText, &status); + // testUText = utext_openUnicodeString(testUText, &testText, &status); + bi->setText(testUText, status); + TEST_ASSERT_SUCCESS(status); + utext_close(testUText); // The break iterator does a shallow clone of the UText + // This UText can be closed immediately, so long as the + // testText string continues to exist. + } else { + bi->setText(testText); + } + for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { if (i < 0 || i > testText.length()) { - errln("Out of range value returned by breakIterator::next()"); + errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); break; } forwardBreaks[i] = 1; @@ -2558,49 +4553,133 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, uint32_t seed, memset(reverseBreaks, 0, sizeof(reverseBreaks)); for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { if (i < 0 || i > testText.length()) { - errln("Out of range value returned by breakIterator::next()"); + errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); break; } reverseBreaks[i] = 1; } + // Find the break positions using isBoundary() tests. 
+ memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); + U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); + for (i=0; i<=testText.length(); i++) { + isBoundaryBreaks[i] = bi->isBoundary(i); + } + + + // Find the break positions using the following() function. + // printf("."); + memset(followingBreaks, 0, sizeof(followingBreaks)); + int32_t lastBreakPos = 0; + followingBreaks[0] = 1; + for (i=0; ifollowing(i); + if (breakPos <= i || + breakPos < lastBreakPos || + breakPos > testText.length() || + (breakPos > lastBreakPos && lastBreakPos > i)) { + errln("%s break monkey test: " + "Out of range value returned by BreakIterator::following().\n" + "Random seed=%d index=%d; following returned %d; lastbreak=%d", + name, seed, i, breakPos, lastBreakPos); + break; + } + followingBreaks[breakPos] = 1; + lastBreakPos = breakPos; + } + + // Find the break positions using the preceding() function. + memset(precedingBreaks, 0, sizeof(precedingBreaks)); + lastBreakPos = testText.length(); + precedingBreaks[testText.length()] = 1; + for (i=testText.length(); i>0; i--) { + breakPos = bi->preceding(i); + if (breakPos >= i || + breakPos > lastBreakPos || + (breakPos < 0 && testText.getChar32Start(i)>0) || + (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) { + errln("%s break monkey test: " + "Out of range value returned by BreakIterator::preceding().\n" + "index=%d; prev returned %d; lastBreak=%d" , + name, i, breakPos, lastBreakPos); + if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { + precedingBreaks[i] = 2; // Forces an error. + } + } else { + if (breakPos >= 0) { + precedingBreaks[breakPos] = 1; + } + lastBreakPos = breakPos; + } + } + // Compare the expected and actual results. 
for (i=0; i<=testText.length(); i++) { - UBool forwardError = forwardBreaks[i] != expectedBreaks[i]; - UBool anyError = forwardError || reverseBreaks[i] != expectedBreaks[i]; - if (anyError) { + const char *errorType = NULL; + if (forwardBreaks[i] != expectedBreaks[i]) { + errorType = "next()"; + } else if (reverseBreaks[i] != forwardBreaks[i]) { + errorType = "previous()"; + } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { + errorType = "isBoundary()"; + } else if (followingBreaks[i] != expectedBreaks[i]) { + errorType = "following()"; + } else if (precedingBreaks[i] != expectedBreaks[i]) { + errorType = "preceding()"; + } + + + if (errorType != NULL) { // Format a range of the test text that includes the failure as // a data item that can be included in the rbbi test data file. // Start of the range is the last point where expected and actual results // both agreed that there was a break position. int startContext = i; + int32_t count = 0; for (;;) { if (startContext==0) { break; } - startContext--; - if (expectedBreaks[startContext] != 0) {break;} + startContext --; + if (expectedBreaks[startContext] != 0) { + if (count == 2) break; + count ++; + } } // End of range is two expected breaks past the start position. - int endContext = i+1; + int endContext = i + 1; int ci; for (ci=0; ci<2; ci++) { // Number of items to include in error text. for (;;) { if (endContext >= testText.length()) {break;} - if (expectedBreaks[endContext-1] != 0) { break;} - endContext++; + if (expectedBreaks[endContext-1] != 0) { + if (count == 0) break; + count --; + } + endContext ++; } } - // Format looks like "<>\uabcd\uabcd<>\U0001abcd..." + // Format looks like "\\\uabcd\uabcd\\\U0001abcd..." UnicodeString errorText = ""; + /***if (strcmp(errorType, "next()") == 0) { + startContext = 0; + endContext = testText.length(); + + printStringBreaks(testText, expected, expectedCount); + }***/ + for (ci=startContext; ci"); + if (ci == i) { + // This is the location of the error. 
+ errorText.append(""); + } else if (expectedBreaks[ci] != 0) { + // This a non-error expected break position. + errorText.append("\\"); } if (c < 0x10000) { errorText.append("\\u"); @@ -2615,19 +4694,17 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, uint32_t seed, } ci = testText.moveIndex32(ci, 1); } - if (expectedBreaks[ci] != 0) { - errorText.append("<>"); - } + errorText.append("\\"); errorText.append("\n"); // Output the error - char charErrorTxt[100]; + char charErrorTxt[500]; UErrorCode status = U_ZERO_ERROR; errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); charErrorTxt[sizeof(charErrorTxt)-1] = 0; - errln("ERROR. %s. Direction = %s; Random seed = %d; buf Idx = %d\n%s", - (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), - (forwardError?"forward":"reverse"), seed, i, charErrorTxt); + errln("%s break monkey test error. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s", + name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), + errorType, seed, i, charErrorTxt); break; } } @@ -2638,4 +4715,80 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, uint32_t seed, } +// Bug 5532. UTF-8 based UText fails in dictionary code. +// This test checks the initial patch, +// which is to just keep it from crashing. Correct word boundaries +// await a proper fix to the dictionary code. +// +void RBBITest::TestBug5532(void) { + // Text includes a mixture of Thai and Latin. 
+ const unsigned char utf8Data[] = { + 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u, + 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, + 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u, + 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u, + 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u, + 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, + 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, + 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, + 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, + 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, + 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00}; + + UErrorCode status = U_ZERO_ERROR; + UText utext=UTEXT_INITIALIZER; + utext_openUTF8(&utext, (const char *)utf8Data, -1, &status); + TEST_ASSERT_SUCCESS(status); + + BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status); + TEST_ASSERT_SUCCESS(status); + if (U_SUCCESS(status)) { + bi->setText(&utext, status); + TEST_ASSERT_SUCCESS(status); + + int32_t breakCount = 0; + int32_t previousBreak = -1; + for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) { + // For now, just make sure that the break iterator doesn't hang. + TEST_ASSERT(previousBreak < bi->current()); + previousBreak = bi->current(); + } + TEST_ASSERT(breakCount > 0); + } + delete bi; + utext_close(&utext); +} + + +// +// TestDebug - A place-holder test for debugging purposes. +// For putting in fragments of other tests that can be invoked +// for tracing without a lot of unwanted extra stuff happening. 
+// +void RBBITest::TestDebug(void) { +#if 0 + UErrorCode status = U_ZERO_ERROR; + int pos = 0; + int ruleStatus = 0; + + RuleBasedBreakIterator* bi = + // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); + // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status); + (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status); + UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e"); + // UnicodeString s("Aaa. Bcd"); + s = s.unescape(); + bi->setText(s); + UBool r = bi->isBoundary(8); + printf("%s", r?"true":"false"); + return; + pos = bi->last(); + do { + // ruleStatus = bi->getRuleStatus(); + printf("%d\t%d\n", pos, ruleStatus); + pos = bi->previous(); + } while (pos != BreakIterator::DONE); +#endif +} + #endif /* #if !UCONFIG_NO_BREAK_ITERATION */