#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <utility>
+#include <vector>
#include "unicode/brkiter.h"
#include "unicode/localpointer.h"
#include "cstr.h"
#include "intltest.h"
#include "rbbitst.h"
+#include "rbbidata.h"
#include "utypeinfo.h" // for 'typeid' to work
#include "uvector.h"
#include "uvectr32.h"
+
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
#include "unicode/filteredbrk.h"
#endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
-
//---------------------------------------------
// runIndexedTest
//---------------------------------------------
#if !UCONFIG_NO_FILE_IO
TESTCASE_AUTO(TestBug4153072);
#endif
- TESTCASE_AUTO(TestStatusReturn);
#if !UCONFIG_NO_FILE_IO
TESTCASE_AUTO(TestUnicodeFiles);
- TESTCASE_AUTO(TestEmptyString);
#endif
TESTCASE_AUTO(TestGetAvailableLocales);
TESTCASE_AUTO(TestGetDisplayName);
TESTCASE_AUTO(TestBug12918);
TESTCASE_AUTO(TestBug12932);
TESTCASE_AUTO(TestEmoji);
+ TESTCASE_AUTO(TestBug12519);
+ TESTCASE_AUTO(TestBug12677);
+ TESTCASE_AUTO(TestTableRedundancies);
+ TESTCASE_AUTO(TestBug13447);
+ TESTCASE_AUTO(TestReverse);
+ TESTCASE_AUTO(TestBug13692);
TESTCASE_AUTO_END;
}
-//---------------------------------------------------------------------------
-//
-// class BITestData Holds a set of Break iterator test data and results
-// Includes
-// - the string data to be broken
-// - a vector of the expected break positions.
-// - a vector of source line numbers for the data,
-// (to help see where errors occured.)
-// - The expected break tag values.
-// - Vectors of actual break positions and tag values.
-// - Functions for comparing actual with expected and
-// reporting errors.
-//
-//----------------------------------------------------------------------------
-class BITestData {
-public:
- UnicodeString fDataToBreak;
- UVector fExpectedBreakPositions;
- UVector fExpectedTags;
- UVector fLineNum;
- UVector fActualBreakPositions; // Test Results.
- UVector fActualTags;
-
- BITestData(UErrorCode &status);
- void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
- void checkResults(const char *heading, RBBITest *test);
- void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
- void clearResults();
-};
-
-//
-// Constructor.
-//
-BITestData::BITestData(UErrorCode &status)
-: fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
- fActualTags(status)
-{
-}
-
-//
-// addDataChunk. Add a section (non-breaking) piece if data to the test data.
-// The macro form collects the line number, which is helpful
-// when tracking down failures.
-//
-// A null data item is inserted at the start of each test's data
-// to put the starting zero into the data list. The position saved for
-// each non-null item is its ending position.
-//
-#define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
-void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
- if (U_FAILURE(status)) {return;}
- if (data != NULL) {
- fDataToBreak.append(CharsToUnicodeString(data));
- }
- fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
- fExpectedTags.addElement(tag, status);
- fLineNum.addElement(lineNum, status);
-}
-
-
-//
-// checkResults. Compare the actual and expected break positions, report any differences.
-//
-void BITestData::checkResults(const char *heading, RBBITest *test) {
- int32_t expectedIndex = 0;
- int32_t actualIndex = 0;
-
- for (;;) {
- // If we've run through both the expected and actual results vectors, we're done.
- // break out of the loop.
- if (expectedIndex >= fExpectedBreakPositions.size() &&
- actualIndex >= fActualBreakPositions.size()) {
- break;
- }
-
-
- if (expectedIndex >= fExpectedBreakPositions.size()) {
- err(heading, test, expectedIndex-1, actualIndex);
- actualIndex++;
- continue;
- }
-
- if (actualIndex >= fActualBreakPositions.size()) {
- err(heading, test, expectedIndex, actualIndex-1);
- expectedIndex++;
- continue;
- }
-
- if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
- err(heading, test, expectedIndex, actualIndex);
- // Try to resync the positions of the indices, to avoid a rash of spurious erros.
- if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
- actualIndex++;
- } else {
- expectedIndex++;
- }
- continue;
- }
-
- if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
- test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
- heading, fLineNum.elementAt(expectedIndex),
- fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
- }
-
- actualIndex++;
- expectedIndex++;
- }
-}
-
-//
-// err - An error was found. Report it, along with information about where the
-// incorrectly broken test data appeared in the source file.
-//
-void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
-{
- int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
- int32_t actual = fActualBreakPositions.elementAti(actualIdx);
- int32_t o = 0;
- int32_t line = fLineNum.elementAti(expectedIdx);
- if (expectedIdx > 0) {
- // The line numbers are off by one because a premature break occurs somewhere
- // within the previous item, rather than at the start of the current (expected) item.
- // We want to report the offset of the unexpected break from the start of
- // this previous item.
- o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
- }
- if (actual < expected) {
- test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);
- } else {
- test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);
- }
-}
-
-
-void BITestData::clearResults() {
- fActualBreakPositions.removeAllElements();
- fActualTags.removeAllElements();
-}
-
-
//--------------------------------------------------------------------------------------
//
// RBBITest constructor and destructor
RBBITest::~RBBITest() {
}
-//-----------------------------------------------------------------------------------
-//
-// Test for status {tag} return value from break rules.
-// TODO: a more thorough test.
-//
-//-----------------------------------------------------------------------------------
-void RBBITest::TestStatusReturn() {
- UnicodeString rulesString1("$Letters = [:L:];\n"
- "$Numbers = [:N:];\n"
- "$Letters+{1};\n"
- "$Numbers+{2};\n"
- "Help\\ /me\\!{4};\n"
- "[^$Letters $Numbers];\n"
- "!.*;\n", -1, US_INV);
- UnicodeString testString1 = "abc123..abc Help me Help me!";
- // 01234567890123456789012345678
- int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
- int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
-
- UErrorCode status=U_ZERO_ERROR;
- UParseError parseError;
-
- LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
- if(U_FAILURE(status)) {
- dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__, u_errorName(status));
- return;
- }
- int32_t pos;
- int32_t i = 0;
- bi->setText(testString1);
- for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
- if (pos != bounds1[i]) {
- errln("%s:%d expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
- break;
- }
-
- int tag = bi->getRuleStatus();
- if (tag != brkStatus[i]) {
- errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
- break;
- }
- i++;
- }
-}
-
static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
UErrorCode status = U_ZERO_ERROR;
printf("code alpha extend alphanum type word sent line name\n");
int nextExpectedIndex = 0;
utext_setNativeIndex(tstr, 0);
- for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
+ for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
printf("------------------------------------------------ %d\n", j);
++nextExpectedIndex;
delete bi;
}
-//----------------------------------------------------------------------------
-//
-// generalIteratorTest Given a break iterator and a set of test data,
-// Run the tests and report the results.
-//
-//----------------------------------------------------------------------------
-void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
-{
-
- bi.setText(td.fDataToBreak);
-
- testFirstAndNext(bi, td);
-
- testLastAndPrevious(bi, td);
-
- testFollowing(bi, td);
- testPreceding(bi, td);
- testIsBoundary(bi, td);
- doMultipleSelectionTest(bi, td);
-}
-
-
-//
-// testFirstAndNext. Run the iterator forwards in the obvious first(), next()
-// kind of loop.
-//
-void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
-{
- UErrorCode status = U_ZERO_ERROR;
- int32_t p;
- int32_t lastP = -1;
- int32_t tag;
-
- logln("Test first and next");
- bi.setText(td.fDataToBreak);
- td.clearResults();
-
- for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
- td.fActualBreakPositions.addElement(p, status); // Save result.
- tag = bi.getRuleStatus();
- td.fActualTags.addElement(tag, status);
- if (p <= lastP) {
- // If the iterator is not making forward progress, stop.
- // No need to raise an error here, it'll be detected in the normal check of results.
- break;
- }
- lastP = p;
- }
- td.checkResults("testFirstAndNext", this);
-}
-
-
-//
-// TestLastAndPrevious. Run the iterator backwards, starting with last().
-//
-void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
-{
- UErrorCode status = U_ZERO_ERROR;
- int32_t p;
- int32_t lastP = 0x7ffffffe;
- int32_t tag;
-
- logln("Test last and previous");
- bi.setText(td.fDataToBreak);
- td.clearResults();
-
- for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
- // Save break position. Insert it at start of vector of results, shoving
- // already-saved results further towards the end.
- td.fActualBreakPositions.insertElementAt(p, 0, status);
- // bi.previous(); // TODO: Why does this fix things up????
- // bi.next();
- tag = bi.getRuleStatus();
- td.fActualTags.insertElementAt(tag, 0, status);
- if (p >= lastP) {
- // If the iterator is not making progress, stop.
- // No need to raise an error here, it'll be detected in the normal check of results.
- break;
- }
- lastP = p;
- }
- td.checkResults("testLastAndPrevious", this);
-}
-
-
-void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
-{
- UErrorCode status = U_ZERO_ERROR;
- int32_t p;
- int32_t tag;
- int32_t lastP = -2; // A value that will never be returned as a break position.
- // cannot be -1; that is returned for DONE.
- int i;
-
- logln("testFollowing():");
- bi.setText(td.fDataToBreak);
- td.clearResults();
-
- // Save the starting point, since we won't get that out of following.
- p = bi.first();
- td.fActualBreakPositions.addElement(p, status); // Save result.
- tag = bi.getRuleStatus();
- td.fActualTags.addElement(tag, status);
-
- for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
- p = bi.following(i);
- if (p != lastP) {
- if (p == RuleBasedBreakIterator::DONE) {
- break;
- }
- // We've reached a new break position. Save it.
- td.fActualBreakPositions.addElement(p, status); // Save result.
- tag = bi.getRuleStatus();
- td.fActualTags.addElement(tag, status);
- lastP = p;
- }
- }
- // The loop normally exits by means of the break in the middle.
- // Make sure that the index was at the correct position for the break iterator to have
- // returned DONE.
- if (i != td.fDataToBreak.length()) {
- errln("testFollowing(): iterator returned DONE prematurely.");
- }
-
- // Full check of all results.
- td.checkResults("testFollowing", this);
-}
-
-
-
-void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
- UErrorCode status = U_ZERO_ERROR;
- int32_t p;
- int32_t tag;
- int32_t lastP = 0x7ffffffe;
- int i;
-
- logln("testPreceding():");
- bi.setText(td.fDataToBreak);
- td.clearResults();
-
- p = bi.last();
- td.fActualBreakPositions.addElement(p, status);
- tag = bi.getRuleStatus();
- td.fActualTags.addElement(tag, status);
-
- for (i = td.fDataToBreak.length(); i>=-1; i--) {
- p = bi.preceding(i);
- if (p != lastP) {
- if (p == RuleBasedBreakIterator::DONE) {
- break;
- }
- // We've reached a new break position. Save it.
- td.fActualBreakPositions.insertElementAt(p, 0, status);
- lastP = p;
- tag = bi.getRuleStatus();
- td.fActualTags.insertElementAt(tag, 0, status);
- }
- }
- // The loop normally exits by means of the break in the middle.
- // Make sure that the index was at the correct position for the break iterator to have
- // returned DONE.
- if (i != 0) {
- errln("testPreceding(): iterator returned DONE prematurely.");
- }
-
- // Full check of all results.
- td.checkResults("testPreceding", this);
-}
-
-
-
-void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
- UErrorCode status = U_ZERO_ERROR;
- int i;
- int32_t tag;
-
- logln("testIsBoundary():");
- bi.setText(td.fDataToBreak);
- td.clearResults();
-
- for (i = 0; i <= td.fDataToBreak.length(); i++) {
- if (bi.isBoundary(i)) {
- td.fActualBreakPositions.addElement(i, status); // Save result.
- tag = bi.getRuleStatus();
- td.fActualTags.addElement(tag, status);
- }
- }
- td.checkResults("testIsBoundary: ", this);
-}
-
-
-
-void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
-{
- iterator.setText(td.fDataToBreak);
-
- RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
- int32_t offset = iterator.first();
- int32_t testOffset;
- int32_t count = 0;
-
- logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
-
- if (*testIterator != iterator)
- errln("clone() or operator!= failed: two clones compared unequal");
-
- do {
- testOffset = testIterator->first();
- testOffset = testIterator->next(count);
- if (offset != testOffset)
- errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
-
- if (offset != RuleBasedBreakIterator::DONE) {
- count++;
- offset = iterator.next();
-
- if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
- errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
- if (count > 10000 || offset == -1) {
- errln("operator== failed too many times. Stopping test.");
- if (offset == -1) {
- errln("Does (RuleBasedBreakIterator::DONE == -1)?");
- }
- return;
- }
- }
- }
- } while (offset != RuleBasedBreakIterator::DONE);
-
- // now do it backwards...
- offset = iterator.last();
- count = 0;
-
- do {
- testOffset = testIterator->last();
- testOffset = testIterator->next(count); // next() with a negative arg is same as previous
- if (offset != testOffset)
- errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
-
- if (offset != RuleBasedBreakIterator::DONE) {
- count--;
- offset = iterator.previous();
- }
- } while (offset != RuleBasedBreakIterator::DONE);
-
- delete testIterator;
-}
-
//---------------------------------------------
//
// other tests
//
//---------------------------------------------
-void RBBITest::TestEmptyString()
-{
- UnicodeString text = "";
- UErrorCode status = U_ZERO_ERROR;
-
- BITestData x(status);
- ADD_DATACHUNK(x, "", 0, status); // Break at start of data
- RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
- if (U_FAILURE(status))
- {
- errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
- return;
- }
- generalIteratorTest(*bi, x);
- delete bi;
-}
void RBBITest::TestGetAvailableLocales()
{
//
// Run the iterator backwards, verify that the same breaks are found.
//
- prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.
- for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
+ prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
+ bp = t->bi->last();
+ while (bp != BreakIterator::DONE) {
if (prevBP == bp) {
// Fail for lack of progress.
errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
}
prevBP = bp;
+ bp = t->bi->previous();
}
// Verify that there were no missed breaks prior to the last one found
}
// Check following()
- for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
+ for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
int32_t actualBreak = t->bi->following(i);
int32_t expectedBreak = BreakIterator::DONE;
- for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
+ for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
if (t->getExpectedBreak(j) != 0) {
expectedBreak = j;
break;
}
// Check preceding()
- for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
+ for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
int32_t actualBreak = t->bi->preceding(i);
int32_t expectedBreak = BreakIterator::DONE;
// Therefore, start looking at the expected break data not at i-1, but at
// the start of code point index - 1.
utext_setNativeIndex(t->textToBreak, i);
- int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
+ int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
for (; j >= 0; j--) {
if (t->getExpectedBreak(j) != 0) {
expectedBreak = j;
UErrorCode status = U_ZERO_ERROR;
Locale locale("");
- UnicodeString rules;
TestParams tp(status);
- RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
+ RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
if (U_FAILURE(status)) {
dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
}
-
//
// Open and read the test data file.
//
const char *testDataDirectory = IntlTest::getSourceTestData(status);
- char testFileName[1000];
- if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
- errln("Can't open test data. Path too long.");
- return;
- }
- strcpy(testFileName, testDataDirectory);
- strcat(testFileName, "rbbitst.txt");
+ CharString testFileName(testDataDirectory, -1, status);
+ testFileName.append("rbbitst.txt", -1, status);
int len;
- UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
+ UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
if (U_FAILURE(status)) {
- return; /* something went wrong, error already output */
+ errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
+ return;
}
-
bool skipTest = false; // Skip this test?
//
PARSE_COMMENT,
PARSE_TAG,
PARSE_DATA,
- PARSE_NUM
+ PARSE_NUM,
+ PARSE_RULES
}
parseState = PARSE_TAG;
int32_t column = 0;
int32_t charIdx = 0;
- int32_t tagValue = 0; // The numeric value of a <nnn> tag.
+ int32_t tagValue = 0; // The numeric value of a <nnn> tag.
+
+ UnicodeString rules; // Holds rules from a <rules> ... </rules> block
+ int32_t rulesFirstLine; // Line number of the start of current <rules> block
for (charIdx = 0; charIdx < len; ) {
status = U_ZERO_ERROR;
if (u_isUWhiteSpace(c)) {
break;
}
- if (testString.compare(charIdx-1, 6, "<word>") == 0) {
+ if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createWordInstance(locale, status);
skipTest = false;
charIdx += 5;
break;
}
- if (testString.compare(charIdx-1, 6, "<char>") == 0) {
+ if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createCharacterInstance(locale, status);
skipTest = false;
charIdx += 5;
break;
}
- if (testString.compare(charIdx-1, 6, "<line>") == 0) {
+ if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createLineInstance(locale, status);
skipTest = false;
charIdx += 5;
break;
}
- if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
+ if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createSentenceInstance(locale, status);
skipTest = false;
charIdx += 5;
break;
}
- if (testString.compare(charIdx-1, 7, "<title>") == 0) {
+ if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createTitleInstance(locale, status);
charIdx += 6;
break;
}
+ if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
+ testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
+ charIdx = testString.indexOf(u'>', charIdx) + 1;
+ parseState = PARSE_RULES;
+ rules.remove();
+ rulesFirstLine = lineNum;
+ break;
+ }
+
// <locale loc_name>
localeMatcher.reset(testString);
if (localeMatcher.lookingAt(charIdx-1, status)) {
TEST_ASSERT_SUCCESS(status);
break;
}
- if (testString.compare(charIdx-1, 6, "<data>") == 0) {
+ if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
parseState = PARSE_DATA;
charIdx += 5;
tp.dataToBreak = "";
}
break;
+ case PARSE_RULES:
+ if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
+ charIdx += 7;
+ parseState = PARSE_TAG;
+ delete tp.bi;
+ UParseError pe;
+ tp.bi = new RuleBasedBreakIterator(rules, pe, status);
+ skipTest = U_FAILURE(status);
+ if (U_FAILURE(status)) {
+ errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
+ rulesFirstLine + pe.line - 1, u_errorName(status));
+ }
+ } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
+ charIdx += 10;
+ parseState = PARSE_TAG;
+ UErrorCode ec = U_ZERO_ERROR;
+ UParseError pe;
+ RuleBasedBreakIterator bi(rules, pe, ec);
+ if (U_SUCCESS(ec)) {
+ errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
+ rulesFirstLine + pe.line - 1);
+ }
+ } else {
+ rules.append(c);
+ }
+ break;
+
case PARSE_DATA:
if (c == u'\u2022') { // u'•'
int32_t breakIdx = tp.dataToBreak.length();
break;
}
- if (testString.compare(charIdx-1, 7, "</data>") == 0) {
+ if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
// Add final entry to mappings from break location to source file position.
// Need one extra because last break position returned is after the
// last char in the data, not at the last char.
break;
}
- if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
+ if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
// Named character, e.g. \N{COMBINING GRAVE ACCENT}
// Get the code point from the name and insert it into the test data.
// (Damn, no API takes names in Unicode !!!
-
- if (testString.compare(charIdx-1, 2, "<>") == 0) {
+ if (testString.compare(charIdx-1, 2, u"<>") == 0) {
charIdx++;
int32_t breakIdx = tp.dataToBreak.length();
tp.expectedBreaks->setSize(breakIdx+1);
}
+ // Reached end of test file. Raise an error if parseState indicates that we are
+ // within a block that should have been terminated.
+
+ if (parseState == PARSE_RULES) {
+ errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
+ lineNum, rulesFirstLine);
+ }
+ if (parseState == PARSE_DATA) {
+ errln("rbbitst.txt:%d <data> block not closed.", lineNum);
+ }
+
+
end_test:
delete [] testFile;
#endif
fileSize = ftell(f);
fileBuf = new char[fileSize];
fseek(f, 0, SEEK_SET);
- amt_read = fread(fileBuf, 1, fileSize, f);
+ amt_read = static_cast<int>(fread(fileBuf, 1, fileSize, f));
if (amt_read != fileSize || fileSize <= 0) {
errln("Error reading test data file.");
goto cleanUpAndReturn;
// Check for test cases from the Unicode test data files that are known to fail
-// and should be skipped because ICU is not yet able to fully implement the spec.
-// See ticket #7270.
+// and should be skipped as known issues because ICU does not fully implement
+// the Unicode specifications, or because ICU includes tailorings that differ from
+// the Unicode standard.
+//
+// Test cases are identified by the test data sequence, which tends to be more stable
+// across Unicode versions than the test file line numbers.
+// A case matches only when both fileName and testCase equal a table entry; the function then returns logKnownIssue(<that entry's ticket>), otherwise FALSE.
+// The test case with ticket "10666" is a dummy, included as an example of the table entry format.
UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
static struct TestCase {
+ const char *fTicketNum; // Ticket number handed to logKnownIssue() when this entry matches.
const char *fFileName;
const UChar *fString;
- } badTestCases[] = { // Line Numbers from Unicode 7.0.0 file.
- {"LineBreakTest.txt", u"\u200B\u0020}"}, // Line 5198
- {"LineBreakTest.txt", u"\u200B\u0020)"}, // Line 5202
- {"LineBreakTest.txt", u"\u200B\u0020!"}, // Line 5214
- {"LineBreakTest.txt", u"\u200B\u0020,"}, // Line 5246
- {"LineBreakTest.txt", u"\u200B\u0020/"}, // Line 5298
- {"LineBreakTest.txt", u"\u200B\u0020\u2060"}, // Line 5302
- // Line Numbers from pre-release verion of GraphemeBreakTest-10.0.0.txt
- {"GraphemeBreakTest.txt", u"\u200D\u2640"}, // Line 656, old GB 11 test ZWJ x GAZ
- {"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
- {"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
-
- // Line Numbers from pre-release verion of WordBreakTest-10.0.0.txt
- {"WordBreakTest.txt", u"\u200D\u261D"}, // Line 1356, ZWJ x EmojiNRK
- {"WordBreakTest.txt", u"\u200D\U0001F3FB"}, // Line 1358, ZWJ x EmojiNRK
+ } badTestCases[] = { // Each entry: {ticket number, test data file name, test data string}.
+ {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
+ // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
+ // This probably ultimately wants to be resolved by updating UAX-14, but in the meantime
+ // ICU is out of sync with Unicode.
+ {"8151", "LineBreakTest.txt", u"-#"},
+ {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0023"}, // "-", U+0308, "#"
+ {"8151", "LineBreakTest.txt", u"\u002d\u00a7"},
+ {"8151", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
+ {"8151", "LineBreakTest.txt", u"\u002d\U00050005"},
+ {"8151", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
+ {"8151", "LineBreakTest.txt", u"\u002d\u0e01"},
+ {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
+
+ // Issue ICU-12017 Improve line break around numbers
+ {"12017", "LineBreakTest.txt", u"\u002C\u0030"}, // ",0"
+ {"12017", "LineBreakTest.txt", u"\u002C\u0308\u0030"}, // ",", U+0308, "0"
+ {"12017", "LineBreakTest.txt", u"find .com"},
+ {"12017", "LineBreakTest.txt", u"equals .35 cents"},
+ {"12017", "LineBreakTest.txt", u"a.2 "},
+ {"12017", "LineBreakTest.txt", u"a.2 \u0915"},
+ {"12017", "LineBreakTest.txt", u"a.2 \u672C"},
+ {"12017", "LineBreakTest.txt", u"a.2\u3000\u672C"},
+ {"12017", "LineBreakTest.txt", u"a.2\u3000\u307E"},
+ {"12017", "LineBreakTest.txt", u"a.2\u3000\u0033"},
+ {"12017", "LineBreakTest.txt", u"A.1 \uBABB"},
+ {"12017", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
+ {"12017", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
+ {"12017", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
+ {"12017", "LineBreakTest.txt", u"a.2\u3000\u300C"},
};
for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
const TestCase &badCase = badTestCases[n];
if (!strcmp(fileName, badCase.fFileName) &&
testCase == UnicodeString(badCase.fString)) {
- return logKnownIssue("7270");
+ return logKnownIssue(badCase.fTicketNum); // Log under the matching entry's own ticket rather than one fixed ticket.
}
}
return FALSE;
}
-//
-// Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
-//
-static const char16_t *gExtended_Pict = u"["
- "\\U0001F774-\\U0001F77F\\U00002700-\\U00002701\\U00002703-\\U00002704\\U0000270E\\U00002710-\\U00002711\\U00002765-\\U00002767"
- "\\U0001F030-\\U0001F093\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
- "\\U0001F260-\\U0001F265\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F25F"
- "\\U0001F266-\\U0001F2FF\\U0001F7D5-\\U0001F7FF\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F"
- "\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6"
- "\\U0001F4FE\\U0001F53E-\\U0001F548\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586"
- "\\U0001F588-\\U0001F589\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7"
- "\\U0001F5A9-\\U0001F5B0\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB"
- "\\U0001F5DF-\\U0001F5E0\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
- "\\U00002605\\U00002607-\\U0000260D\\U0000260F-\\U00002610\\U00002612\\U00002616-\\U00002617\\U00002619-\\U0000261C"
- "\\U0000261E-\\U0000261F\\U00002621\\U00002624-\\U00002625\\U00002627-\\U00002629\\U0000262B-\\U0000262D\\U00002630-\\U00002637"
- "\\U0000263B-\\U00002647\\U00002654-\\U0000265F\\U00002661-\\U00002662\\U00002664\\U00002667\\U00002669-\\U0000267A"
- "\\U0000267C-\\U0000267E\\U00002680-\\U00002691\\U00002695\\U00002698\\U0000269A\\U0000269D-\\U0000269F\\U000026A2-\\U000026A9"
- "\\U000026AC-\\U000026AF\\U000026B2-\\U000026BC\\U000026BF-\\U000026C3\\U000026C6-\\U000026C7\\U000026C9-\\U000026CD"
- "\\U000026D0\\U000026D2\\U000026D5-\\U000026E8\\U000026EB-\\U000026EF\\U000026F6\\U000026FB-\\U000026FC\\U000026FE-\\U000026FF"
- "\\U00002388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5"
- "\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F"
- "\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF\\U0001F900-\\U0001F90B\\U0001F91F\\U0001F928-\\U0001F92F"
- "\\U0001F931-\\U0001F932\\U0001F94C\\U0001F95F-\\U0001F96B\\U0001F992-\\U0001F997\\U0001F9D0-\\U0001F9E6\\U0001F90C-\\U0001F90F"
- "\\U0001F93F\\U0001F94D-\\U0001F94F\\U0001F96C-\\U0001F97F\\U0001F998-\\U0001F9BF\\U0001F9C1-\\U0001F9CF\\U0001F9E7-\\U0001F9FF"
- "\\U0001F6C6-\\U0001F6CA\\U0001F6D3-\\U0001F6D4\\U0001F6E6-\\U0001F6E8\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6F7-\\U0001F6F8"
- "\\U0001F6D5-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F9-\\U0001F6FF"
- "]";
-
//------------------------------------------------------------------------------------------
//
// class RBBICharMonkey Character (Grapheme Cluster) specific implementation
UnicodeSet *fLVSet;
UnicodeSet *fLVTSet;
UnicodeSet *fHangulSet;
- UnicodeSet *fEmojiBaseSet;
- UnicodeSet *fEmojiModifierSet;
UnicodeSet *fExtendedPictSet;
- UnicodeSet *fEBGSet;
- UnicodeSet *fEmojiNRKSet;
UnicodeSet *fAnySet;
const UnicodeString *fText;
fHangulSet->addAll(*fLVSet);
fHangulSet->addAll(*fLVTSet);
- fEmojiBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
- fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
- fExtendedPictSet = new UnicodeSet(gExtended_Pict, status);
- fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
- fEmojiNRKSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
- "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
+ fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
fAnySet = new UnicodeSet(0, 0x10ffff);
fSets = new UVector(status);
fSets->addElement(fSpacingSet, status);
fSets->addElement(fHangulSet, status);
fSets->addElement(fAnySet, status);
- fSets->addElement(fEmojiBaseSet, status);
- fSets->addElement(fEmojiModifierSet, status);
fSets->addElement(fZWJSet, status);
fSets->addElement(fExtendedPictSet, status);
- fSets->addElement(fEBGSet, status);
- fSets->addElement(fEmojiNRKSet,status);
if (U_FAILURE(status)) {
deferredStatus = status;
}
continue;
}
- // Rule (GB10) (Emoji_Base | EBG) Extend * x Emoji_Modifier
- if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
- continue;
- }
- if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) &&
- fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
- continue;
- }
-
- // Rule (GB11) (Glue_After_ZWJ | Emoji) Extend * ZWJ x (Glue_After_ZWJ | Emoji)
- if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
- (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
- continue;
- }
- if ((fExtendedPictSet->contains(cBase) || fEmojiNRKSet->contains(cBase)) && fExtendSet->contains(c0) && fZWJSet->contains(c1) &&
- (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
+ // Rule (GB11) Extended_Pictographic Extend * ZWJ x Extended_Pictographic
+ if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
continue;
}
// a break if there are three or more contiguous RIs. If there are
// only two, a break following will occur via other rules, and will include
// any trailing extend characters, which is needed behavior.
- if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
+ if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
&& fRegionalIndicatorSet->contains(c2)) {
break;
}
delete fLVTSet;
delete fHangulSet;
delete fAnySet;
- delete fEmojiBaseSet;
- delete fEmojiModifierSet;
delete fZWJSet;
delete fExtendedPictSet;
- delete fEBGSet;
- delete fEmojiNRKSet;
}
//------------------------------------------------------------------------------------------
UnicodeSet *fOtherSet;
UnicodeSet *fExtendSet;
UnicodeSet *fExtendNumLetSet;
+ UnicodeSet *fWSegSpaceSet;
UnicodeSet *fDictionarySet;
- UnicodeSet *fEBaseSet;
- UnicodeSet *fEBGSet;
- UnicodeSet *fEModifierSet;
UnicodeSet *fZWJSet;
UnicodeSet *fExtendedPictSet;
- UnicodeSet *fEmojiNRKSet;
const UnicodeString *fText;
};
fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]", status);
fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
- fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
+ fNumericSet = new UnicodeSet(u"[[\\p{Word_Break = Numeric}][\\uff10-\\uff19]]", status);
fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}]", status);
+ fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
- fEBaseSet = new UnicodeSet(u"[[\\p{Word_Break = EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]", status);
- fEBGSet = new UnicodeSet(u"[\\p{Word_Break = EBG}]", status);
- fEModifierSet = new UnicodeSet(u"[\\p{Word_Break = EM}]", status);
fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
- fExtendedPictSet = new UnicodeSet(gExtended_Pict, status);
- fEmojiNRKSet = new UnicodeSet(
- u"[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]", status);
+ fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
fDictionarySet->addAll(*fKatakanaSet);
fOtherSet->removeAll(*fMidNumSet);
fOtherSet->removeAll(*fNumericSet);
fOtherSet->removeAll(*fExtendNumLetSet);
+ fOtherSet->removeAll(*fWSegSpaceSet);
fOtherSet->removeAll(*fFormatSet);
fOtherSet->removeAll(*fExtendSet);
fOtherSet->removeAll(*fRegionalIndicatorSet);
- fOtherSet->removeAll(*fEBaseSet);
- fOtherSet->removeAll(*fEBGSet);
- fOtherSet->removeAll(*fEModifierSet);
fOtherSet->removeAll(*fZWJSet);
fOtherSet->removeAll(*fExtendedPictSet);
- fOtherSet->removeAll(*fEmojiNRKSet);
// Inhibit dictionary characters from being tested at all.
fOtherSet->removeAll(*fDictionarySet);
fSets->addElement(fExtendSet, status);
fSets->addElement(fOtherSet, status);
fSets->addElement(fExtendNumLetSet, status);
+ fSets->addElement(fWSegSpaceSet, status);
- fSets->addElement(fEBaseSet, status);
- fSets->addElement(fEBGSet, status);
- fSets->addElement(fEModifierSet, status);
fSets->addElement(fZWJSet, status);
fSets->addElement(fExtendedPictSet, status);
- fSets->addElement(fEmojiNRKSet, status);
if (U_FAILURE(status)) {
deferredStatus = status;
break;
};
- // Rule (3c) ZWJ x (Glue_after_ZWJ | EmojiNRK).
+ // Rule (3c) ZWJ x Extended_Pictographic
// Not ignoring extend chars, so peek into input text to
// get the potential ZWJ, the character immediately preceding c2.
// Sloppy UChar32 indexing: p2-1 may reference trail half
// but char32At will get the full code point.
- if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
+ if (fZWJSet->contains(fText->char32At(p2-1)) && fExtendedPictSet->contains(c2)) {
+ continue;
+ }
+
+ // Rule (3d) Keep horizontal whitespace together.
+ if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
continue;
}
continue;
}
- // WB 14 (E_Base | EBG) x E_Modifier
- if ((fEBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) {
- continue;
- }
-
// Rule 15 - 17 Group pairs of Regional Indicators.
if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
break;
delete fFormatSet;
delete fExtendSet;
delete fExtendNumLetSet;
+ delete fWSegSpaceSet;
delete fRegionalIndicatorSet;
delete fDictionarySet;
delete fOtherSet;
- delete fEBaseSet;
- delete fEBGSet;
- delete fEModifierSet;
delete fZWJSet;
delete fExtendedPictSet;
- delete fEmojiNRKSet;
}
UnicodeSet *fB2;
UnicodeSet *fBA;
UnicodeSet *fBB;
+ UnicodeSet *fHH;
UnicodeSet *fHY;
UnicodeSet *fH2;
UnicodeSet *fH3;
UnicodeSet *fXX;
UnicodeSet *fEB;
UnicodeSet *fEM;
- UnicodeSet *fZJ;
- UnicodeSet *fExtendedPict;
- UnicodeSet *fEmojiNRK;
+ UnicodeSet *fZWJ;
BreakIterator *fCharBI;
const UnicodeString *fText;
fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
+ fHH = new UnicodeSet();
fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
- fZJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
- fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
- fExtendedPict = new UnicodeSet(gExtended_Pict, status);
+ fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
if (U_FAILURE(status)) {
deferredStatus = status;
fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
- fCM->addAll(*fZJ); // ZWJ behaves as a CM.
+ fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
+
+ fHH->add(u'\u2010'); // Hyphen, '‐' (U+2010)
fSets->addElement(fBK, status);
fSets->addElement(fCR, status);
fSets->addElement(fSG, status);
fSets->addElement(fEB, status);
fSets->addElement(fEM, status);
- fSets->addElement(fZJ, status);
- fSets->addElement(fExtendedPict, status);
- fSets->addElement(fEmojiNRK, status);
+ fSets->addElement(fZWJ, status);
const char *rules =
"((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
"((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
+ "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
"\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
"((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
"((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
}
// LB 8 Break after zero width space
- if (fZW->contains(prevChar)) {
+ // ZW SP* ÷
+ // Scan backwards from prevChar for SP* ZW
+ tPos = prevPos;
+ while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
+ tPos = fText->moveIndex32(tPos, -1);
+ }
+ if (fZW->contains(fText->char32At(tPos))) {
break;
}
- // LB 8a ZWJ x (ID | ExtendedPict | Emoji)
+ // LB 25 Numbers
+ // Move this test up, before LB8a, because numbers can match a longer sequence that would
+ // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
+ if (fNumberMatcher->lookingAt(prevPos, status)) {
+ if (U_FAILURE(status)) {
+ break;
+ }
+ // Matched a number. But could have been just a single digit, which would
+ // not represent a "no break here" between prevChar and thisChar
+ int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
+ if (numEndIdx > pos) {
+ // Number match includes at least our two chars being checked
+ if (numEndIdx > nextPos) {
+ // Number match includes additional chars. Update pos and nextPos
+ // so that next loop iteration will continue at the end of the number,
+ // checking for breaks between last char in number & whatever follows.
+ pos = nextPos = numEndIdx;
+ do {
+ pos = fText->moveIndex32(pos, -1);
+ thisChar = fText->char32At(pos);
+ } while (fCM->contains(thisChar));
+ }
+ continue;
+ }
+ }
+
+ // LB 8a ZWJ x
// The monkey test's way of ignoring combining characters doesn't work
// for this rule. ZJ is also a CM. Need to get the actual character
// preceding "thisChar", not ignoring combining marks, possibly ZJ.
{
int32_t prevIdx = fText->moveIndex32(pos, -1);
UChar32 prevC = fText->char32At(prevIdx);
- if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
+ if (fZWJ->contains(prevC)) {
continue;
}
}
continue;
}
-
-
// LB 13 Don't break before closings.
- // NU x CL, NU x CP and NU x IS are not matched here so that they will
- // fall into LB 17 and the more general number regular expression.
//
- if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
- (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
- fEX->contains(thisChar) ||
- (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
- (!fNU->contains(prevChar) && fSY->contains(thisChar))) {
+ if (fCL->contains(thisChar) ||
+ fCP->contains(thisChar) ||
+ fEX->contains(thisChar) ||
+ fSY->contains(thisChar)) {
continue;
}
// Scan backwards, checking for this sequence.
// The OP char could include combining marks, so we actually check for
// OP CM* SP*
- // Another Twist: The Rule 67 fixes may have changed a SP CM
+ // Another Twist: The Rule 9 fixes may have changed a SP CM
// sequence into a ID char, so before scanning back through spaces,
// verify that prevChar is indeed a space. The prevChar variable
// may differ from fText[prevPos]
}
+ // LB 14a Break before an IS that begins a number and follows a space
+ if (nextPos < fText->length()) {
+ // note: UnicodeString::char32At(length) returns ffff, not distinguishable
+ // from a legit ffff character. So test length separately.
+ UChar32 nextChar = fText->char32At(nextPos);
+ if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
+ break;
+ }
+ }
+
+ // LB14b Do not break before numeric separators, even after spaces.
+ if (fIS->contains(thisChar)) {
+ continue;
+ }
+
// LB 15 QU SP* x OP
if (fOP->contains(thisChar)) {
// Scan backwards from prevChar to see if it is preceded by QU CM* SP*
break;
}
+ // LB 20.09 Don't break between Hyphens and letters if a break precedes the hyphen.
+ // Formerly this was a Finnish tailoring.
+ // Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
+ // ^($HY | $HH) $AL;
+ if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
+ prevPosX2 == -1) {
+ continue;
+ }
+
// LB 21
if (fBA->contains(thisChar) ||
fHY->contains(thisChar) ||
// LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
// PR x (ID | EB | EM)
// (ID | EB | EM) x PO
- if (fPR->contains(prevChar) &&
+ if (fPR->contains(prevChar) &&
(fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
continue;
}
- if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
+ if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
fPO->contains(thisChar)) {
continue;
}
continue;
}
-
-
- // LB 25 Numbers
- if (fNumberMatcher->lookingAt(prevPos, status)) {
- if (U_FAILURE(status)) {
- break;
- }
- // Matched a number. But could have been just a single digit, which would
- // not represent a "no break here" between prevChar and thisChar
- int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
- if (numEndIdx > pos) {
- // Number match includes at least our two chars being checked
- if (numEndIdx > nextPos) {
- // Number match includes additional chars. Update pos and nextPos
- // so that next loop iteration will continue at the end of the number,
- // checking for breaks between last char in number & whatever follows.
- pos = nextPos = numEndIdx;
- do {
- pos = fText->moveIndex32(pos, -1);
- thisChar = fText->char32At(pos);
- } while (fCM->contains(thisChar));
- }
- continue;
- }
- }
-
+ // LB 25 (numbers) is now checked earlier, before LB 8a.
// LB 26 Do not break a Korean syllable.
if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
continue;
}
- // LB30a RI RI <break> RI
- // RI x RI
+ // LB30a RI RI ÷ RI
+ // RI x RI
if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
break;
}
if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
+ // Two Regional Indicators have been paired.
+ // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
+ // following RI. This is a hack.
+ thisChar = -1;
continue;
}
delete fB2;
delete fBA;
delete fBB;
+ delete fHH;
delete fHY;
delete fH2;
delete fH3;
delete fXX;
delete fEB;
delete fEM;
- delete fZJ;
- delete fExtendedPict;
- delete fEmojiNRK;
+ delete fZWJ;
delete fCharBI;
delete fNumberMatcher;
for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
forward[count] = i;
if (count < expectedcount && expected[count] != i) {
- test->errln("break forward test failed: expected %d but got %d",
- expected[count], i);
+ test->errln("%s:%d break forward test failed: expected %d but got %d",
+ __FILE__, __LINE__, expected[count], i);
break;
}
count ++;
}
if (count != expectedcount) {
printStringBreaks(ustr, expected, expectedcount);
- test->errln("break forward test failed: missed %d match",
- expectedcount - count);
+ test->errln("%s:%d break forward test failed: missed %d match",
+ __FILE__, __LINE__, expectedcount - count);
return;
}
// testing boundaries
int j = expected[i - 1];
if (!bi->isBoundary(j)) {
printStringBreaks(ustr, expected, expectedcount);
- test->errln("isBoundary() failed. Expected boundary at position %d", j);
+ test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
+ __FILE__, __LINE__, j);
return;
}
for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
if (bi->isBoundary(j)) {
printStringBreaks(ustr, expected, expectedcount);
- test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
+ test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
+ __FILE__, __LINE__, j);
return;
}
}
count --;
if (forward[count] != i) {
printStringBreaks(ustr, expected, expectedcount);
- test->errln("happy break test previous() failed: expected %d but got %d",
- forward[count], i);
+ test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
+ __FILE__, __LINE__, forward[count], i);
break;
}
}
// int j = expected[i] + 1;
int j = ustr.moveIndex32(expected[i], 1);
for (; j <= expected[i + 1]; j ++) {
- if (bi->preceding(j) != expected[i]) {
+ int32_t expectedPreceding = expected[i];
+ int32_t actualPreceding = bi->preceding(j);
+ if (actualPreceding != expectedPreceding) {
printStringBreaks(ustr, expected, expectedcount);
- test->errln("preceding(): Not expecting boundary at position %d", j);
+ test->errln("%s:%d preceding(%d): expected %d, got %d",
+ __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
return;
}
}
Locale locale("en");
UErrorCode status = U_ZERO_ERROR;
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
- BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
+ LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
+ if (U_FAILURE(status)) {
+ errcheckln(status, "%s:%d Creation of break iterator failed %s",
+ __FILE__, __LINE__, u_errorName(status));
+ return;
+ }
UChar str[50];
static const char *strlist[] =
{
"\\u003b\\u0027\\u00b7\\u47a3",
};
int loop;
- if (U_FAILURE(status)) {
- errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
- return;
- }
for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
- // printf("looping %d\n", loop);
- u_unescape(strlist[loop], str, 20);
+ u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
UnicodeString ustr(str);
int forward[50];
int count = 0;
bi->setText(ustr);
- int prev = 0;
- int i;
- for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
- forward[count ++] = i;
- if (i > prev) {
- int j;
- for (j = prev + 1; j < i; j ++) {
- if (bi->isBoundary(j)) {
- printStringBreaks(ustr, forward, count);
- errln("happy boundary test failed: expected %d not a boundary",
- j);
- return;
- }
+ int prev = -1;
+ for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
+ ++count;
+ if (count >= UPRV_LENGTHOF(forward)) {
+ errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
+ __FILE__, __LINE__, loop, count, boundary);
+ return;
+ }
+ forward[count] = boundary;
+ if (boundary <= prev) {
+ errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
+ __FILE__, __LINE__, loop, prev, boundary);
+ break;
+ }
+ for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
+ if (bi->isBoundary(nonBoundary)) {
+ printStringBreaks(ustr, forward, count);
+ errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
+ __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
+ return;
}
}
- if (!bi->isBoundary(i)) {
+ if (!bi->isBoundary(boundary)) {
printStringBreaks(ustr, forward, count);
- errln("happy boundary test failed: expected %d a boundary",
- i);
+ errln("%s:%d happy boundary test failed: expected %d a boundary",
+ __FILE__, __LINE__, boundary);
return;
}
- prev = i;
+ prev = boundary;
}
}
- delete bi;
}
void RBBITest::TestLineBreaks(void)
// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
// remain undevided by ICU char, word and line break.
void RBBITest::TestEmoji() {
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
UErrorCode status = U_ZERO_ERROR;
CharString testFileName;
}
}
}
+#endif
+}
+
+
+// TestBug12519 - Locale handling under clone and assignment: clones must report the same valid locale, equality ignores locale, and an assigned-to iterator must compare equal to its source.
+
+void RBBITest::TestBug12519() {
+ UErrorCode status = U_ZERO_ERROR;
+ LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
+ LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
+ if (!assertSuccess(WHERE, status)) { // the same status was passed to both create calls above
+ dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
+ return;
+ }
+ assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
+
+ assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status)); // requested with getFrance(), valid locale is compared against getFrench()
+ assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
+
+ LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone());
+ assertTrue(WHERE, *biEn == *cloneEn);
+ assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status)); // the clone reports the same valid locale as its source
+
+ LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone());
+ assertTrue(WHERE, *biFr == *cloneFr);
+ assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
+
+ LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status)); // NOTE(review): status is not re-checked after this call, yet biDe is dereferenced below
+ UnicodeString text("Hallo Welt");
+ biDe->setText(text);
+ assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe); // biDe is a line iterator with text set; biFr is a word iterator
+ *biDe = *biFr;
+ assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
+}
+
+void RBBITest::TestBug12677() {
+ // Check that stripping of comments from rules for getRules() is not confused by
+ // the presence of '#' characters in the rules that do not introduce comments.
+ UnicodeString rules(u"!!forward; \n"
+ "$x = [ab#]; # a set with a # literal. \n"
+ " # .; # a comment that looks sort of like a rule. \n"
+ " '#' '?'; # a rule with a quoted # \n"
+ );
+
+ UErrorCode status = U_ZERO_ERROR;
+ UParseError pe;
+ RuleBasedBreakIterator bi(rules, pe, status); // build an iterator directly from the rule source above
+ assertSuccess(WHERE, status);
+ UnicodeString rtRules = bi.getRules(); // the rules as reported back by the iterator
+ assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules); // expected: '#' comments gone, the '#' in the set and the quoted '#' kept
+}
+
+
+void RBBITest::TestTableRedundancies() { // Checks the English line-break iterator's forward state table for duplicate columns and duplicate rows.
+ UErrorCode status = U_ZERO_ERROR;
+
+ LocalPointer<RuleBasedBreakIterator> bi (
+ (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
+ assertSuccess(WHERE, status);
+ if (U_FAILURE(status)) return;
+
+ RBBIDataWrapper *dw = bi->fData; // reads the iterator's data wrapper directly
+ const RBBIStateTable *fwtbl = dw->fForwardTable;
+ int32_t numCharClasses = dw->fHeader->fCatCount;
+ // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
+
+ // Check for duplicate columns (character categories)
+
+ std::vector<UnicodeString> columns;
+ for (int32_t column = 0; column < numCharClasses; column++) {
+ UnicodeString s; // one string per column, built from that column's next-state values
+ for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) { // note: starts at row 1, so row 0 does not contribute to the column strings
+ RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r)); // row r is located at fTableData + fRowLen * r
+ s.append(row->fNextState[column]);
+ }
+ columns.push_back(s);
+ }
+ // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
+ for (int c1=1; c1<numCharClasses; c1++) {
+ for (int c2 = c1+1; c2 < numCharClasses; c2++) {
+ if (columns.at(c1) == columns.at(c2)) {
+ errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
+ goto out; // report only the first duplicate pair, then continue with the row check
+ }
+ }
+ }
+ out:
+
+ // Check for duplicate states
+ std::vector<UnicodeString> rows;
+ for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) { // unlike the column pass, this includes row 0
+ UnicodeString s;
+ RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
+ assertTrue(WHERE, row->fAccepting >= -1);
+ s.append(row->fAccepting + 1); // values of -1 are expected.
+ s.append(row->fLookAhead);
+ s.append(row->fTagIdx);
+ for (int32_t column = 0; column < numCharClasses; column++) {
+ s.append(row->fNextState[column]);
+ }
+ rows.push_back(s); // a row's string holds accepting+1, lookahead, tag index, then every next-state entry
+ }
+ for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
+ for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
+ if (rows.at(r1) == rows.at(r2)) {
+ errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
+ return; // stop at the first duplicate pair
+ }
+ }
+ }
+}
+
+// Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
+// even after next() has returned DONE.
+
+void RBBITest::TestBug13447() {
+ UErrorCode status = U_ZERO_ERROR;
+ LocalPointer<RuleBasedBreakIterator> bi(
+ (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
+ assertSuccess(WHERE, status);
+ if (U_FAILURE(status)) return;
+ UnicodeString data(u"1234");
+ bi->setText(data);
+ assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus()); // status right after setText(), before any next()
+ assertEquals(WHERE, 4, bi->next()); // the first next() is expected to land at index 4, the end of "1234"
+ assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
+ assertEquals(WHERE, UBRK_DONE, bi->next()); // no further boundary
+ assertEquals(WHERE, 4, bi->current()); // current() is expected to stay at 4 after DONE
+ assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus()); // the point of the test: status still matches the boundary at 4
+}
+
+// TestReverse exercises both the synthesized safe reverse rules and the logic
+// for filling the break iterator cache when starting from random positions
+// in the text.
+//
+// It's a monkey test, working on random data, with the expected data obtained
+// from forward iteration (no safe rules involved), comparing with results
+// when indexing into the interior of the string (safe rules needed).
+
+void RBBITest::TestReverse() { // Driver: runs the one-argument TestReverse overload on English character, word, line and sentence iterators.
+ UErrorCode status = U_ZERO_ERROR;
+
+ TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
+ BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
+ assertSuccess(WHERE, status, true);
+ status = U_ZERO_ERROR; // reset so an earlier failure is not carried into the next create call
+ TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
+ BreakIterator::createWordInstance(Locale::getEnglish(), status)));
+ assertSuccess(WHERE, status, true);
+ status = U_ZERO_ERROR;
+ TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
+ BreakIterator::createLineInstance(Locale::getEnglish(), status)));
+ assertSuccess(WHERE, status, true);
+ status = U_ZERO_ERROR;
+ TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
+ BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
+ assertSuccess(WHERE, status, true);
+}
+
+void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) { // Takes ownership of bi; a null bi is silently ignored.
+ if (!bi) {
+ return;
+ }
+
+ // From the mapping trie in the break iterator's internal data, create a
+ // vector of UnicodeStrings, one for each character category, containing
+ // all of the code points that map to that category. Unicode planes 0 and 1 only,
+ // to avoid an excess of unassigned code points.
+
+ RBBIDataWrapper *data = bi->fData;
+ int32_t categoryCount = data->fHeader->fCatCount;
+ UTrie2 *trie = data->fTrie;
+
+ std::vector<UnicodeString> strings(categoryCount, UnicodeString());
+ for (int cp=0; cp<0x1fff0; ++cp) {
+ int cat = utrie2_get32(trie, cp);
+ cat &= ~0x4000; // And off the dictionary bit from the category.
+ assertTrue(WHERE, cat < categoryCount && cat >= 0);
+ if (cat < 0 || cat >= categoryCount) return; // stop rather than index outside of strings
+ strings[cat].append(cp);
+ }
+
+ icu_rand randomGen;
+ const int testStringLength = 10000;
+ UnicodeString testString;
+
+ for (int i=0; i<testStringLength; ++i) {
+ int charClass = randomGen() % categoryCount; // pick a random category ...
+ if (strings[charClass].length() > 0) { // ... categories with no code points contribute nothing
+ int cp = strings[charClass].char32At(randomGen() % strings[charClass].length()); // ... then a random code point from that category
+ testString.append(cp);
+ }
+ }
+
+ typedef std::pair<UBool, int32_t> Result; // (isBoundary result, rule status) for one index
+ std::vector<Result> expectedResults;
+ bi->setText(testString);
+ for (int i=0; i<testString.length(); ++i) { // expected values: query every index in ascending order
+ bool isboundary = bi->isBoundary(i);
+ int ruleStatus = bi->getRuleStatus();
+ expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
+ }
+
+ for (int i=testString.length()-1; i>=0; --i) { // re-check every index in descending order
+ bi->setText(testString); // clears the internal break cache
+ Result expected = expectedResults[i];
+ assertEquals(WHERE, expected.first, bi->isBoundary(i));
+ assertEquals(WHERE, expected.second, bi->getRuleStatus());
+ }
}
+// Ticket 13692 - finding word boundaries in very large numbers or words could
+// be very time consuming. When the problem was present, this test
+// would run more than fifteen minutes, which is to say, the failure was noticeable.
+
+void RBBITest::TestBug13692() {
+ UErrorCode status = U_ZERO_ERROR;
+ LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
+ BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
+ if (!assertSuccess(WHERE, status, true)) {
+ return;
+ }
+ constexpr int32_t LENGTH = 1000000;
+ UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH); // presumably LENGTH copies of '3' (capacity, fill char, count) - confirm against the UnicodeString constructor
+ for (int i=0; i<20; i+=2) {
+ longNumber.setCharAt(i, u' '); // overwrite the even indexes 0..18 with spaces
+ }
+ bi->setText(longNumber);
+ assertFalse(WHERE, bi->isBoundary(LENGTH-5)); // queries a position near the end of the string, well past the spaces written above
+ assertSuccess(WHERE, status);
+}
+
//
// TestDebug - A place-holder test for debugging purposes.
// For putting in fragments of other tests that can be invoked
// for tracing without a lot of unwanted extra stuff happening.
//
void RBBITest::TestDebug(void) {
+ UErrorCode status = U_ZERO_ERROR;
+ LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
+ BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
+ if (!assertSuccess(WHERE, status, true)) {
+ return;
+ }
+ const UnicodeString &rules = bi->getRules(); // rule source reported by the English character iterator
+ UParseError pe;
+ LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status)); // build a second iterator from that rule source
+ assertSuccess(WHERE, status); // building from the reported rules is expected to succeed
}
void RBBITest::TestProperties() {
}
}
-#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
+#endif // #if !UCONFIG_NO_BREAK_ITERATION