+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1999-2016, International Business Machines Corporation and
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <utility>
+#include <vector>
#include "unicode/brkiter.h"
#include "unicode/localpointer.h"
#include "charstr.h"
#include "cmemory.h"
+#include "cstr.h"
#include "intltest.h"
#include "rbbitst.h"
+#include "rbbidata.h"
#include "utypeinfo.h" // for 'typeid' to work
#include "uvector.h"
#include "uvectr32.h"
+
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
#include "unicode/filteredbrk.h"
#endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
// TEST_ASSERT_SUCCESS(errcode)
//   If errcode holds a failing UErrorCode (U_FAILURE), calls errcheckln() with a
//   message giving the source file, the line number and u_errorName(errcode).
//   Does nothing when errcode is a success value.
//   Note: errcode appears three times in the expansion, so pass a plain variable,
//   not an expression with side effects.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
-
//---------------------------------------------
// runIndexedTest
//---------------------------------------------
// runIndexedTest - test dispatcher for the RBBITest suite.
//   Logs the suite name when exec is set, saves the params argument in
//   fTestParams, and then lists each test between TESTCASE_AUTO_BEGIN and
//   TESTCASE_AUTO_END (added lines). Tests wrapped in !UCONFIG_NO_FILE_IO are
//   only listed when file I/O is configured in; TestMonkey additionally
//   requires !UCONFIG_NO_REGULAR_EXPRESSIONS.
//   The removed lines in this span are the previous switch(index) dispatcher and
//   the BITestData helper class that the previous version of the file defined
//   directly after it.
void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
{
if (exec) logln("TestSuite RuleBasedBreakIterator: ");
+ fTestParams = params;
- switch (index) {
+ TESTCASE_AUTO_BEGIN;
#if !UCONFIG_NO_FILE_IO
- case 0: name = "TestBug4153072";
- if(exec) TestBug4153072(); break;
-#else
- case 0: name = "skip";
- break;
+ TESTCASE_AUTO(TestBug4153072);
#endif
-
- case 1: name = "skip";
- break;
- case 2: name = "TestStatusReturn";
- if(exec) TestStatusReturn(); break;
-
#if !UCONFIG_NO_FILE_IO
- case 3: name = "TestUnicodeFiles";
- if(exec) TestUnicodeFiles(); break;
- case 4: name = "TestEmptyString";
- if(exec) TestEmptyString(); break;
-#else
- case 3: case 4: name = "skip";
- break;
+ TESTCASE_AUTO(TestUnicodeFiles);
#endif
-
- case 5: name = "TestGetAvailableLocales";
- if(exec) TestGetAvailableLocales(); break;
-
- case 6: name = "TestGetDisplayName";
- if(exec) TestGetDisplayName(); break;
-
+ TESTCASE_AUTO(TestGetAvailableLocales);
+ TESTCASE_AUTO(TestGetDisplayName);
#if !UCONFIG_NO_FILE_IO
- case 7: name = "TestEndBehaviour";
- if(exec) TestEndBehaviour(); break;
- case 8: case 9: case 10: name = "skip";
- break;
- case 11: name = "TestWordBreaks";
- if(exec) TestWordBreaks(); break;
- case 12: name = "TestWordBoundary";
- if(exec) TestWordBoundary(); break;
- case 13: name = "TestLineBreaks";
- if(exec) TestLineBreaks(); break;
- case 14: name = "TestSentBreaks";
- if(exec) TestSentBreaks(); break;
- case 15: name = "TestExtended";
- if(exec) TestExtended(); break;
-#else
- case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
- break;
+ TESTCASE_AUTO(TestEndBehaviour);
+ TESTCASE_AUTO(TestWordBreaks);
+ TESTCASE_AUTO(TestWordBoundary);
+ TESTCASE_AUTO(TestLineBreaks);
+ TESTCASE_AUTO(TestSentBreaks);
+ TESTCASE_AUTO(TestExtended);
#endif
-
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
- case 16:
- name = "TestMonkey"; if(exec) TestMonkey(params); break;
-#else
- case 16:
- name = "skip"; break;
+ TESTCASE_AUTO(TestMonkey);
#endif
-
#if !UCONFIG_NO_FILE_IO
- case 17: name = "TestBug3818";
- if(exec) TestBug3818(); break;
-#else
- case 17: name = "skip";
- break;
+ TESTCASE_AUTO(TestBug3818);
#endif
-
- case 18: name = "skip";
- break;
- case 19: name = "TestDebug";
- if(exec) TestDebug(); break;
- case 20: name = "skip";
- break;
-
+ TESTCASE_AUTO(TestDebug);
#if !UCONFIG_NO_FILE_IO
- case 21: name = "TestBug5775";
- if (exec) TestBug5775(); break;
-#else
- case 21: name = "skip";
- break;
+ TESTCASE_AUTO(TestBug5775);
#endif
-
- case 22: name = "TestBug9983";
- if (exec) TestBug9983(); break;
- case 23: name = "TestDictRules";
- if (exec) TestDictRules(); break;
- case 24: name = "TestBug5532";
- if (exec) TestBug5532(); break;
- default: name = ""; break; //needed to end loop
- }
-}
-
-
-//---------------------------------------------------------------------------
-//
-// class BITestData Holds a set of Break iterator test data and results
-// Includes
-// - the string data to be broken
-// - a vector of the expected break positions.
-// - a vector of source line numbers for the data,
-// (to help see where errors occured.)
-// - The expected break tag values.
-// - Vectors of actual break positions and tag values.
-// - Functions for comparing actual with expected and
-// reporting errors.
-//
-//----------------------------------------------------------------------------
-class BITestData {
-public:
- UnicodeString fDataToBreak;
- UVector fExpectedBreakPositions;
- UVector fExpectedTags;
- UVector fLineNum;
- UVector fActualBreakPositions; // Test Results.
- UVector fActualTags;
-
- BITestData(UErrorCode &status);
- void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
- void checkResults(const char *heading, RBBITest *test);
- void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
- void clearResults();
-};
-
-//
-// Constructor.
-//
-BITestData::BITestData(UErrorCode &status)
-: fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
- fActualTags(status)
-{
-}
-
-//
-// addDataChunk. Add a section (non-breaking) piece if data to the test data.
-// The macro form collects the line number, which is helpful
-// when tracking down failures.
-//
-// A null data item is inserted at the start of each test's data
-// to put the starting zero into the data list. The position saved for
-// each non-null item is its ending position.
-//
-#define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
-void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
- if (U_FAILURE(status)) {return;}
- if (data != NULL) {
- fDataToBreak.append(CharsToUnicodeString(data));
- }
- fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
- fExpectedTags.addElement(tag, status);
- fLineNum.addElement(lineNum, status);
-}
-
-
-//
-// checkResults. Compare the actual and expected break positions, report any differences.
-//
-void BITestData::checkResults(const char *heading, RBBITest *test) {
- int32_t expectedIndex = 0;
- int32_t actualIndex = 0;
-
- for (;;) {
- // If we've run through both the expected and actual results vectors, we're done.
- // break out of the loop.
- if (expectedIndex >= fExpectedBreakPositions.size() &&
- actualIndex >= fActualBreakPositions.size()) {
- break;
- }
-
-
- if (expectedIndex >= fExpectedBreakPositions.size()) {
- err(heading, test, expectedIndex-1, actualIndex);
- actualIndex++;
- continue;
- }
-
- if (actualIndex >= fActualBreakPositions.size()) {
- err(heading, test, expectedIndex, actualIndex-1);
- expectedIndex++;
- continue;
- }
-
- if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
- err(heading, test, expectedIndex, actualIndex);
- // Try to resync the positions of the indices, to avoid a rash of spurious erros.
- if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
- actualIndex++;
- } else {
- expectedIndex++;
- }
- continue;
- }
-
- if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
- test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
- heading, fLineNum.elementAt(expectedIndex),
- fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
- }
-
- actualIndex++;
- expectedIndex++;
- }
-}
-
-//
-// err - An error was found. Report it, along with information about where the
-// incorrectly broken test data appeared in the source file.
-//
-void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
-{
- int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
- int32_t actual = fActualBreakPositions.elementAti(actualIdx);
- int32_t o = 0;
- int32_t line = fLineNum.elementAti(expectedIdx);
- if (expectedIdx > 0) {
- // The line numbers are off by one because a premature break occurs somewhere
- // within the previous item, rather than at the start of the current (expected) item.
- // We want to report the offset of the unexpected break from the start of
- // this previous item.
- o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
- }
- if (actual < expected) {
- test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);
- } else {
- test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);
- }
-}
-
-
-void BITestData::clearResults() {
- fActualBreakPositions.removeAllElements();
- fActualTags.removeAllElements();
+ TESTCASE_AUTO(TestBug9983);
+ TESTCASE_AUTO(TestDictRules);
+ TESTCASE_AUTO(TestBug5532);
+ TESTCASE_AUTO(TestBug7547);
+ TESTCASE_AUTO(TestBug12797);
+ TESTCASE_AUTO(TestBug12918);
+ TESTCASE_AUTO(TestBug12932);
+ TESTCASE_AUTO(TestEmoji);
+ TESTCASE_AUTO(TestBug12519);
+ TESTCASE_AUTO(TestBug12677);
+ TESTCASE_AUTO(TestTableRedundancies);
+ TESTCASE_AUTO(TestBug13447);
+ TESTCASE_AUTO(TestReverse);
+ TESTCASE_AUTO(TestBug13692);
+ TESTCASE_AUTO_END;
}
//--------------------------------------------------------------------------------------
// Constructor. Starts with no test parameters; runIndexedTest() later stores
// the params string it is called with in fTestParams.
RBBITest::RBBITest() {
+ fTestParams = NULL;
}
// Destructor. The body is empty; no cleanup is performed here.
// NOTE(review): fTestParams is only assigned from a caller-supplied pointer in
// the visible code, so it is presumably not owned by this object - confirm.
RBBITest::~RBBITest() {
}
-//-----------------------------------------------------------------------------------
-//
-// Test for status {tag} return value from break rules.
-// TODO: a more thorough test.
-//
-//-----------------------------------------------------------------------------------
-void RBBITest::TestStatusReturn() {
- UnicodeString rulesString1("$Letters = [:L:];\n"
- "$Numbers = [:N:];\n"
- "$Letters+{1};\n"
- "$Numbers+{2};\n"
- "Help\\ /me\\!{4};\n"
- "[^$Letters $Numbers];\n"
- "!.*;\n", -1, US_INV);
- UnicodeString testString1 = "abc123..abc Help me Help me!";
- // 01234567890123456789012345678
- int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
- int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
-
- UErrorCode status=U_ZERO_ERROR;
- UParseError parseError;
-
- LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
- if(U_FAILURE(status)) {
- dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__, u_errorName(status));
- return;
- }
- int32_t pos;
- int32_t i = 0;
- bi->setText(testString1);
- for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
- if (pos != bounds1[i]) {
- errln("%s:%d expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
- break;
- }
-
- int tag = bi->getRuleStatus();
- if (tag != brkStatus[i]) {
- errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
- break;
- }
- i++;
- }
-}
-
static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
UErrorCode status = U_ZERO_ERROR;
printf("code alpha extend alphanum type word sent line name\n");
int nextExpectedIndex = 0;
utext_setNativeIndex(tstr, 0);
- for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
+ for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
printf("------------------------------------------------ %d\n", j);
++nextExpectedIndex;
delete bi;
}
-//----------------------------------------------------------------------------
-//
-// generalIteratorTest Given a break iterator and a set of test data,
-// Run the tests and report the results.
-//
-//----------------------------------------------------------------------------
-void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
-{
-
- bi.setText(td.fDataToBreak);
-
- testFirstAndNext(bi, td);
-
- testLastAndPrevious(bi, td);
-
- testFollowing(bi, td);
- testPreceding(bi, td);
- testIsBoundary(bi, td);
- doMultipleSelectionTest(bi, td);
-}
-
-
-//
-// testFirstAndNext. Run the iterator forwards in the obvious first(), next()
-// kind of loop.
-//
-void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
-{
- UErrorCode status = U_ZERO_ERROR;
- int32_t p;
- int32_t lastP = -1;
- int32_t tag;
-
- logln("Test first and next");
- bi.setText(td.fDataToBreak);
- td.clearResults();
-
- for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
- td.fActualBreakPositions.addElement(p, status); // Save result.
- tag = bi.getRuleStatus();
- td.fActualTags.addElement(tag, status);
- if (p <= lastP) {
- // If the iterator is not making forward progress, stop.
- // No need to raise an error here, it'll be detected in the normal check of results.
- break;
- }
- lastP = p;
- }
- td.checkResults("testFirstAndNext", this);
-}
-
-
-//
-// TestLastAndPrevious. Run the iterator backwards, starting with last().
-//
-void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
-{
- UErrorCode status = U_ZERO_ERROR;
- int32_t p;
- int32_t lastP = 0x7ffffffe;
- int32_t tag;
-
- logln("Test last and previous");
- bi.setText(td.fDataToBreak);
- td.clearResults();
-
- for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
- // Save break position. Insert it at start of vector of results, shoving
- // already-saved results further towards the end.
- td.fActualBreakPositions.insertElementAt(p, 0, status);
- // bi.previous(); // TODO: Why does this fix things up????
- // bi.next();
- tag = bi.getRuleStatus();
- td.fActualTags.insertElementAt(tag, 0, status);
- if (p >= lastP) {
- // If the iterator is not making progress, stop.
- // No need to raise an error here, it'll be detected in the normal check of results.
- break;
- }
- lastP = p;
- }
- td.checkResults("testLastAndPrevious", this);
-}
-
-
-void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
-{
- UErrorCode status = U_ZERO_ERROR;
- int32_t p;
- int32_t tag;
- int32_t lastP = -2; // A value that will never be returned as a break position.
- // cannot be -1; that is returned for DONE.
- int i;
-
- logln("testFollowing():");
- bi.setText(td.fDataToBreak);
- td.clearResults();
-
- // Save the starting point, since we won't get that out of following.
- p = bi.first();
- td.fActualBreakPositions.addElement(p, status); // Save result.
- tag = bi.getRuleStatus();
- td.fActualTags.addElement(tag, status);
-
- for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
- p = bi.following(i);
- if (p != lastP) {
- if (p == RuleBasedBreakIterator::DONE) {
- break;
- }
- // We've reached a new break position. Save it.
- td.fActualBreakPositions.addElement(p, status); // Save result.
- tag = bi.getRuleStatus();
- td.fActualTags.addElement(tag, status);
- lastP = p;
- }
- }
- // The loop normally exits by means of the break in the middle.
- // Make sure that the index was at the correct position for the break iterator to have
- // returned DONE.
- if (i != td.fDataToBreak.length()) {
- errln("testFollowing(): iterator returned DONE prematurely.");
- }
-
- // Full check of all results.
- td.checkResults("testFollowing", this);
-}
-
-
-
-void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
- UErrorCode status = U_ZERO_ERROR;
- int32_t p;
- int32_t tag;
- int32_t lastP = 0x7ffffffe;
- int i;
-
- logln("testPreceding():");
- bi.setText(td.fDataToBreak);
- td.clearResults();
-
- p = bi.last();
- td.fActualBreakPositions.addElement(p, status);
- tag = bi.getRuleStatus();
- td.fActualTags.addElement(tag, status);
-
- for (i = td.fDataToBreak.length(); i>=-1; i--) {
- p = bi.preceding(i);
- if (p != lastP) {
- if (p == RuleBasedBreakIterator::DONE) {
- break;
- }
- // We've reached a new break position. Save it.
- td.fActualBreakPositions.insertElementAt(p, 0, status);
- lastP = p;
- tag = bi.getRuleStatus();
- td.fActualTags.insertElementAt(tag, 0, status);
- }
- }
- // The loop normally exits by means of the break in the middle.
- // Make sure that the index was at the correct position for the break iterator to have
- // returned DONE.
- if (i != 0) {
- errln("testPreceding(): iterator returned DONE prematurely.");
- }
-
- // Full check of all results.
- td.checkResults("testPreceding", this);
-}
-
-
-
-void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
- UErrorCode status = U_ZERO_ERROR;
- int i;
- int32_t tag;
-
- logln("testIsBoundary():");
- bi.setText(td.fDataToBreak);
- td.clearResults();
-
- for (i = 0; i <= td.fDataToBreak.length(); i++) {
- if (bi.isBoundary(i)) {
- td.fActualBreakPositions.addElement(i, status); // Save result.
- tag = bi.getRuleStatus();
- td.fActualTags.addElement(tag, status);
- }
- }
- td.checkResults("testIsBoundary: ", this);
-}
-
-
-
-void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
-{
- iterator.setText(td.fDataToBreak);
-
- RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
- int32_t offset = iterator.first();
- int32_t testOffset;
- int32_t count = 0;
-
- logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
-
- if (*testIterator != iterator)
- errln("clone() or operator!= failed: two clones compared unequal");
-
- do {
- testOffset = testIterator->first();
- testOffset = testIterator->next(count);
- if (offset != testOffset)
- errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
-
- if (offset != RuleBasedBreakIterator::DONE) {
- count++;
- offset = iterator.next();
-
- if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
- errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
- if (count > 10000 || offset == -1) {
- errln("operator== failed too many times. Stopping test.");
- if (offset == -1) {
- errln("Does (RuleBasedBreakIterator::DONE == -1)?");
- }
- return;
- }
- }
- }
- } while (offset != RuleBasedBreakIterator::DONE);
-
- // now do it backwards...
- offset = iterator.last();
- count = 0;
-
- do {
- testOffset = testIterator->last();
- testOffset = testIterator->next(count); // next() with a negative arg is same as previous
- if (offset != testOffset)
- errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
-
- if (offset != RuleBasedBreakIterator::DONE) {
- count--;
- offset = iterator.previous();
- }
- } while (offset != RuleBasedBreakIterator::DONE);
-
- delete testIterator;
-}
-
//---------------------------------------------
//
// other tests
//
//---------------------------------------------
-void RBBITest::TestEmptyString()
-{
- UnicodeString text = "";
- UErrorCode status = U_ZERO_ERROR;
-
- BITestData x(status);
- ADD_DATACHUNK(x, "", 0, status); // Break at start of data
- RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
- if (U_FAILURE(status))
- {
- errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
- return;
- }
- generalIteratorTest(*bi, x);
- delete bi;
-}
void RBBITest::TestGetAvailableLocales()
{
}
-int32_t TestParams::getSrcLine(int bp) {
+int32_t TestParams::getSrcLine(int32_t bp) {
if (bp >= textMap->size()) {
bp = textMap->size() - 1;
}
}
-int32_t TestParams::getExpectedBreak(int bp) {
+int32_t TestParams::getExpectedBreak(int32_t bp) {
if (bp >= textMap->size()) {
return 0;
}
}
-int32_t TestParams::getSrcCol(int bp) {
+int32_t TestParams::getSrcCol(int32_t bp) {
if (bp >= textMap->size()) {
bp = textMap->size() - 1;
}
expectedTagVal = 0;
}
int32_t line = t->getSrcLine(bp);
- int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
+ int32_t rs = t->bi->getRuleStatus();
if (rs != expectedTagVal) {
errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
" Actual, Expected status = %4d, %4d",
//
// Run the iterator backwards, verify that the same breaks are found.
//
- prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.
- for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
+ prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
+ bp = t->bi->last();
+ while (bp != BreakIterator::DONE) {
if (prevBP == bp) {
// Fail for lack of progress.
errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
}
prevBP = bp;
+ bp = t->bi->previous();
}
// Verify that there were no missed breaks prior to the last one found
}
// Check following()
- for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
+ for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
int32_t actualBreak = t->bi->following(i);
int32_t expectedBreak = BreakIterator::DONE;
- for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
+ for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
if (t->getExpectedBreak(j) != 0) {
expectedBreak = j;
break;
}
// Check preceding()
- for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
+ for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
int32_t actualBreak = t->bi->preceding(i);
int32_t expectedBreak = BreakIterator::DONE;
// Therefore, start looking at the expected break data not at i-1, but at
// the start of code point index - 1.
utext_setNativeIndex(t->textToBreak, i);
- int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
+ int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
for (; j >= 0; j--) {
if (t->getExpectedBreak(j) != 0) {
expectedBreak = j;
void RBBITest::TestExtended() {
-#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+ // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
+ // data-driven test closely entangles filtered and regular data.
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
UErrorCode status = U_ZERO_ERROR;
Locale locale("");
- UnicodeString rules;
TestParams tp(status);
- RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
+ RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
if (U_FAILURE(status)) {
dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
}
-
//
// Open and read the test data file.
//
const char *testDataDirectory = IntlTest::getSourceTestData(status);
- char testFileName[1000];
- if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
- errln("Can't open test data. Path too long.");
- return;
- }
- strcpy(testFileName, testDataDirectory);
- strcat(testFileName, "rbbitst.txt");
+ CharString testFileName(testDataDirectory, -1, status);
+ testFileName.append("rbbitst.txt", -1, status);
int len;
- UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
+ UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
if (U_FAILURE(status)) {
- return; /* something went wrong, error already output */
+ errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
+ return;
}
-
bool skipTest = false; // Skip this test?
//
PARSE_COMMENT,
PARSE_TAG,
PARSE_DATA,
- PARSE_NUM
+ PARSE_NUM,
+ PARSE_RULES
}
parseState = PARSE_TAG;
EParseState savedState = PARSE_TAG;
- static const UChar CH_LF = 0x0a;
- static const UChar CH_CR = 0x0d;
- static const UChar CH_HASH = 0x23;
- /*static const UChar CH_PERIOD = 0x2e;*/
- static const UChar CH_LT = 0x3c;
- static const UChar CH_GT = 0x3e;
- static const UChar CH_BACKSLASH = 0x5c;
- static const UChar CH_BULLET = 0x2022;
-
int32_t lineNum = 1;
int32_t colStart = 0;
int32_t column = 0;
int32_t charIdx = 0;
- int32_t tagValue = 0; // The numeric value of a <nnn> tag.
+ int32_t tagValue = 0; // The numeric value of a <nnn> tag.
+
+ UnicodeString rules; // Holds rules from a <rules> ... </rules> block
+ int32_t rulesFirstLine; // Line number of the start of current <rules> block
for (charIdx = 0; charIdx < len; ) {
status = U_ZERO_ERROR;
UChar c = testString.charAt(charIdx);
charIdx++;
- if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
+ if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
// treat CRLF as a unit
- c = CH_LF;
+ c = u'\n';
charIdx++;
}
- if (c == CH_LF || c == CH_CR) {
+ if (c == u'\n' || c == u'\r') {
lineNum++;
colStart = charIdx;
}
switch (parseState) {
case PARSE_COMMENT:
- if (c == 0x0a || c == 0x0d) {
+ if (c == u'\n' || c == u'\r') {
parseState = savedState;
}
break;
case PARSE_TAG:
{
- if (c == CH_HASH) {
+ if (c == u'#') {
parseState = PARSE_COMMENT;
savedState = PARSE_TAG;
break;
if (u_isUWhiteSpace(c)) {
break;
}
- if (testString.compare(charIdx-1, 6, "<word>") == 0) {
+ if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createWordInstance(locale, status);
skipTest = false;
charIdx += 5;
break;
}
- if (testString.compare(charIdx-1, 6, "<char>") == 0) {
+ if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createCharacterInstance(locale, status);
skipTest = false;
charIdx += 5;
break;
}
- if (testString.compare(charIdx-1, 6, "<line>") == 0) {
+ if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createLineInstance(locale, status);
skipTest = false;
charIdx += 5;
break;
}
- if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
+ if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createSentenceInstance(locale, status);
skipTest = false;
charIdx += 5;
break;
}
- if (testString.compare(charIdx-1, 7, "<title>") == 0) {
+ if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createTitleInstance(locale, status);
charIdx += 6;
break;
}
+ if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
+ testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
+ charIdx = testString.indexOf(u'>', charIdx) + 1;
+ parseState = PARSE_RULES;
+ rules.remove();
+ rulesFirstLine = lineNum;
+ break;
+ }
+
// <locale loc_name>
localeMatcher.reset(testString);
if (localeMatcher.lookingAt(charIdx-1, status)) {
TEST_ASSERT_SUCCESS(status);
break;
}
- if (testString.compare(charIdx-1, 6, "<data>") == 0) {
+ if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
parseState = PARSE_DATA;
charIdx += 5;
tp.dataToBreak = "";
}
break;
+ case PARSE_RULES:
+ if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
+ charIdx += 7;
+ parseState = PARSE_TAG;
+ delete tp.bi;
+ UParseError pe;
+ tp.bi = new RuleBasedBreakIterator(rules, pe, status);
+ skipTest = U_FAILURE(status);
+ if (U_FAILURE(status)) {
+ errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
+ rulesFirstLine + pe.line - 1, u_errorName(status));
+ }
+ } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
+ charIdx += 10;
+ parseState = PARSE_TAG;
+ UErrorCode ec = U_ZERO_ERROR;
+ UParseError pe;
+ RuleBasedBreakIterator bi(rules, pe, ec);
+ if (U_SUCCESS(ec)) {
+ errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
+ rulesFirstLine + pe.line - 1);
+ }
+ } else {
+ rules.append(c);
+ }
+ break;
+
case PARSE_DATA:
- if (c == CH_BULLET) {
+ if (c == u'\u2022') { // u'•'
int32_t breakIdx = tp.dataToBreak.length();
tp.expectedBreaks->setSize(breakIdx+1);
tp.expectedBreaks->setElementAt(-1, breakIdx);
break;
}
- if (testString.compare(charIdx-1, 7, "</data>") == 0) {
+ if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
// Add final entry to mappings from break location to source file position.
// Need one extra because last break position returned is after the
// last char in the data, not at the last char.
break;
}
- if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
+ if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
// Named character, e.g. \N{COMBINING GRAVE ACCENT}
// Get the code point from the name and insert it into the test data.
// (Damn, no API takes names in Unicode !!!
// we've got to take it back to char *)
- int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
+ int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
int32_t nameLength = nameEndIdx - (charIdx+2);
char charNameBuf[200];
UChar32 theChar = -1;
-
- if (testString.compare(charIdx-1, 2, "<>") == 0) {
+ if (testString.compare(charIdx-1, 2, u"<>") == 0) {
charIdx++;
int32_t breakIdx = tp.dataToBreak.length();
tp.expectedBreaks->setSize(breakIdx+1);
break;
}
- if (c == CH_LT) {
+ if (c == u'<') {
tagValue = 0;
parseState = PARSE_NUM;
break;
}
- if (c == CH_HASH && column==3) { // TODO: why is column off so far?
+ if (c == u'#' && column==3) { // TODO: why is column off so far?
parseState = PARSE_COMMENT;
savedState = PARSE_DATA;
break;
}
- if (c == CH_BACKSLASH) {
+ if (c == u'\\') {
// Check for \ at end of line, a line continuation.
// Advance over (discard) the newline
UChar32 cp = testString.char32At(charIdx);
- if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
+ if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
// We have a CR LF
// Need an extra increment of the input ptr to move over both of them
charIdx++;
}
- if (cp == CH_LF || cp == CH_CR) {
+ if (cp == u'\n' || cp == u'\r') {
lineNum++;
colStart = charIdx;
charIdx++;
break;
}
- if (c == CH_GT) {
+ if (c == u'>') {
// Finished the number. Add the info to the expected break data,
// and switch parse state back to doing plain data.
parseState = PARSE_DATA;
}
+ // Reached end of test file. Raise an error if parseState indicates that we are
+ // within a block that should have been terminated.
+
+ if (parseState == PARSE_RULES) {
+ errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
+ lineNum, rulesFirstLine);
+ }
+ if (parseState == PARSE_DATA) {
+ errln("rbbitst.txt:%d <data> block not closed.", lineNum);
+ }
+
+
end_test:
delete [] testFile;
#endif
fileSize = ftell(f);
fileBuf = new char[fileSize];
fseek(f, 0, SEEK_SET);
- amt_read = fread(fileBuf, 1, fileSize, f);
+ amt_read = static_cast<int>(fread(fileBuf, 1, fileSize, f));
if (amt_read != fileSize || fileSize <= 0) {
errln("Error reading test data file.");
goto cleanUpAndReturn;
// Check for test cases from the Unicode test data files that are known to fail
-// and should be skipped because ICU is not yet able to fully implement the spec.
-// See ticket #7270.
+// and should be skipped as known issues because ICU does not fully implement
+// the Unicode specifications, or because ICU includes tailorings that differ from
+// the Unicode standard.
+//
+// Test cases are identified by the test data sequence, which tends to be more stable
+// across Unicode versions than the test file line numbers.
+//
+// The test case with ticket "10666" is a dummy, included as an example.
UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
- static const UChar badTestCases[][4] = { // Line Numbers from Unicode 7.0.0 file.
- {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000}, // Line 5198
- {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000}, // Line 5202
- {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000}, // Line 5214
- {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000}, // Line 5246
- {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000}, // Line 5298
- {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000} // Line 5302
+ static struct TestCase {
+ const char *fTicketNum;
+ const char *fFileName;
+ const UChar *fString;
+ } badTestCases[] = {
+ {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
+ // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
+ // This probably ultimately wants to be resolved by updating UAX-14, but in the meantime
+ // ICU is out of sync with Unicode.
+ {"8151", "LineBreakTest.txt", u"-#"},
+ {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0023"},
+ {"8151", "LineBreakTest.txt", u"\u002d\u00a7"},
+ {"8151", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
+ {"8151", "LineBreakTest.txt", u"\u002d\U00050005"},
+ {"8151", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
+ {"8151", "LineBreakTest.txt", u"\u002d\u0e01"},
+ {"8151", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
+
+ // Issue ICU-12017 Improve line break around numbers
+ {"12017", "LineBreakTest.txt", u"\u002C\u0030"}, // ",0"
+ {"12017", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
+ {"12017", "LineBreakTest.txt", u"find .com"},
+ {"12017", "LineBreakTest.txt", u"equals .35 cents"},
+ {"12017", "LineBreakTest.txt", u"a.2 "},
+ {"12017", "LineBreakTest.txt", u"a.2 \u0915"},
+ {"12017", "LineBreakTest.txt", u"a.2 \u672C"},
+ {"12017", "LineBreakTest.txt", u"a.2\u3000\u672C"},
+ {"12017", "LineBreakTest.txt", u"a.2\u3000\u307E"},
+ {"12017", "LineBreakTest.txt", u"a.2\u3000\u0033"},
+ {"12017", "LineBreakTest.txt", u"A.1 \uBABB"},
+ {"12017", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
+ {"12017", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
+ {"12017", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
+ {"12017", "LineBreakTest.txt", u"a.2\u3000\u300C"},
};
- if (strcmp(fileName, "LineBreakTest.txt") != 0) {
- return FALSE;
- }
- for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
- if (testCase == UnicodeString(badTestCases[i])) {
- return logKnownIssue("7270");
+ for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
+ const TestCase &badCase = badTestCases[n];
+ if (!strcmp(fileName, badCase.fFileName) &&
+ testCase == UnicodeString(badCase.fString)) {
+ return logKnownIssue(badCase.fTicketNum);
}
}
return FALSE;
UnicodeSet *fCRLFSet;
UnicodeSet *fControlSet;
UnicodeSet *fExtendSet;
+ UnicodeSet *fZWJSet;
UnicodeSet *fRegionalIndicatorSet;
UnicodeSet *fPrependSet;
UnicodeSet *fSpacingSet;
UnicodeSet *fLVSet;
UnicodeSet *fLVTSet;
UnicodeSet *fHangulSet;
+ UnicodeSet *fExtendedPictSet;
UnicodeSet *fAnySet;
- UnicodeSet *fEmojiModifierSet;
- UnicodeSet *fEmojiBaseSet;
- UnicodeSet *fZWJSet;
- UnicodeSet *fGAZSet;
const UnicodeString *fText;
};
fText = NULL;
fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
- fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]"), status);
- fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]"), status);
- fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
+ fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
+ fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
+ fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
+ fRegionalIndicatorSet =
+ new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
fHangulSet->addAll(*fTSet);
fHangulSet->addAll(*fLVSet);
fHangulSet->addAll(*fLVTSet);
- fAnySet = new UnicodeSet(0, 0x10ffff);
-
-
- fEmojiBaseSet = new UnicodeSet(UnicodeString(
- "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C2-\\U0001F3C4\\U0001F3C7\\U0001F3CA-\\U0001F3CC"
- "\\U0001F442-\\U0001F443\\U0001F446-\\U0001F450\\U0001F466-\\U0001F478\\U0001F47C"
- "\\U0001F481-\\U0001F483\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F574-\\U0001F575\\U0001F57A\\U0001F590\\U0001F595-\\U0001F596"
- "\\U0001F645-\\U0001F647\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F6CC"
- "\\U0001F918-\\U0001F91E\\U0001F926\\U0001F930\\U0001F933-\\U0001F939\\U0001F93C-\\U0001F93E]"), status);
+ fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
+ fAnySet = new UnicodeSet(0, 0x10ffff);
- fEmojiModifierSet = new UnicodeSet(0x0001F3FB, 0x0001F3FF);
- fZWJSet = new UnicodeSet(0x200D, 0x200D);
- fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u2640\\u2642\\u2695-\\u2696\\u2708\\u2764"
- "\\U0001F308\\U0001F33E\\U0001F373\\U0001F393\\U0001F3A4\\U0001F3A8\\U0001F3EB\\U0001F3ED"
- "\\U0001F466-\\U0001F469\\U0001F48B\\U0001F4BB-\\U0001F4BC\\U0001F527\\U0001F52C\\U0001F5E8"
- "\\U0001F680\\U0001F692]"), status);
-
- fSets = new UVector(status);
+ fSets = new UVector(status);
fSets->addElement(fCRLFSet, status);
fSets->addElement(fControlSet, status);
fSets->addElement(fExtendSet, status);
fSets->addElement(fSpacingSet, status);
fSets->addElement(fHangulSet, status);
fSets->addElement(fAnySet, status);
- fSets->addElement(fEmojiBaseSet, status);
- fSets->addElement(fEmojiModifierSet, status);
fSets->addElement(fZWJSet, status);
- fSets->addElement(fGAZSet, status);
+ fSets->addElement(fExtendedPictSet, status);
if (U_FAILURE(status)) {
deferredStatus = status;
}
continue;
}
- // Rule (GB10) ($E_Base | $GAZ) $Extend* $E_Modifier;
- if ((fEmojiBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
- continue;
- }
- if ((fEmojiBaseSet->contains(cBase) || fGAZSet->contains(cBase)) &&
- fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
- continue;
- }
-
- // Rule (GB11) ZWJ x Glue_After_Zwj
- if (fZWJSet->contains(c1) && fGAZSet->contains(c2)) {
+ // Rule (GB11) Extended_Pictographic Extend * ZWJ x Extended_Pictographic
+ if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
continue;
}
// a break if there are three or more contiguous RIs. If there are
// only two, a break following will occur via other rules, and will include
// any trailing extend characters, which is needed behavior.
- if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
+ if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
&& fRegionalIndicatorSet->contains(c2)) {
break;
}
delete fLVTSet;
delete fHangulSet;
delete fAnySet;
- delete fEmojiBaseSet;
- delete fEmojiModifierSet;
delete fZWJSet;
- delete fGAZSet;
+ delete fExtendedPictSet;
}
//------------------------------------------------------------------------------------------
UnicodeSet *fKatakanaSet;
UnicodeSet *fHebrew_LetterSet;
UnicodeSet *fALetterSet;
- // TODO(jungshik): Do we still need this change?
- // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
UnicodeSet *fSingle_QuoteSet;
UnicodeSet *fDouble_QuoteSet;
UnicodeSet *fMidNumLetSet;
UnicodeSet *fOtherSet;
UnicodeSet *fExtendSet;
UnicodeSet *fExtendNumLetSet;
- UnicodeSet *fDictionaryCjkSet;
- UnicodeSet *fEBaseSet;
- UnicodeSet *fEModifierSet;
- UnicodeSet *fZWSSet;
- UnicodeSet *fGAZSet;
+ UnicodeSet *fWSegSpaceSet;
+ UnicodeSet *fDictionarySet;
+ UnicodeSet *fZWJSet;
+ UnicodeSet *fExtendedPictSet;
const UnicodeString *fText;
};
fSets = new UVector(status);
- fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
- fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
- fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
- fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
- // Exclude Hangul syllables from ALetterSet during testing.
- // Leave CJK dictionary characters out from the monkey tests!
-#if 0
- fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
- "[\\p{Line_Break = Complex_Context}"
- "-\\p{Grapheme_Cluster_Break = Extend}"
- "-\\p{Grapheme_Cluster_Break = Control}"
- "]]",
- status);
-#endif
- fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
- fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
- fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
- fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
- fALetterSet->removeAll(*fDictionaryCjkSet);
- fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status);
- fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status);
- fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
- fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter} - [\\:]]"), status);
- fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
- // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
- // we should figure out why
- fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
- fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
- fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
- fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
-
- fEBaseSet = new UnicodeSet(UnicodeString(
- "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C2-\\U0001F3C4\\U0001F3C7\\U0001F3CA-\\U0001F3CC"
- "\\U0001F442-\\U0001F443\\U0001F446-\\U0001F450\\U0001F466-\\U0001F478\\U0001F47C"
- "\\U0001F481-\\U0001F483\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F574-\\U0001F575\\U0001F57A\\U0001F590\\U0001F595-\\U0001F596"
- "\\U0001F645-\\U0001F647\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F6CC"
- "\\U0001F918-\\U0001F91E\\U0001F926\\U0001F930\\U0001F933-\\U0001F939\\U0001F93C-\\U0001F93E]"), status);
-
- fEModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
- fZWSSet = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);;
- fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u2640\\u2642\\u2695-\\u2696\\u2708\\u2764"
- "\\U0001F308\\U0001F33E\\U0001F373\\U0001F393\\U0001F3A4\\U0001F3A8\\U0001F3EB\\U0001F3ED"
- "\\U0001F466-\\U0001F469\\U0001F48B\\U0001F4BB-\\U0001F4BC\\U0001F527\\U0001F52C\\U0001F5E8"
- "\\U0001F680\\U0001F692]"), status);
- fExtendSet->removeAll(*fZWSSet);
-
+ fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
+ fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
+ fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
+ fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
+ fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
+ fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
+ fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
+ fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
+ fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
+ fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
+ fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]", status);
+ fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
+ fNumericSet = new UnicodeSet(u"[[\\p{Word_Break = Numeric}][\\uff10-\\uff19]]", status);
+ fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
+ fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
+ fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}]", status);
+ fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
+
+ fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
+ fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
+
+ fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
+ fDictionarySet->addAll(*fKatakanaSet);
+ fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
+
+ fALetterSet->removeAll(*fDictionarySet);
fOtherSet = new UnicodeSet();
if(U_FAILURE(status)) {
- deferredStatus = status;
- return;
+ IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
+ deferredStatus = status;
+ return;
}
fOtherSet->complement();
fOtherSet->removeAll(*fMidNumSet);
fOtherSet->removeAll(*fNumericSet);
fOtherSet->removeAll(*fExtendNumLetSet);
+ fOtherSet->removeAll(*fWSegSpaceSet);
fOtherSet->removeAll(*fFormatSet);
fOtherSet->removeAll(*fExtendSet);
fOtherSet->removeAll(*fRegionalIndicatorSet);
- fOtherSet->removeAll(*fEBaseSet);
- fOtherSet->removeAll(*fEModifierSet);
- fOtherSet->removeAll(*fZWSSet);
- fOtherSet->removeAll(*fGAZSet);
-
+ fOtherSet->removeAll(*fZWJSet);
+ fOtherSet->removeAll(*fExtendedPictSet);
+
// Inhibit dictionary characters from being tested at all.
- fOtherSet->removeAll(*fDictionaryCjkSet);
- fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
+ fOtherSet->removeAll(*fDictionarySet);
fSets->addElement(fCRSet, status);
fSets->addElement(fLFSet, status);
fSets->addElement(fALetterSet, status);
fSets->addElement(fSingle_QuoteSet, status);
fSets->addElement(fDouble_QuoteSet, status);
- //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
+ //fSets->addElement(fKatakanaSet, status); // Omit Katakana from fSets, which omits Katakana characters
+ // from the test data. They are all in the dictionary set,
+ // which this (old, to be retired) monkey test cannot handle.
fSets->addElement(fMidLetterSet, status);
fSets->addElement(fMidNumLetSet, status);
fSets->addElement(fMidNumSet, status);
fSets->addElement(fExtendSet, status);
fSets->addElement(fOtherSet, status);
fSets->addElement(fExtendNumLetSet, status);
+ fSets->addElement(fWSegSpaceSet, status);
- fSets->addElement(fEBaseSet, status);
- fSets->addElement(fEModifierSet, status);
- fSets->addElement(fZWSSet, status);
- fSets->addElement(fGAZSet, status);
+ fSets->addElement(fZWJSet, status);
+ fSets->addElement(fExtendedPictSet, status);
if (U_FAILURE(status)) {
deferredStatus = status;
break;
};
}
- while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWSSet->contains(c3));
+ while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
if (p1 == p2) {
break;
};
- // Rule (3c) ZWJ x GAZ (Glue after ZWJ).
+ // Rule (3c) ZWJ x Extended_Pictographic
// Not ignoring extend chars, so peek into input text to
// get the potential ZWJ, the character immediately preceding c2.
// Sloppy UChar32 indexing: p2-1 may reference trail half
// but char32At will get the full code point.
- if (fZWSSet->contains(fText->char32At(p2-1)) && fGAZSet->contains(c2)) {
+ if (fZWJSet->contains(fText->char32At(p2-1)) && fExtendedPictSet->contains(c2)) {
+ continue;
+ }
+
+ // Rule (3d) Keep horizontal whitespace together.
+ if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
continue;
}
}
// Rule (13) Katakana x Katakana
+ // Note: matches UAX 29 rules, but doesn't come into play for ICU because
+ // all Katakana are handled by the dictionary breaker.
if (fKatakanaSet->contains(c1) &&
fKatakanaSet->contains(c2)) {
continue;
continue;
}
- // Rule 13c
+ // Rule 15 - 17 Group pairs of Regional Indicators.
if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
break;
}
continue;
}
- // Rule 13d
- if ((fEBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEModifierSet->contains(c2)) {
- continue;
- }
-
- // Rule 14. Break found here.
+ // Rule 999. Break found here.
break;
}
delete fFormatSet;
delete fExtendSet;
delete fExtendNumLetSet;
+ delete fWSegSpaceSet;
delete fRegionalIndicatorSet;
- delete fDictionaryCjkSet;
+ delete fDictionarySet;
delete fOtherSet;
- delete fEBaseSet;
- delete fEModifierSet;
- delete fZWSSet;
- delete fGAZSet;
+ delete fZWJSet;
+ delete fExtendedPictSet;
}
UnicodeSet *fB2;
UnicodeSet *fBA;
UnicodeSet *fBB;
+ UnicodeSet *fHH;
UnicodeSet *fHY;
UnicodeSet *fH2;
UnicodeSet *fH3;
UnicodeSet *fXX;
UnicodeSet *fEB;
UnicodeSet *fEM;
- UnicodeSet *fZJ;
+ UnicodeSet *fZWJ;
BreakIterator *fCharBI;
const UnicodeString *fText;
fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
+ fHH = new UnicodeSet();
fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
- fEB = new UnicodeSet(UnicodeString(
- "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C2-\\U0001F3C4\\U0001F3C7\\U0001F3CA-\\U0001F3CC"
- "\\U0001F442-\\U0001F443\\U0001F446-\\U0001F450\\U0001F466-\\U0001F478\\U0001F47C"
- "\\U0001F481-\\U0001F483\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F574-\\U0001F575\\U0001F57A\\U0001F590\\U0001F595-\\U0001F596"
- "\\U0001F645-\\U0001F647\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F6CC"
- "\\U0001F918-\\U0001F91E\\U0001F926\\U0001F930\\U0001F933-\\U0001F939\\U0001F93C-\\U0001F93E]"), status);
- fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
- fZJ = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);
+ fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
+ fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
+ fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
if (U_FAILURE(status)) {
deferredStatus = status;
fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
+ fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
- fID->addAll(*fEB); // Emoji Base and Emoji Modifier behave as ID.
- fID->addAll(*fEM);
- fAL->removeAll(*fEM);
-
-
- fAL->remove((UChar32)0x2695); // move u2695 from Al to Id
- fAL->remove((UChar32)0x2696); // move u2696 from Al to Id
- fAL->remove((UChar32)0x2764); // Emoji Proposal: move u2764 from Al to Id
- fAI->remove((UChar32)0x2640); // new ZWJ seqs
- fAI->remove((UChar32)0x2642); // new ZWJ seqs
- fID->add((UChar32)0x2695);
- fID->add((UChar32)0x2696);
- fID->add((UChar32)0x2764);
- fID->add((UChar32)0x2640);
- fID->add((UChar32)0x2642);
+ fHH->add(u'\u2010'); // Hyphen, '‐'
fSets->addElement(fBK, status);
fSets->addElement(fCR, status);
fSets->addElement(fSG, status);
fSets->addElement(fEB, status);
fSets->addElement(fEM, status);
- fSets->addElement(fZJ, status);
+ fSets->addElement(fZWJ, status);
+
const char *rules =
- "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
- "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
- "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
- "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
- "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
- "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
+ "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
+ "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
+ "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
+ "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
+ "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
+ "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
+ "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
fNumberMatcher = new RegexMatcher(
UnicodeString(rules, -1, US_INV), 0, status);
// LB 10 Treat any remaining combining mark as AL
if (fCM->contains(*posChar)) {
- *posChar = 0x41; // thisChar = 'A';
+ *posChar = u'A';
}
// Push the updated nextPos and nextChar back to our caller.
}
// LB 8 Break after zero width space
- if (fZW->contains(prevChar)) {
+ // ZW SP* ÷
+ // Scan backwards from prevChar for SP* ZW
+ tPos = prevPos;
+ while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
+ tPos = fText->moveIndex32(tPos, -1);
+ }
+ if (fZW->contains(fText->char32At(tPos))) {
break;
}
- // LB 8a ZJ x ID
+ // LB 25 Numbers
+ // Move this test up, before LB8a, because numbers can match a longer sequence that would
+ // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
+ if (fNumberMatcher->lookingAt(prevPos, status)) {
+ if (U_FAILURE(status)) {
+ break;
+ }
+ // Matched a number. But could have been just a single digit, which would
+ // not represent a "no break here" between prevChar and thisChar
+ int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
+ if (numEndIdx > pos) {
+ // Number match includes at least our two chars being checked
+ if (numEndIdx > nextPos) {
+ // Number match includes additional chars. Update pos and nextPos
+ // so that next loop iteration will continue at the end of the number,
+ // checking for breaks between last char in number & whatever follows.
+ pos = nextPos = numEndIdx;
+ do {
+ pos = fText->moveIndex32(pos, -1);
+ thisChar = fText->char32At(pos);
+ } while (fCM->contains(thisChar));
+ }
+ continue;
+ }
+ }
+
+ // LB 8a ZWJ x
// The monkey test's way of ignoring combining characters doesn't work
// for this rule. ZWJ is also a CM. Need to get the actual character
// preceding "thisChar", not ignoring combining marks, possibly ZWJ.
{
int32_t prevIdx = fText->moveIndex32(pos, -1);
UChar32 prevC = fText->char32At(prevIdx);
- if (fZJ->contains(prevC) && fID->contains(thisChar)) {
+ if (fZWJ->contains(prevC)) {
continue;
}
}
continue;
}
-
-
// LB 13 Don't break before closings.
- // NU x CL, NU x CP and NU x IS are not matched here so that they will
- // fall into LB 17 and the more general number regular expression.
//
- if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
- (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
- fEX->contains(thisChar) ||
- (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
- (!fNU->contains(prevChar) && fSY->contains(thisChar))) {
+ if (fCL->contains(thisChar) ||
+ fCP->contains(thisChar) ||
+ fEX->contains(thisChar) ||
+ fSY->contains(thisChar)) {
continue;
}
// Scan backwards, checking for this sequence.
// The OP char could include combining marks, so we actually check for
// OP CM* SP*
- // Another Twist: The Rule 67 fixes may have changed a SP CM
+ // Another Twist: The Rule 9 fixes may have changed an SP CM
// sequence into an ID char, so before scanning back through spaces,
// verify that prevChar is indeed a space. The prevChar variable
// may differ from fText[prevPos]
}
+ // LB 14a Break before an IS that begins a number and follows a space
+ if (nextPos < fText->length()) {
+ // note: UnicodeString::char32At(length) returns ffff, not distinguishable
+ // from a legit ffff character. So test length separately.
+ UChar32 nextChar = fText->char32At(nextPos);
+ if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
+ break;
+ }
+ }
+
+ // LB14b Do not break before numeric separators, even after spaces.
+ if (fIS->contains(thisChar)) {
+ continue;
+ }
+
// LB 15 QU SP* x OP
if (fOP->contains(thisChar)) {
// Scan backwards from prevChar to see if it is preceded by QU CM* SP*
break;
}
+ // LB 20.09 Don't break between Hyphens and letters if a break precedes the hyphen.
+ // Formerly this was a Finnish tailoring.
+ // Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
+ // ^($HY | $HH) $AL;
+ if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
+ prevPosX2 == -1) {
+ continue;
+ }
+
// LB 21
if (fBA->contains(thisChar) ||
fHY->contains(thisChar) ||
if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
(fEX->contains(prevChar) && fIN->contains(thisChar)) ||
(fHL->contains(prevChar) && fIN->contains(thisChar)) ||
- (fID->contains(prevChar) && fIN->contains(thisChar)) ||
+ ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
(fIN->contains(prevChar) && fIN->contains(thisChar)) ||
(fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
continue;
}
- // LB 23 ID x PO
- // AL x NU
- // HL x NU
- // NU x AL
- if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
- (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
- (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
- (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
- (fNU->contains(prevChar) && fHL->contains(thisChar)) ) {
+ // LB 23 (AL | HL) x NU
+ // NU x (AL | HL)
+ if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
continue;
}
-
- // LB 24 Do not break between prefix and letters or ideographs.
- // PR x ID
- // PR x (AL | HL)
- // PO x (AL | HL)
- // (AL | HL) x PR // Apple early addition
- // (AL | HL) x PO // Apple early addition
- if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
- (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
- (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
- ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fPR->contains(thisChar)) ||
- ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fPO->contains(thisChar)) ) {
+ if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
continue;
}
+ // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
+ // PR x (ID | EB | EM)
+ // (ID | EB | EM) x PO
+ if (fPR->contains(prevChar) &&
+ (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
+ continue;
+ }
+ if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
+ fPO->contains(thisChar)) {
+ continue;
+ }
-
- // LB 25 Numbers
- if (fNumberMatcher->lookingAt(prevPos, status)) {
- if (U_FAILURE(status)) {
- break;
- }
- // Matched a number. But could have been just a single digit, which would
- // not represent a "no break here" between prevChar and thisChar
- int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
- if (numEndIdx > pos) {
- // Number match includes at least our two chars being checked
- if (numEndIdx > nextPos) {
- // Number match includes additional chars. Update pos and nextPos
- // so that next loop iteration will continue at the end of the number,
- // checking for breaks between last char in number & whatever follows.
- pos = nextPos = numEndIdx;
- do {
- pos = fText->moveIndex32(pos, -1);
- thisChar = fText->char32At(pos);
- } while (fCM->contains(thisChar));
- }
- continue;
- }
+ // LB 24 Do not break between prefix and letters or ideographs.
+ // (PR | PO) x (AL | HL)
+ // (AL | HL) x (PR | PO)
+ if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
+ (fAL->contains(thisChar) || fHL->contains(thisChar))) {
+ continue;
+ }
+ if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
+ (fPR->contains(thisChar) || fPO->contains(thisChar))) {
+ continue;
}
+ // LB 25 Numbers: this check was moved up, ahead of LB 8a.
// LB 26 Do not break a Korean syllable.
if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
continue;
}
- // LB30a RI RI <break> RI
- // RI x RI
+ // LB30a RI RI ÷ RI
+ // RI x RI
if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
break;
}
if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
+ // Two Regional Indicators have been paired.
+ // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
+ // following RI. This is a hack.
+ thisChar = -1;
continue;
}
delete fB2;
delete fBA;
delete fBB;
+ delete fHH;
delete fHY;
delete fH2;
delete fH3;
delete fXX;
delete fEB;
delete fEM;
- delete fZJ;
+ delete fZWJ;
delete fCharBI;
delete fNumberMatcher;
for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
forward[count] = i;
if (count < expectedcount && expected[count] != i) {
- test->errln("break forward test failed: expected %d but got %d",
- expected[count], i);
+ test->errln("%s:%d break forward test failed: expected %d but got %d",
+ __FILE__, __LINE__, expected[count], i);
break;
}
count ++;
}
if (count != expectedcount) {
printStringBreaks(ustr, expected, expectedcount);
- test->errln("break forward test failed: missed %d match",
- expectedcount - count);
+ test->errln("%s:%d break forward test failed: missed %d match",
+ __FILE__, __LINE__, expectedcount - count);
return;
}
// testing boundaries
int j = expected[i - 1];
if (!bi->isBoundary(j)) {
printStringBreaks(ustr, expected, expectedcount);
- test->errln("isBoundary() failed. Expected boundary at position %d", j);
+ test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
+ __FILE__, __LINE__, j);
return;
}
for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
if (bi->isBoundary(j)) {
printStringBreaks(ustr, expected, expectedcount);
- test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
+ test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
+ __FILE__, __LINE__, j);
return;
}
}
count --;
if (forward[count] != i) {
printStringBreaks(ustr, expected, expectedcount);
- test->errln("happy break test previous() failed: expected %d but got %d",
- forward[count], i);
+ test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
+ __FILE__, __LINE__, forward[count], i);
break;
}
}
// int j = expected[i] + 1;
int j = ustr.moveIndex32(expected[i], 1);
for (; j <= expected[i + 1]; j ++) {
- if (bi->preceding(j) != expected[i]) {
+ int32_t expectedPreceding = expected[i];
+ int32_t actualPreceding = bi->preceding(j);
+ if (actualPreceding != expectedPreceding) {
printStringBreaks(ustr, expected, expectedcount);
- test->errln("preceding(): Not expecting boundary at position %d", j);
+ test->errln("%s:%d preceding(%d): expected %d, got %d",
+ __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
return;
}
}
Locale locale("en");
UErrorCode status = U_ZERO_ERROR;
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
- BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
+ LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
+ if (U_FAILURE(status)) {
+ errcheckln(status, "%s:%d Creation of break iterator failed %s",
+ __FILE__, __LINE__, u_errorName(status));
+ return;
+ }
UChar str[50];
static const char *strlist[] =
{
"\\u003b\\u0027\\u00b7\\u47a3",
};
int loop;
- if (U_FAILURE(status)) {
- errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
- return;
- }
for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
- // printf("looping %d\n", loop);
- u_unescape(strlist[loop], str, 20);
+ u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
UnicodeString ustr(str);
int forward[50];
int count = 0;
bi->setText(ustr);
- int prev = 0;
- int i;
- for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
- forward[count ++] = i;
- if (i > prev) {
- int j;
- for (j = prev + 1; j < i; j ++) {
- if (bi->isBoundary(j)) {
- printStringBreaks(ustr, forward, count);
- errln("happy boundary test failed: expected %d not a boundary",
- j);
- return;
- }
+ int prev = -1;
+ for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
+ ++count;
+ if (count >= UPRV_LENGTHOF(forward)) {
+ errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
+ __FILE__, __LINE__, loop, count, boundary);
+ return;
+ }
+ forward[count] = boundary;
+ if (boundary <= prev) {
+ errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
+ __FILE__, __LINE__, loop, prev, boundary);
+ break;
+ }
+ for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
+ if (bi->isBoundary(nonBoundary)) {
+ printStringBreaks(ustr, forward, count);
+ errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
+ __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
+ return;
}
}
- if (!bi->isBoundary(i)) {
+ if (!bi->isBoundary(boundary)) {
printStringBreaks(ustr, forward, count);
- errln("happy boundary test failed: expected %d a boundary",
- i);
+ errln("%s:%d happy boundary test failed: expected %d a boundary",
+ __FILE__, __LINE__, boundary);
return;
}
- prev = i;
+ prev = boundary;
}
}
- delete bi;
}
void RBBITest::TestLineBreaks(void)
#endif
}
-void RBBITest::TestMonkey(char *params) {
+void RBBITest::TestMonkey() {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
UErrorCode status = U_ZERO_ERROR;
loopCount = 10000;
}
- if (params) {
- UnicodeString p(params);
+ if (fTestParams) {
+ UnicodeString p(fTestParams);
loopCount = getIntParam("loop", p, loopCount);
seed = getIntParam("seed", p, seed);
TEST_ASSERT(iterationCount == 6);
}
+// Bug 7547 - verify that building a break iterator from empty rules produces an error.
+//            The constructor is expected to report U_BRK_RULE_SYNTAX, with the
+//            parse error positioned at line 1, offset 0.
+//
+void RBBITest::TestBug7547() {
+    UnicodeString rules;                 // Deliberately left empty.
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError parseError;
+    RuleBasedBreakIterator breakIterator(rules, parseError, status);
+    if (status != U_BRK_RULE_SYNTAX) {
+        errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
+    }
+    if (parseError.line != 1 || parseError.offset != 0) {
+        // Report file:line like the other failure messages in this file, so the
+        // failing check can be located directly from the test log.
+        errln("%s:%d parseError (line, offset) expected (1, 0), got (%d, %d)",
+              __FILE__, __LINE__, parseError.line, parseError.offset);
+    }
+}
+
+
+// Bug 12797 - build a break iterator from a small chained rule set that uses a
+//             variable ($v) and check that, for the text "abc", the first call
+//             to next() after first() lands on offset 3.
+void RBBITest::TestBug12797() {
+    UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError parseError;
+    RuleBasedBreakIterator bi(rules, parseError, status);
+    if (U_FAILURE(status)) {
+        // __LINE__ is an integer; it must be formatted with %d, not %s.
+        errln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+    UnicodeString text = "abc";
+    bi.setText(text);
+    bi.first();
+    int32_t boundary = bi.next();
+    if (boundary != 3) {
+        errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
+    }
+}
+
+// Bug 12918 - iterate word boundaries over a two character string that used to
+//             trigger an assertion failure in dictbe.cpp, and check that every
+//             boundary returned by ubrk_next() is strictly greater than the
+//             previous one.
+void RBBITest::TestBug12918() {
+    // This test triggers an assertion failure in dictbe.cpp
+    const UChar *crasherString = u"\u3325\u4a16";
+    UErrorCode status = U_ZERO_ERROR;
+    UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
+    if (U_FAILURE(status)) {
+        dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+    ubrk_first(iter);
+    int32_t pos = 0;
+    int32_t lastPos = -1;
+    while((pos = ubrk_next(iter)) != UBRK_DONE) {
+        if (pos <= lastPos) {
+            errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
+            break;
+        }
+        // Remember this boundary; without this the monotonicity check above
+        // would always compare against the initial -1 and could never fail.
+        lastPos = pos;
+    }
+    ubrk_close(iter);
+}
+
+// Bug 12932 - very deeply nested parentheses in a rule overflowed the node stack
+//             of the RBBI rule parser and caused a seg fault. Building a break
+//             iterator from such a rule must instead fail with U_BRK_RULE_SYNTAX.
+void RBBITest::TestBug12932() {
+    const UnicodeString deeplyNestedRule(
+        "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
+        "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
+        "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
+        ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
+        ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
+        ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
+
+    UParseError parseError;
+    UErrorCode errorCode = U_ZERO_ERROR;
+    RuleBasedBreakIterator rbbi(deeplyNestedRule, parseError, errorCode);
+
+    // The only acceptable outcome is a rule syntax error.
+    if (errorCode == U_BRK_RULE_SYNTAX) {
+        return;
+    }
+    errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
+          __FILE__, __LINE__, u_errorName(errorCode));
+}
+
+
+// Emoji Test.  Verify that the sequences defined in the Unicode data file emoji-test.txt
+//              remain undivided by ICU char, word and line break.
+//
+//              Each line of the data file is scanned for leading hex numbers; these are
+//              accumulated as code points into a test string. For every test string
+//              longer than one code unit, the first boundary following the start of the
+//              string must be the end of the string for each of the three iterators.
+void RBBITest::TestEmoji() {
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
+    UErrorCode status = U_ZERO_ERROR;
+
+    CharString testFileName;
+    testFileName.append(IntlTest::getSourceTestData(status), status);
+    testFileName.appendPathPart("emoji-test.txt", status);
+    if (U_FAILURE(status)) {
+        // __LINE__ is an integer; it must be formatted with %d, not %s.
+        errln("%s:%d %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+    logln("Opening data file %s\n", testFileName.data());
+
+    int    len;
+    UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
+    if (U_FAILURE(status) || testFile == NULL) {
+        errln("%s:%d %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+    UnicodeString testFileAsString(testFile, len);
+    delete [] testFile;
+
+    RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
+    RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
+    //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
+    int32_t lineNumber = 0;
+
+    LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
+    LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
+    LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
+    if (U_FAILURE(status)) {
+        dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+
+    while (lineMatcher.find()) {
+        ++lineNumber;
+        UnicodeString line = lineMatcher.group(status);
+        hexMatcher.reset(line);
+        UnicodeString testString;   // accumulates the emoji sequence.
+        while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
+            UnicodeString hex = hexMatcher.group(1, status);
+            if (hex.length() > 8) {
+                errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
+                break;
+            }
+            CharString hex8;
+            hex8.appendInvariantChars(hex, status);
+            UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
+            // An eight digit value such as "ffffffff" can wrap to a negative
+            // UChar32 in the cast above, so both ends of the range are checked.
+            if (c >= 0 && c <= 0x10ffff) {
+                testString.append(c);
+            } else {
+                errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
+                        __FILE__, __LINE__, lineNumber, hex8.data());
+                break;
+            }
+        }
+
+        if (testString.length() > 1) {
+            // The three checks are kept separate: their messages are identical,
+            // so the reported __LINE__ is what identifies the failing iterator.
+            charBreaks->setText(testString);
+            charBreaks->first();
+            int32_t firstBreak = charBreaks->next();
+            if (testString.length() != firstBreak) {
+                errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
+                        __FILE__, __LINE__, lineNumber, firstBreak);
+            }
+            wordBreaks->setText(testString);
+            wordBreaks->first();
+            firstBreak = wordBreaks->next();
+            if (testString.length() != firstBreak) {
+                errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
+                        __FILE__, __LINE__, lineNumber, firstBreak);
+            }
+            lineBreaks->setText(testString);
+            lineBreaks->first();
+            firstBreak = lineBreaks->next();
+            if (testString.length() != firstBreak) {
+                errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
+                        __FILE__, __LINE__, lineNumber, firstBreak);
+            }
+        }
+    }
+#endif
+}
+
+
+// TestBug12519 - Correct handling of Locales by assignment / copy / clone
+//
+//    Checks that word break iterators created for English and France report the
+//    expected valid locale, that clones keep both equality and locale, and that
+//    assigning one iterator to another makes the two compare equal.
+
+void RBBITest::TestBug12519() {
+    UErrorCode status = U_ZERO_ERROR;
+    LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
+    LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
+    if (!assertSuccess(WHERE, status)) {
+        dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+    assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
+
+    assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
+    assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
+
+    LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone());
+    assertTrue(WHERE, *biEn == *cloneEn);
+    assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
+
+    LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone());
+    assertTrue(WHERE, *biFr == *cloneFr);
+    assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
+
+    LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
+    // Creating the line iterator (or one of the getLocale() calls above) may have
+    // failed; biDe must not be dereferenced in that case.
+    if (!assertSuccess(WHERE, status)) {
+        return;
+    }
+    UnicodeString text("Hallo Welt");
+    biDe->setText(text);
+    assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
+    *biDe = *biFr;
+    assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
+}
+
+// Bug 12677 - getRules() returns the rule source with comments stripped.
+//             Make sure that '#' characters that do not start a comment (inside
+//             a set, or quoted) survive the stripping, while real comments go.
+void RBBITest::TestBug12677() {
+    // Check that stripping of comments from rules for getRules() is not confused by
+    // the presence of '#' characters in the rules that do not introduce comments.
+    const UnicodeString sourceRules(u"!!forward; \n"
+                         "$x = [ab#];  # a set with a # literal. \n"
+                         " # .;        # a comment that looks sort of like a rule.   \n"
+                         " '#' '?';    # a rule with a quoted #   \n"
+                         );
+    const UnicodeString expectedRules(u"!!forward; $x = [ab#]; '#' '?'; ");
+
+    UParseError parseError;
+    UErrorCode errorCode = U_ZERO_ERROR;
+    RuleBasedBreakIterator iterator(sourceRules, parseError, errorCode);
+    assertSuccess(WHERE, errorCode);
+
+    // Round trip: the rules handed back must be the comment-free form.
+    assertEquals(WHERE, expectedRules, iterator.getRules());
+}
+
+
+// TestTableRedundancies - inspect the forward state table of the English line
+//     break iterator and report an error if two character category columns
+//     (category 0 excepted) or two state rows have identical contents.
+void RBBITest::TestTableRedundancies() {
+    UErrorCode status = U_ZERO_ERROR;
+
+    LocalPointer<RuleBasedBreakIterator> bi (
+        (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
+    if (!assertSuccess(WHERE, status)) {
+        return;
+    }
+
+    RBBIDataWrapper *dataWrapper = bi->fData;
+    const RBBIStateTable *forwardTable = dataWrapper->fForwardTable;
+    const int32_t categoryCount = dataWrapper->fHeader->fCatCount;
+    const int32_t stateCount = (int32_t)forwardTable->fNumStates;
+
+    // Check for duplicate columns (character categories).
+    // Each column is flattened into a string holding the next-state value of
+    // every row from 1 upward, so that whole columns compare with operator==.
+
+    std::vector<UnicodeString> columns;
+    for (int32_t category = 0; category < categoryCount; category++) {
+        columns.emplace_back();
+        UnicodeString &columnText = columns.back();
+        for (int32_t state = 1; state < stateCount; state++) {
+            RBBIStateTableRow *row = (RBBIStateTableRow *) (forwardTable->fTableData + (forwardTable->fRowLen * state));
+            columnText.append(row->fNextState[category]);
+        }
+    }
+
+    // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
+    // Only the first duplicate pair found is reported.
+    bool duplicateColumnFound = false;
+    for (int c1 = 1; c1 < categoryCount && !duplicateColumnFound; c1++) {
+        for (int c2 = c1 + 1; c2 < categoryCount; c2++) {
+            if (columns.at(c1) == columns.at(c2)) {
+                errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
+                duplicateColumnFound = true;
+                break;
+            }
+        }
+    }
+
+    // Check for duplicate states.
+    // Each row is flattened into a string of its accepting, look-ahead and tag
+    // fields followed by the next-state value for every category.
+    std::vector<UnicodeString> rows;
+    for (int32_t state = 0; state < stateCount; state++) {
+        RBBIStateTableRow *row = (RBBIStateTableRow *) (forwardTable->fTableData + (forwardTable->fRowLen * state));
+        assertTrue(WHERE, row->fAccepting >= -1);
+        UnicodeString rowText;
+        rowText.append(row->fAccepting + 1);   // values of -1 are expected.
+        rowText.append(row->fLookAhead);
+        rowText.append(row->fTagIdx);
+        for (int32_t category = 0; category < categoryCount; category++) {
+            rowText.append(row->fNextState[category]);
+        }
+        rows.push_back(rowText);
+    }
+    for (int r1 = 0; r1 < stateCount; r1++) {
+        for (int r2 = r1 + 1; r2 < stateCount; r2++) {
+            if (rows.at(r1) == rows.at(r2)) {
+                errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
+                return;
+            }
+        }
+    }
+}
+
+// Bug 13447: getRuleStatus() must always describe the boundary at current(),
+//            including after next() has run off the end and returned DONE.
+
+void RBBITest::TestBug13447() {
+    UErrorCode errorCode = U_ZERO_ERROR;
+    LocalPointer<RuleBasedBreakIterator> wordIterator(
+        (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), errorCode));
+    if (!assertSuccess(WHERE, errorCode)) {
+        return;
+    }
+
+    UnicodeString digits(u"1234");
+    wordIterator->setText(digits);
+
+    // Freshly set text: no word status yet.
+    assertEquals(WHERE, UBRK_WORD_NONE, wordIterator->getRuleStatus());
+
+    // The whole string is a single number, ending at offset 4.
+    assertEquals(WHERE, 4, wordIterator->next());
+    assertEquals(WHERE, UBRK_WORD_NUMBER, wordIterator->getRuleStatus());
+
+    // Stepping past the end returns DONE but leaves position and status alone.
+    assertEquals(WHERE, UBRK_DONE, wordIterator->next());
+    assertEquals(WHERE, 4, wordIterator->current());
+    assertEquals(WHERE, UBRK_WORD_NUMBER, wordIterator->getRuleStatus());
+}
+
+// TestReverse exercises both the synthesized safe reverse rules and the logic
+//             for filling the break iterator cache when starting from random
+//             positions in the text.
+//
+//             It's a monkey test, working on random data. Expected results and
+//             actual results are both gathered by the overload taking a break
+//             iterator; this entry point runs that overload once for each of the
+//             English character, word, line and sentence iterators, checking the
+//             creation status after each run.
+
+void RBBITest::TestReverse() {
+    UErrorCode status = U_ZERO_ERROR;
+
+    std::unique_ptr<RuleBasedBreakIterator> characterIterator(
+        (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status));
+    TestReverse(std::move(characterIterator));
+    assertSuccess(WHERE, status, true);
+
+    status = U_ZERO_ERROR;
+    std::unique_ptr<RuleBasedBreakIterator> wordIterator(
+        (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
+    TestReverse(std::move(wordIterator));
+    assertSuccess(WHERE, status, true);
+
+    status = U_ZERO_ERROR;
+    std::unique_ptr<RuleBasedBreakIterator> lineIterator(
+        (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
+    TestReverse(std::move(lineIterator));
+    assertSuccess(WHERE, status, true);
+
+    status = U_ZERO_ERROR;
+    std::unique_ptr<RuleBasedBreakIterator> sentenceIterator(
+        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
+    TestReverse(std::move(sentenceIterator));
+    assertSuccess(WHERE, status, true);
+}
+
+// Run the reverse / random access monkey test on one break iterator.
+// A null iterator is silently ignored.
+void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
+    if (!bi) {
+        return;
+    }
+
+    // From the mapping trie in the break iterator's internal data, create a
+    // vector of UnicodeStrings, one for each character category, containing
+    // all of the code points that map to that category. Unicode planes 0 and 1 only,
+    // to avoid an excess of unassigned code points.
+
+    RBBIDataWrapper *wrapper = bi->fData;
+    const int32_t categoryCount = wrapper->fHeader->fCatCount;
+    UTrie2 *trie = wrapper->fTrie;
+
+    std::vector<UnicodeString> codePointsByCategory(categoryCount, UnicodeString());
+    for (int codePoint = 0; codePoint < 0x1fff0; ++codePoint) {
+        // And off the dictionary bit from the category.
+        const int category = utrie2_get32(trie, codePoint) & ~0x4000;
+        const UBool categoryInRange = (category >= 0 && category < categoryCount);
+        assertTrue(WHERE, categoryInRange);
+        if (!categoryInRange) {
+            return;
+        }
+        codePointsByCategory[category].append(codePoint);
+    }
+
+    // Build the random test text: pick a random category, then a random
+    // position within that category's string. Empty categories add nothing.
+    icu_rand randomGen;
+    const int testStringLength = 10000;
+    UnicodeString testString;
+
+    for (int n = 0; n < testStringLength; ++n) {
+        int charClass = randomGen() % categoryCount;
+        const UnicodeString &candidates = codePointsByCategory[charClass];
+        if (candidates.length() <= 0) {
+            continue;
+        }
+        testString.append(candidates.char32At(randomGen() % candidates.length()));
+    }
+
+    // Pass 1: query every index in ascending order and record the answers.
+    typedef std::pair<UBool, int32_t> Result;
+    std::vector<Result> expectedResults;
+    bi->setText(testString);
+    const int32_t textLength = testString.length();
+    for (int index = 0; index < textLength; ++index) {
+        bool isboundary = bi->isBoundary(index);
+        int ruleStatus = bi->getRuleStatus();
+        expectedResults.push_back(Result(isboundary, ruleStatus));
+    }
+
+    // Pass 2: query every index in descending order, resetting the text before
+    // each query, and compare with the answers recorded in pass 1.
+    for (int index = textLength; index-- > 0; ) {
+        bi->setText(testString);   // clears the internal break cache
+        const Result &expected = expectedResults[index];
+        assertEquals(WHERE, expected.first, bi->isBoundary(index));
+        assertEquals(WHERE, expected.second, bi->getRuleStatus());
+    }
+}
+
+
+// Ticket 13692 - finding word boundaries in very large numbers or words could
+//                be very time consuming. When the problem was present, this test
+//                would run more than fifteen minutes, which is to say, the failure
+//                was noticeable.
+
+void RBBITest::TestBug13692() {
+    UErrorCode errorCode = U_ZERO_ERROR;
+    LocalPointer<RuleBasedBreakIterator> wordIterator((RuleBasedBreakIterator *)
+            BreakIterator::createWordInstance(Locale::getEnglish(), errorCode), errorCode);
+    if (!assertSuccess(WHERE, errorCode, true)) {
+        return;
+    }
+
+    // One million '3' digits ...
+    constexpr int32_t LENGTH = 1000000;
+    UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
+    // ... with the even offsets 0, 2, ... 18 overwritten by spaces.
+    for (int k = 0; k < 10; ++k) {
+        longNumber.setCharAt(2 * k, u' ');
+    }
+
+    // A position deep inside the run of digits must not be a word boundary.
+    wordIterator->setText(longNumber);
+    assertFalse(WHERE, wordIterator->isBoundary(LENGTH-5));
+    assertSuccess(WHERE, errorCode);
+}
//
// TestDebug - A place-holder test for debugging purposes.
// for tracing without a lot of unwanted extra stuff happening.
//
void RBBITest::TestDebug(void) {
-#if 0
- UErrorCode status = U_ZERO_ERROR;
- int pos = 0;
- int ruleStatus = 0;
-
- RuleBasedBreakIterator* bi =
- // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
- // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
- (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
- UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
- // UnicodeString s("Aaa. Bcd");
- s = s.unescape();
- bi->setText(s);
- UBool r = bi->isBoundary(8);
- printf("%s", r?"true":"false");
- return;
- pos = bi->last();
- do {
- // ruleStatus = bi->getRuleStatus();
- printf("%d\t%d\n", pos, ruleStatus);
- pos = bi->previous();
- } while (pos != BreakIterator::DONE);
-#endif
+ // Create the English character break iterator, fetch its rule source with
+ // getRules(), and check that a new RuleBasedBreakIterator can be built from
+ // those rules without error.
+ UErrorCode status = U_ZERO_ERROR;
+ LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
+ BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
+ // Third argument 'true': NOTE(review) presumably flags the failure as a
+ // possible missing-data error, as elsewhere in this file - confirm.
+ if (!assertSuccess(WHERE, status, true)) {
+ return;
+ }
+ const UnicodeString &rules = bi->getRules();
+ UParseError pe;
+ // newbi is never used; only the status set by its construction is checked.
+ LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
+ assertSuccess(WHERE, status);
}
void RBBITest::TestProperties() {
}
}
-#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
+#endif // #if !UCONFIG_NO_BREAK_ITERATION