ICU-64243.0.1.tar.gz

[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
diff --git a/icuSources/test/intltest/rbbitst.cpp b/icuSources/test/intltest/rbbitst.cpp

index 92958e72f7cde2034dce20d1597c86b1646aa830..ebd123640f669d458c3ded7de93088d854ddfc52 100644 (file)
--- a/icuSources/test/intltest/rbbitst.cpp
+++ b/icuSources/test/intltest/rbbitst.cpp
@@ -17,6 +17,8 @@
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
+#include <utility>
+#include <vector>
  
  #include "unicode/brkiter.h"
  #include "unicode/localpointer.h"
@@ -39,10 +41,12 @@
  #include "cstr.h"
  #include "intltest.h"
  #include "rbbitst.h"
+#include "rbbidata.h"
  #include "utypeinfo.h"  // for 'typeid' to work
  #include "uvector.h"
  #include "uvectr32.h"
  
+
  #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
  #include "unicode/filteredbrk.h"
  #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
@@ -53,7 +57,6 @@
  #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
      errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
  
-
  //---------------------------------------------
  // runIndexedTest
  //---------------------------------------------
@@ -74,10 +77,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
  #if !UCONFIG_NO_FILE_IO
      TESTCASE_AUTO(TestBug4153072);
  #endif
-    TESTCASE_AUTO(TestStatusReturn);
  #if !UCONFIG_NO_FILE_IO
      TESTCASE_AUTO(TestUnicodeFiles);
-    TESTCASE_AUTO(TestEmptyString);
  #endif
      TESTCASE_AUTO(TestGetAvailableLocales);
      TESTCASE_AUTO(TestGetDisplayName);
@@ -107,151 +108,16 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
      TESTCASE_AUTO(TestBug12918);
      TESTCASE_AUTO(TestBug12932);
      TESTCASE_AUTO(TestEmoji);
+    TESTCASE_AUTO(TestBug12519);
+    TESTCASE_AUTO(TestBug12677);
+    TESTCASE_AUTO(TestTableRedundancies);
+    TESTCASE_AUTO(TestBug13447);
+    TESTCASE_AUTO(TestReverse);
+    TESTCASE_AUTO(TestBug13692);
      TESTCASE_AUTO_END;
  }
  
  
-//---------------------------------------------------------------------------
-//
-//   class BITestData   Holds a set of Break iterator test data and results
-//                      Includes
-//                         - the string data to be broken
-//                         - a vector of the expected break positions.
-//                         - a vector of source line numbers for the data,
-//                               (to help see where errors occured.)
-//                         - The expected break tag values.
-//                         - Vectors of actual break positions and tag values.
-//                         - Functions for comparing actual with expected and
-//                            reporting errors.
-//
-//----------------------------------------------------------------------------
-class BITestData {
-public:
-    UnicodeString    fDataToBreak;
-    UVector          fExpectedBreakPositions;
-    UVector          fExpectedTags;
-    UVector          fLineNum;
-    UVector          fActualBreakPositions;   // Test Results.
-    UVector          fActualTags;
-
-    BITestData(UErrorCode &status);
-    void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
-    void             checkResults(const char *heading, RBBITest *test);
-    void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
-    void             clearResults();
-};
-
-//
-// Constructor.
-//
-BITestData::BITestData(UErrorCode &status)
-: fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
-  fActualTags(status)
-{
-}
-
-//
-// addDataChunk.   Add a section (non-breaking) piece if data to the test data.
-//                 The macro form collects the line number, which is helpful
-//                 when tracking down failures.
-//
-//                 A null data item is inserted at the start of each test's data
-//                  to put the starting zero into the data list.  The position saved for
-//                  each non-null item is its ending position.
-//
-#define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
-void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
-    if (U_FAILURE(status)) {return;}
-    if (data != NULL) {
-        fDataToBreak.append(CharsToUnicodeString(data));
-    }
-    fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
-    fExpectedTags.addElement(tag, status);
-    fLineNum.addElement(lineNum, status);
-}
-
-
-//
-//  checkResults.   Compare the actual and expected break positions, report any differences.
-//
-void BITestData::checkResults(const char *heading, RBBITest *test) {
-    int32_t   expectedIndex = 0;
-    int32_t   actualIndex = 0;
-
-    for (;;) {
-        // If we've run through both the expected and actual results vectors, we're done.
-        //   break out of the loop.
-        if (expectedIndex >= fExpectedBreakPositions.size() &&
-            actualIndex   >= fActualBreakPositions.size()) {
-            break;
-        }
-
-
-        if (expectedIndex >= fExpectedBreakPositions.size()) {
-            err(heading, test, expectedIndex-1, actualIndex);
-            actualIndex++;
-            continue;
-        }
-
-        if (actualIndex >= fActualBreakPositions.size()) {
-            err(heading, test, expectedIndex, actualIndex-1);
-            expectedIndex++;
-            continue;
-        }
-
-        if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
-            err(heading, test, expectedIndex, actualIndex);
-            // Try to resync the positions of the indices, to avoid a rash of spurious erros.
-            if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
-                actualIndex++;
-            } else {
-                expectedIndex++;
-            }
-            continue;
-        }
-
-        if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
-            test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
-                heading, fLineNum.elementAt(expectedIndex),
-                fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
-        }
-
-        actualIndex++;
-        expectedIndex++;
-    }
-}
-
-//
-//  err   -  An error was found.  Report it, along with information about where the
-//                                incorrectly broken test data appeared in the source file.
-//
-void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
-{
-    int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
-    int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
-    int32_t   o        = 0;
-    int32_t   line     = fLineNum.elementAti(expectedIdx);
-    if (expectedIdx > 0) {
-        // The line numbers are off by one because a premature break occurs somewhere
-        //    within the previous item, rather than at the start of the current (expected) item.
-        //    We want to report the offset of the unexpected break from the start of
-        //      this previous item.
-        o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
-    }
-    if (actual < expected) {
-        test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
-    } else {
-        test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
-    }
-}
-
-
-void BITestData::clearResults() {
-    fActualBreakPositions.removeAllElements();
-    fActualTags.removeAllElements();
-}
-
-
  //--------------------------------------------------------------------------------------
  //
  //    RBBITest    constructor and destructor
@@ -266,51 +132,6 @@ RBBITest::RBBITest() {
  RBBITest::~RBBITest() {
  }
  
-//-----------------------------------------------------------------------------------
-//
-//   Test for status {tag} return value from break rules.
-//        TODO:  a more thorough test.
-//
-//-----------------------------------------------------------------------------------
-void RBBITest::TestStatusReturn() {
-     UnicodeString rulesString1("$Letters = [:L:];\n"
-                                  "$Numbers = [:N:];\n"
-                                  "$Letters+{1};\n"
-                                  "$Numbers+{2};\n"
-                                  "Help\\ /me\\!{4};\n"
-                                  "[^$Letters $Numbers];\n"
-                                  "!.*;\n", -1, US_INV);
-     UnicodeString testString1  = "abc123..abc Help me Help me!";
-                                // 01234567890123456789012345678
-     int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
-     int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
-
-     UErrorCode status=U_ZERO_ERROR;
-     UParseError    parseError;
-
-     LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
-     if(U_FAILURE(status)) {
-         dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__,  u_errorName(status));
-         return;
-     }
-     int32_t  pos;
-     int32_t  i = 0;
-     bi->setText(testString1);
-     for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
-         if (pos != bounds1[i]) {
-             errln("%s:%d  expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
-             break;
-         }
-
-         int tag = bi->getRuleStatus();
-         if (tag != brkStatus[i]) {
-             errln("%s:%d  break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
-             break;
-         }
-         i++;
-     }
-}
-
  
  static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
      UErrorCode status = U_ZERO_ERROR;
@@ -318,7 +139,7 @@ static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
      printf("code    alpha extend alphanum type word sent line name\n");
      int nextExpectedIndex = 0;
      utext_setNativeIndex(tstr, 0);
-    for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
+    for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
          if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
              printf("------------------------------------------------ %d\n", j);
              ++nextExpectedIndex;
@@ -391,277 +212,12 @@ void RBBITest::TestBug3818() {
      delete bi;
  }
  
-//----------------------------------------------------------------------------
-//
-// generalIteratorTest      Given a break iterator and a set of test data,
-//                          Run the tests and report the results.
-//
-//----------------------------------------------------------------------------
-void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
-{
-
-    bi.setText(td.fDataToBreak);
-
-    testFirstAndNext(bi, td);
-
-    testLastAndPrevious(bi, td);
-
-    testFollowing(bi, td);
-    testPreceding(bi, td);
-    testIsBoundary(bi, td);
-    doMultipleSelectionTest(bi, td);
-}
-
-
-//
-//   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
-//                       kind of loop.
-//
-void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
-{
-    UErrorCode  status = U_ZERO_ERROR;
-    int32_t     p;
-    int32_t     lastP = -1;
-    int32_t     tag;
-
-    logln("Test first and next");
-    bi.setText(td.fDataToBreak);
-    td.clearResults();
-
-    for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
-        td.fActualBreakPositions.addElement(p, status);  // Save result.
-        tag = bi.getRuleStatus();
-        td.fActualTags.addElement(tag, status);
-        if (p <= lastP) {
-            // If the iterator is not making forward progress, stop.
-            //  No need to raise an error here, it'll be detected in the normal check of results.
-            break;
-        }
-        lastP = p;
-    }
-    td.checkResults("testFirstAndNext", this);
-}
-
-
-//
-//  TestLastAndPrevious.   Run the iterator backwards, starting with last().
-//
-void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
-{
-    UErrorCode  status = U_ZERO_ERROR;
-    int32_t     p;
-    int32_t     lastP  = 0x7ffffffe;
-    int32_t     tag;
-
-    logln("Test last and previous");
-    bi.setText(td.fDataToBreak);
-    td.clearResults();
-
-    for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
-        // Save break position.  Insert it at start of vector of results, shoving
-        //    already-saved results further towards the end.
-        td.fActualBreakPositions.insertElementAt(p, 0, status);
-        // bi.previous();   // TODO:  Why does this fix things up????
-        // bi.next();
-        tag = bi.getRuleStatus();
-        td.fActualTags.insertElementAt(tag, 0, status);
-        if (p >= lastP) {
-            // If the iterator is not making progress, stop.
-            //  No need to raise an error here, it'll be detected in the normal check of results.
-            break;
-        }
-        lastP = p;
-    }
-    td.checkResults("testLastAndPrevious", this);
-}
-
-
-void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
-{
-    UErrorCode  status = U_ZERO_ERROR;
-    int32_t     p;
-    int32_t     tag;
-    int32_t     lastP  = -2;     // A value that will never be returned as a break position.
-                                 //   cannot be -1; that is returned for DONE.
-    int         i;
-
-    logln("testFollowing():");
-    bi.setText(td.fDataToBreak);
-    td.clearResults();
-
-    // Save the starting point, since we won't get that out of following.
-    p = bi.first();
-    td.fActualBreakPositions.addElement(p, status);  // Save result.
-    tag = bi.getRuleStatus();
-    td.fActualTags.addElement(tag, status);
-
-    for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
-        p = bi.following(i);
-        if (p != lastP) {
-            if (p == RuleBasedBreakIterator::DONE) {
-                break;
-            }
-            // We've reached a new break position.  Save it.
-            td.fActualBreakPositions.addElement(p, status);  // Save result.
-            tag = bi.getRuleStatus();
-            td.fActualTags.addElement(tag, status);
-            lastP = p;
-        }
-    }
-    // The loop normally exits by means of the break in the middle.
-    // Make sure that the index was at the correct position for the break iterator to have
-    //   returned DONE.
-    if (i != td.fDataToBreak.length()) {
-        errln("testFollowing():  iterator returned DONE prematurely.");
-    }
-
-    // Full check of all results.
-    td.checkResults("testFollowing", this);
-}
-
-
-
-void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
-    UErrorCode  status = U_ZERO_ERROR;
-    int32_t     p;
-    int32_t     tag;
-    int32_t     lastP  = 0x7ffffffe;
-    int         i;
-
-    logln("testPreceding():");
-    bi.setText(td.fDataToBreak);
-    td.clearResults();
-
-    p = bi.last();
-    td.fActualBreakPositions.addElement(p, status);
-    tag = bi.getRuleStatus();
-    td.fActualTags.addElement(tag, status);
-
-    for (i = td.fDataToBreak.length(); i>=-1; i--) {
-        p = bi.preceding(i);
-        if (p != lastP) {
-            if (p == RuleBasedBreakIterator::DONE) {
-                break;
-            }
-            // We've reached a new break position.  Save it.
-            td.fActualBreakPositions.insertElementAt(p, 0, status);
-            lastP = p;
-            tag = bi.getRuleStatus();
-            td.fActualTags.insertElementAt(tag, 0, status);
-        }
-    }
-    // The loop normally exits by means of the break in the middle.
-    // Make sure that the index was at the correct position for the break iterator to have
-    //   returned DONE.
-    if (i != 0) {
-        errln("testPreceding():  iterator returned DONE prematurely.");
-    }
-
-    // Full check of all results.
-    td.checkResults("testPreceding", this);
-}
-
-
-
-void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
-    UErrorCode  status = U_ZERO_ERROR;
-    int         i;
-    int32_t     tag;
-
-    logln("testIsBoundary():");
-    bi.setText(td.fDataToBreak);
-    td.clearResults();
-
-    for (i = 0; i <= td.fDataToBreak.length(); i++) {
-        if (bi.isBoundary(i)) {
-            td.fActualBreakPositions.addElement(i, status);  // Save result.
-            tag = bi.getRuleStatus();
-            td.fActualTags.addElement(tag, status);
-        }
-    }
-    td.checkResults("testIsBoundary: ", this);
-}
-
-
-
-void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
-{
-    iterator.setText(td.fDataToBreak);
-
-    RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
-    int32_t offset = iterator.first();
-    int32_t testOffset;
-    int32_t count = 0;
-
-    logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
-
-    if (*testIterator != iterator)
-        errln("clone() or operator!= failed: two clones compared unequal");
-
-    do {
-        testOffset = testIterator->first();
-        testOffset = testIterator->next(count);
-        if (offset != testOffset)
-            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
-
-        if (offset != RuleBasedBreakIterator::DONE) {
-            count++;
-            offset = iterator.next();
-
-            if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
-                errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
-                if (count > 10000 || offset == -1) {
-                    errln("operator== failed too many times. Stopping test.");
-                    if (offset == -1) {
-                        errln("Does (RuleBasedBreakIterator::DONE == -1)?");
-                    }
-                    return;
-                }
-            }
-        }
-    } while (offset != RuleBasedBreakIterator::DONE);
-
-    // now do it backwards...
-    offset = iterator.last();
-    count = 0;
-
-    do {
-        testOffset = testIterator->last();
-        testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
-        if (offset != testOffset)
-            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
-
-        if (offset != RuleBasedBreakIterator::DONE) {
-            count--;
-            offset = iterator.previous();
-        }
-    } while (offset != RuleBasedBreakIterator::DONE);
-
-    delete testIterator;
-}
-
  
  //---------------------------------------------
  //
  //     other tests
  //
  //---------------------------------------------
-void RBBITest::TestEmptyString()
-{
-    UnicodeString text = "";
-    UErrorCode status = U_ZERO_ERROR;
-
-    BITestData x(status);
-    ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
-    RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
-    if (U_FAILURE(status))
-    {
-        errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
-        return;
-    }
-    generalIteratorTest(*bi, x);
-    delete bi;
-}
  
  void RBBITest::TestGetAvailableLocales()
  {
@@ -1019,8 +575,9 @@ void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
      //
      //  Run the iterator backwards, verify that the same breaks are found.
      //
-    prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
-    for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
+    prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
+    bp = t->bi->last();
+    while (bp != BreakIterator::DONE) {
          if (prevBP ==  bp) {
              // Fail for lack of progress.
              errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
@@ -1058,6 +615,7 @@ void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
          }
  
          prevBP = bp;
+        bp = t->bi->previous();
      }
  
      // Verify that there were no missed breaks prior to the last one found
@@ -1081,10 +639,10 @@ void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
      }
  
      // Check following()
-    for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
+    for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
          int32_t actualBreak = t->bi->following(i);
          int32_t expectedBreak = BreakIterator::DONE;
-        for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
+        for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
              if (t->getExpectedBreak(j) != 0) {
                  expectedBreak = j;
                  break;
@@ -1098,7 +656,7 @@ void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
      }
  
      // Check preceding()
-    for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
+    for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
          int32_t actualBreak = t->bi->preceding(i);
          int32_t expectedBreak = BreakIterator::DONE;
  
@@ -1108,7 +666,7 @@ void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
          // Therefore, start looking at the expected break data not at i-1, but at
          // the start of code point index - 1.
          utext_setNativeIndex(t->textToBreak, i);
-        int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
+        int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
          for (; j >= 0; j--) {
              if (t->getExpectedBreak(j) != 0) {
                  expectedBreak = j;
@@ -1131,34 +689,27 @@ void RBBITest::TestExtended() {
      UErrorCode      status  = U_ZERO_ERROR;
      Locale          locale("");
  
-    UnicodeString       rules;
      TestParams          tp(status);
  
-    RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
+    RegexMatcher      localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
      if (U_FAILURE(status)) {
          dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
      }
  
-
      //
      //  Open and read the test data file.
      //
      const char *testDataDirectory = IntlTest::getSourceTestData(status);
-    char testFileName[1000];
-    if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
-        errln("Can't open test data.  Path too long.");
-        return;
-    }
-    strcpy(testFileName, testDataDirectory);
-    strcat(testFileName, "rbbitst.txt");
+    CharString testFileName(testDataDirectory, -1, status);
+    testFileName.append("rbbitst.txt", -1, status);
  
      int    len;
-    UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
+    UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
      if (U_FAILURE(status)) {
-        return; /* something went wrong, error already output */
+        errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
+        return;
      }
  
-
      bool skipTest = false; // Skip this test?
  
      //
@@ -1170,7 +721,8 @@ void RBBITest::TestExtended() {
          PARSE_COMMENT,
          PARSE_TAG,
          PARSE_DATA,
-        PARSE_NUM
+        PARSE_NUM,
+        PARSE_RULES
      }
      parseState = PARSE_TAG;
  
@@ -1181,7 +733,10 @@ void RBBITest::TestExtended() {
      int32_t    column   = 0;
      int32_t    charIdx  = 0;
  
-    int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
+    int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
+
+    UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
+    int32_t             rulesFirstLine = 0;  // Line number where the current <rules>/<badrules> block starts; stays 0 until such a tag is parsed
  
      for (charIdx = 0; charIdx < len; ) {
          status = U_ZERO_ERROR;
@@ -1215,41 +770,50 @@ void RBBITest::TestExtended() {
              if (u_isUWhiteSpace(c)) {
                  break;
              }
-            if (testString.compare(charIdx-1, 6, "<word>") == 0) {
+            if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
                  delete tp.bi;
                  tp.bi = BreakIterator::createWordInstance(locale,  status);
                  skipTest = false;
                  charIdx += 5;
                  break;
              }
-            if (testString.compare(charIdx-1, 6, "<char>") == 0) {
+            if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
                  delete tp.bi;
                  tp.bi = BreakIterator::createCharacterInstance(locale,  status);
                  skipTest = false;
                  charIdx += 5;
                  break;
              }
-            if (testString.compare(charIdx-1, 6, "<line>") == 0) {
+            if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
                  delete tp.bi;
                  tp.bi = BreakIterator::createLineInstance(locale,  status);
                  skipTest = false;
                  charIdx += 5;
                  break;
              }
-            if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
+            if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
                  delete tp.bi;
                  tp.bi = BreakIterator::createSentenceInstance(locale,  status);
                  skipTest = false;
                  charIdx += 5;
                  break;
              }
-            if (testString.compare(charIdx-1, 7, "<title>") == 0) {
+            if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
                  delete tp.bi;
                  tp.bi = BreakIterator::createTitleInstance(locale,  status);
                  charIdx += 6;
                  break;
              }
  
+            if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
+                testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
+                charIdx = testString.indexOf(u'>', charIdx) + 1;
+                parseState = PARSE_RULES;
+                rules.remove();
+                rulesFirstLine = lineNum;
+                break;
+            }
+
              // <locale  loc_name>
              localeMatcher.reset(testString);
              if (localeMatcher.lookingAt(charIdx-1, status)) {
@@ -1261,7 +825,7 @@ void RBBITest::TestExtended() {
                  TEST_ASSERT_SUCCESS(status);
                  break;
              }
-            if (testString.compare(charIdx-1, 6, "<data>") == 0) {
+            if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
                  parseState = PARSE_DATA;
                  charIdx += 5;
                  tp.dataToBreak = "";
@@ -1278,6 +842,33 @@ void RBBITest::TestExtended() {
              }
              break;
  
+        case PARSE_RULES:
+            if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
+                charIdx += 7;
+                parseState = PARSE_TAG;
+                delete tp.bi;
+                UParseError pe;
+                tp.bi = new RuleBasedBreakIterator(rules, pe, status);
+                skipTest = U_FAILURE(status);
+                if (U_FAILURE(status)) {
+                    errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
+                        rulesFirstLine + pe.line - 1, u_errorName(status));
+                }
+            } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
+                charIdx += 10;
+                parseState = PARSE_TAG;
+                UErrorCode ec = U_ZERO_ERROR;
+                UParseError pe;
+                RuleBasedBreakIterator bi(rules, pe, ec);
+                if (U_SUCCESS(ec)) {
+                    errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
+                        rulesFirstLine + pe.line - 1);
+                }
+            } else {
+                rules.append(c);
+            }
+            break;
+
          case PARSE_DATA:
              if (c == u'\u2022') { // u'•'
                  int32_t  breakIdx = tp.dataToBreak.length();
@@ -1290,7 +881,7 @@ void RBBITest::TestExtended() {
                  break;
              }
  
-            if (testString.compare(charIdx-1, 7, "</data>") == 0) {
+            if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
                  // Add final entry to mappings from break location to source file position.
                  //  Need one extra because last break position returned is after the
                  //    last char in the data, not at the last char.
@@ -1316,7 +907,7 @@ void RBBITest::TestExtended() {
                  break;
              }
  
-            if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
+            if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
                  // Named character, e.g. \N{COMBINING GRAVE ACCENT}
                  // Get the code point from the name and insert it into the test data.
                  //   (Damn, no API takes names in Unicode  !!!
@@ -1355,8 +946,7 @@ void RBBITest::TestExtended() {
  
  
  
-
-            if (testString.compare(charIdx-1, 2, "<>") == 0) {
+            if (testString.compare(charIdx-1, 2, u"<>") == 0) {
                  charIdx++;
                  int32_t  breakIdx = tp.dataToBreak.length();
                  tp.expectedBreaks->setSize(breakIdx+1);
@@ -1477,6 +1067,18 @@ void RBBITest::TestExtended() {
  
      }
  
+    // Reached end of test file. Raise an error if parseState indicates that we are
+    //   within a block that should have been terminated.
+
+    if (parseState == PARSE_RULES) {
+        errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
+            lineNum, rulesFirstLine);
+    }
+    if (parseState == PARSE_DATA) {
+        errln("rbbitst.txt:%d <data> block not closed.", lineNum);
+    }
+
+
  end_test:
      delete [] testFile;
  #endif
@@ -1573,7 +1175,7 @@ UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char
      fileSize = ftell(f);
      fileBuf = new char[fileSize];
      fseek(f, 0, SEEK_SET);
-    amt_read = fread(fileBuf, 1, fileSize, f);
+    amt_read = static_cast<int>(fread(fileBuf, 1, fileSize, f));
      if (amt_read != fileSize || fileSize <= 0) {
          errln("Error reading test data file.");
          goto cleanUpAndReturn;
@@ -1681,35 +1283,57 @@ void RBBITest::TestUnicodeFiles() {
  
  
  // Check for test cases from the Unicode test data files that are known to fail
-// and should be skipped because ICU is not yet able to fully implement the spec.
-// See ticket #7270.
+// and should be skipped as known issues because ICU does not fully implement
+// the Unicode specifications, or because ICU includes tailorings that differ from
+// the Unicode standard.
+//
+// Test cases are identified by the test data sequence, which tends to be more stable
+// across Unicode versions than the test file line numbers.
+//
+// The test case with ticket "10666" is a dummy, included as an example.
  
  UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
      static struct TestCase {
+        const char *fTicketNum;
          const char *fFileName;
          const UChar *fString;
-    } badTestCases[] = {                                // Line Numbers from Unicode 7.0.0 file.
-        {"LineBreakTest.txt", u"\u200B\u0020}"},        // Line 5198
-        {"LineBreakTest.txt", u"\u200B\u0020)"},        // Line 5202
-        {"LineBreakTest.txt", u"\u200B\u0020!"},        // Line 5214
-        {"LineBreakTest.txt", u"\u200B\u0020,"},        // Line 5246
-        {"LineBreakTest.txt", u"\u200B\u0020/"},        // Line 5298
-        {"LineBreakTest.txt", u"\u200B\u0020\u2060"},   // Line 5302
-                                                        // Line Numbers from pre-release verion of GraphemeBreakTest-10.0.0.txt
-        {"GraphemeBreakTest.txt", u"\u200D\u2640"},     // Line 656, old GB 11 test ZWJ x GAZ
-        {"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
-        {"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
-
-                                                        // Line Numbers from pre-release verion of WordBreakTest-10.0.0.txt
-        {"WordBreakTest.txt", u"\u200D\u261D"},         // Line 1356, ZWJ x EmojiNRK
-        {"WordBreakTest.txt", u"\u200D\U0001F3FB"},     // Line 1358, ZWJ x EmojiNRK
+    } badTestCases[] = {
+        {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"},    // Fake example, for illustration.
+        // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
+        // This probably ultimately wants to be resolved by updating UAX-14, but in the meantime
+        // ICU is out of sync with Unicode.
+        {"8151",  "LineBreakTest.txt", u"-#"},
+        {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u0023"},
+        {"8151",  "LineBreakTest.txt", u"\u002d\u00a7"},
+        {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
+        {"8151",  "LineBreakTest.txt", u"\u002d\U00050005"},
+        {"8151",  "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
+        {"8151",  "LineBreakTest.txt", u"\u002d\u0e01"},
+        {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
+
+        // Issue ICU-12017 Improve line break around numbers
+        {"12017", "LineBreakTest.txt", u"\u002C\u0030"},   // ",0"
+        {"12017", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
+        {"12017", "LineBreakTest.txt", u"find .com"},
+        {"12017", "LineBreakTest.txt", u"equals .35 cents"},
+        {"12017", "LineBreakTest.txt", u"a.2 "},
+        {"12017", "LineBreakTest.txt", u"a.2 \u0915"},
+        {"12017", "LineBreakTest.txt", u"a.2 \u672C"},
+        {"12017", "LineBreakTest.txt", u"a.2\u3000\u672C"},
+        {"12017", "LineBreakTest.txt", u"a.2\u3000\u307E"},
+        {"12017", "LineBreakTest.txt", u"a.2\u3000\u0033"},
+        {"12017", "LineBreakTest.txt", u"A.1 \uBABB"},
+        {"12017", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
+        {"12017", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
+        {"12017", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
+        {"12017", "LineBreakTest.txt", u"a.2\u3000\u300C"},
      };
  
      for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
          const TestCase &badCase = badTestCases[n];
          if (!strcmp(fileName, badCase.fFileName) &&
                  testCase == UnicodeString(badCase.fString)) {
-            return logKnownIssue("7270");
+            return logKnownIssue(badCase.fTicketNum);
          }
      }
      return FALSE;
@@ -1957,34 +1581,6 @@ static uint32_t m_rand()
  }
  
  
-//
-// Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
-//
-static const char16_t *gExtended_Pict = u"["
-    "\\U0001F774-\\U0001F77F\\U00002700-\\U00002701\\U00002703-\\U00002704\\U0000270E\\U00002710-\\U00002711\\U00002765-\\U00002767"
-    "\\U0001F030-\\U0001F093\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
-    "\\U0001F260-\\U0001F265\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F25F"
-    "\\U0001F266-\\U0001F2FF\\U0001F7D5-\\U0001F7FF\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F"
-    "\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6"
-    "\\U0001F4FE\\U0001F53E-\\U0001F548\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586"
-    "\\U0001F588-\\U0001F589\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7"
-    "\\U0001F5A9-\\U0001F5B0\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB"
-    "\\U0001F5DF-\\U0001F5E0\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
-    "\\U00002605\\U00002607-\\U0000260D\\U0000260F-\\U00002610\\U00002612\\U00002616-\\U00002617\\U00002619-\\U0000261C"
-    "\\U0000261E-\\U0000261F\\U00002621\\U00002624-\\U00002625\\U00002627-\\U00002629\\U0000262B-\\U0000262D\\U00002630-\\U00002637"
-    "\\U0000263B-\\U00002647\\U00002654-\\U0000265F\\U00002661-\\U00002662\\U00002664\\U00002667\\U00002669-\\U0000267A"
-    "\\U0000267C-\\U0000267E\\U00002680-\\U00002691\\U00002695\\U00002698\\U0000269A\\U0000269D-\\U0000269F\\U000026A2-\\U000026A9"
-    "\\U000026AC-\\U000026AF\\U000026B2-\\U000026BC\\U000026BF-\\U000026C3\\U000026C6-\\U000026C7\\U000026C9-\\U000026CD"
-    "\\U000026D0\\U000026D2\\U000026D5-\\U000026E8\\U000026EB-\\U000026EF\\U000026F6\\U000026FB-\\U000026FC\\U000026FE-\\U000026FF"
-    "\\U00002388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5"
-    "\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F"
-    "\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF\\U0001F900-\\U0001F90B\\U0001F91F\\U0001F928-\\U0001F92F"
-    "\\U0001F931-\\U0001F932\\U0001F94C\\U0001F95F-\\U0001F96B\\U0001F992-\\U0001F997\\U0001F9D0-\\U0001F9E6\\U0001F90C-\\U0001F90F"
-    "\\U0001F93F\\U0001F94D-\\U0001F94F\\U0001F96C-\\U0001F97F\\U0001F998-\\U0001F9BF\\U0001F9C1-\\U0001F9CF\\U0001F9E7-\\U0001F9FF"
-    "\\U0001F6C6-\\U0001F6CA\\U0001F6D3-\\U0001F6D4\\U0001F6E6-\\U0001F6E8\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6F7-\\U0001F6F8"
-    "\\U0001F6D5-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F9-\\U0001F6FF"
-    "]";
-
  //------------------------------------------------------------------------------------------
  //
  //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
@@ -2014,11 +1610,7 @@ private:
      UnicodeSet  *fLVSet;
      UnicodeSet  *fLVTSet;
      UnicodeSet  *fHangulSet;
-    UnicodeSet  *fEmojiBaseSet;
-    UnicodeSet  *fEmojiModifierSet;
      UnicodeSet  *fExtendedPictSet;
-    UnicodeSet  *fEBGSet;
-    UnicodeSet  *fEmojiNRKSet;
      UnicodeSet  *fAnySet;
  
      const UnicodeString *fText;
@@ -2050,12 +1642,7 @@ RBBICharMonkey::RBBICharMonkey() {
      fHangulSet->addAll(*fLVSet);
      fHangulSet->addAll(*fLVTSet);
  
-    fEmojiBaseSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
-    fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
-    fExtendedPictSet  = new UnicodeSet(gExtended_Pict, status);
-    fEBGSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
-    fEmojiNRKSet      = new UnicodeSet(UNICODE_STRING_SIMPLE(
-                "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
+    fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
      fAnySet           = new UnicodeSet(0, 0x10ffff);
  
      fSets             = new UVector(status);
@@ -2069,12 +1656,8 @@ RBBICharMonkey::RBBICharMonkey() {
      fSets->addElement(fSpacingSet, status);
      fSets->addElement(fHangulSet,  status);
      fSets->addElement(fAnySet,     status);
-    fSets->addElement(fEmojiBaseSet, status);
-    fSets->addElement(fEmojiModifierSet, status);
      fSets->addElement(fZWJSet,     status);
      fSets->addElement(fExtendedPictSet, status);
-    fSets->addElement(fEBGSet,     status);
-    fSets->addElement(fEmojiNRKSet,status);
      if (U_FAILURE(status)) {
          deferredStatus = status;
      }
@@ -2194,22 +1777,8 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
              continue;
          }
  
-        // Rule (GB10)   (Emoji_Base | EBG) Extend * x Emoji_Modifier
-        if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
-            continue;
-        }
-        if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) &&
-                fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
-            continue;
-        }
-
-        // Rule (GB11)   (Glue_After_ZWJ | Emoji) Extend * ZWJ x (Glue_After_ZWJ | Emoji)
-        if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
-                (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
-            continue;
-        }
-        if ((fExtendedPictSet->contains(cBase) || fEmojiNRKSet->contains(cBase)) && fExtendSet->contains(c0) && fZWJSet->contains(c1) &&
-                (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
+        // Rule (GB11)   Extended_Pictographic Extend * ZWJ x Extended_Pictographic
+        if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
              continue;
          }
  
@@ -2218,7 +1787,7 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
          //                      a break if there are three or more contiguous RIs. If there are
          //                      only two, a break following will occur via other rules, and will include
          //                      any trailing extend characters, which is needed behavior.
-        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1) 
+        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
                  && fRegionalIndicatorSet->contains(c2)) {
              break;
          }
@@ -2256,12 +1825,8 @@ RBBICharMonkey::~RBBICharMonkey() {
      delete fLVTSet;
      delete fHangulSet;
      delete fAnySet;
-    delete fEmojiBaseSet;
-    delete fEmojiModifierSet;
      delete fZWJSet;
      delete fExtendedPictSet;
-    delete fEBGSet;
-    delete fEmojiNRKSet;
  }
  
  //------------------------------------------------------------------------------------------
@@ -2297,13 +1862,10 @@ private:
      UnicodeSet  *fOtherSet;
      UnicodeSet  *fExtendSet;
      UnicodeSet  *fExtendNumLetSet;
+    UnicodeSet  *fWSegSpaceSet;
      UnicodeSet  *fDictionarySet;
-    UnicodeSet  *fEBaseSet;
-    UnicodeSet  *fEBGSet;
-    UnicodeSet  *fEModifierSet;
      UnicodeSet  *fZWJSet;
      UnicodeSet  *fExtendedPictSet;
-    UnicodeSet  *fEmojiNRKSet;
  
      const UnicodeString  *fText;
  };
@@ -2327,18 +1889,14 @@ RBBIWordMonkey::RBBIWordMonkey()
      fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
      fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]",    status);
      fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
-    fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]",      status);
+    fNumericSet       = new UnicodeSet(u"[[\\p{Word_Break = Numeric}][\\uff10-\\uff19]]", status);
      fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
      fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
      fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}]",       status);
+    fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);
  
-    fEBaseSet         = new UnicodeSet(u"[[\\p{Word_Break = EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]", status);
-    fEBGSet           = new UnicodeSet(u"[\\p{Word_Break = EBG}]",          status);
-    fEModifierSet     = new UnicodeSet(u"[\\p{Word_Break = EM}]",           status);
      fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
-    fExtendedPictSet  = new UnicodeSet(gExtended_Pict, status);
-    fEmojiNRKSet      = new UnicodeSet(
-            u"[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]", status);
+    fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
  
      fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
      fDictionarySet->addAll(*fKatakanaSet);
@@ -2366,15 +1924,12 @@ RBBIWordMonkey::RBBIWordMonkey()
      fOtherSet->removeAll(*fMidNumSet);
      fOtherSet->removeAll(*fNumericSet);
      fOtherSet->removeAll(*fExtendNumLetSet);
+    fOtherSet->removeAll(*fWSegSpaceSet);
      fOtherSet->removeAll(*fFormatSet);
      fOtherSet->removeAll(*fExtendSet);
      fOtherSet->removeAll(*fRegionalIndicatorSet);
-    fOtherSet->removeAll(*fEBaseSet);
-    fOtherSet->removeAll(*fEBGSet);
-    fOtherSet->removeAll(*fEModifierSet);
      fOtherSet->removeAll(*fZWJSet);
      fOtherSet->removeAll(*fExtendedPictSet);
-    fOtherSet->removeAll(*fEmojiNRKSet);
  
      // Inhibit dictionary characters from being tested at all.
      fOtherSet->removeAll(*fDictionarySet);
@@ -2398,13 +1953,10 @@ RBBIWordMonkey::RBBIWordMonkey()
      fSets->addElement(fExtendSet,            status);
      fSets->addElement(fOtherSet,             status);
      fSets->addElement(fExtendNumLetSet,      status);
+    fSets->addElement(fWSegSpaceSet,         status);
  
-    fSets->addElement(fEBaseSet,             status);
-    fSets->addElement(fEBGSet,               status);
-    fSets->addElement(fEModifierSet,         status);
      fSets->addElement(fZWJSet,               status);
      fSets->addElement(fExtendedPictSet,      status);
-    fSets->addElement(fEmojiNRKSet,          status);
  
      if (U_FAILURE(status)) {
          deferredStatus = status;
@@ -2483,12 +2035,17 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
              break;
          };
  
-        // Rule (3c)    ZWJ x (Glue_after_ZWJ | EmojiNRK).
+        // Rule (3c)    ZWJ x Extended_Pictographic
          //              Not ignoring extend chars, so peek into input text to
          //              get the potential ZWJ, the character immediately preceding c2.
          //              Sloppy UChar32 indexing: p2-1 may reference trail half
          //              but char32At will get the full code point.
-        if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
+        if (fZWJSet->contains(fText->char32At(p2-1)) && fExtendedPictSet->contains(c2)) {
+            continue;
+        }
+
+        // Rule (3d)    Keep horizontal whitespace together.
+        if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
              continue;
          }
  
@@ -2582,11 +2139,6 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
              continue;
          }
  
-        // WB 14  (E_Base | EBG) x E_Modifier
-        if ((fEBaseSet->contains(c1)  || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) {
-            continue;
-        }
-
          // Rule 15 - 17   Group pairs of Regional Indicators.
          if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
              break;
@@ -2626,15 +2178,12 @@ RBBIWordMonkey::~RBBIWordMonkey() {
      delete fFormatSet;
      delete fExtendSet;
      delete fExtendNumLetSet;
+    delete fWSegSpaceSet;
      delete fRegionalIndicatorSet;
      delete fDictionarySet;
      delete fOtherSet;
-    delete fEBaseSet;
-    delete fEBGSet;
-    delete fEModifierSet;
      delete fZWJSet;
      delete fExtendedPictSet;
-    delete fEmojiNRKSet;
  }
  
  
@@ -2996,6 +2545,7 @@ private:
      UnicodeSet  *fB2;
      UnicodeSet  *fBA;
      UnicodeSet  *fBB;
+    UnicodeSet  *fHH;
      UnicodeSet  *fHY;
      UnicodeSet  *fH2;
      UnicodeSet  *fH3;
@@ -3023,9 +2573,7 @@ private:
      UnicodeSet  *fXX;
      UnicodeSet  *fEB;
      UnicodeSet  *fEM;
-    UnicodeSet  *fZJ;
-    UnicodeSet  *fExtendedPict;
-    UnicodeSet  *fEmojiNRK;
+    UnicodeSet  *fZWJ;
  
      BreakIterator        *fCharBI;
      const UnicodeString  *fText;
@@ -3062,6 +2610,7 @@ RBBILineMonkey::RBBILineMonkey() :
      fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
      fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
      fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
+    fHH    = new UnicodeSet();
      fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
      fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
      fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
@@ -3090,9 +2639,7 @@ RBBILineMonkey::RBBILineMonkey() :
      fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
      fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
      fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
-    fZJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
-    fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
-    fExtendedPict = new UnicodeSet(gExtended_Pict, status);
+    fZWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
  
      if (U_FAILURE(status)) {
          deferredStatus = status;
@@ -3104,7 +2651,9 @@ RBBILineMonkey::RBBILineMonkey() :
      fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
  
      fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
-    fCM->addAll(*fZJ);     // ZWJ behaves as a CM.
+    fCM->addAll(*fZWJ);    // ZWJ behaves as a CM.
+
+    fHH->add(u'\u2010');   // Hyphen, '‐'
  
      fSets->addElement(fBK, status);
      fSets->addElement(fCR, status);
@@ -3146,14 +2695,13 @@ RBBILineMonkey::RBBILineMonkey() :
      fSets->addElement(fSG, status);
      fSets->addElement(fEB, status);
      fSets->addElement(fEM, status);
-    fSets->addElement(fZJ, status);
-    fSets->addElement(fExtendedPict, status);
-    fSets->addElement(fEmojiNRK, status);
+    fSets->addElement(fZWJ, status);
  
  
      const char *rules =
              "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
              "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
+            "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
              "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
              "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
              "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
@@ -3332,18 +2880,50 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
          }
  
          // LB 8  Break after zero width space
-        if (fZW->contains(prevChar)) {
+        //       ZW SP* ÷
+        //       Scan backwards from prevChar for SP* ZW
+        tPos = prevPos;
+        while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
+            tPos = fText->moveIndex32(tPos, -1);
+        }
+        if (fZW->contains(fText->char32At(tPos))) {
              break;
          }
  
-        // LB 8a ZWJ x (ID | ExtendedPict | Emoji)
+        // LB 25    Numbers
+        //          Move this test up, before LB8a, because numbers can match a longer sequence that would
+        //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
+        if (fNumberMatcher->lookingAt(prevPos, status)) {
+            if (U_FAILURE(status)) {
+                break;
+            }
+            // Matched a number.  But could have been just a single digit, which would
+            //    not represent a "no break here" between prevChar and thisChar
+            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
+            if (numEndIdx > pos) {
+                // Number match includes at least our two chars being checked
+                if (numEndIdx > nextPos) {
+                    // Number match includes additional chars.  Update pos and nextPos
+                    //   so that next loop iteration will continue at the end of the number,
+                    //   checking for breaks between last char in number & whatever follows.
+                    pos = nextPos = numEndIdx;
+                    do {
+                        pos = fText->moveIndex32(pos, -1);
+                        thisChar = fText->char32At(pos);
+                    } while (fCM->contains(thisChar));
+                }
+                continue;
+            }
+        }
+
+        // LB 8a ZWJ x
          //       The monkey test's way of ignoring combining characters doesn't work
          //       for this rule. ZJ is also a CM. Need to get the actual character
          //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
          {
              int32_t prevIdx = fText->moveIndex32(pos, -1);
              UChar32 prevC = fText->char32At(prevIdx);
-            if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
+            if (fZWJ->contains(prevC)) {
                  continue;
              }
          }
@@ -3374,17 +2954,12 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
              continue;
          }
  
-
-
          // LB 13  Don't break before closings.
-        //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
-        //        fall into LB 17 and the more general number regular expression.
          //
-        if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
-            (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
-                                         fEX->contains(thisChar)  ||
-            (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
-            (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
+        if (fCL->contains(thisChar) ||
+                fCP->contains(thisChar) ||
+                fEX->contains(thisChar) ||
+                fSY->contains(thisChar)) {
              continue;
          }
  
@@ -3392,7 +2967,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
          //       Scan backwards, checking for this sequence.
          //       The OP char could include combining marks, so we actually check for
          //           OP CM* SP*
-        //       Another Twist: The Rule 67 fixes may have changed a SP CM
+        //       Another Twist: The Rule 9 fixes may have changed a SP CM
          //       sequence into a ID char, so before scanning back through spaces,
          //       verify that prevChar is indeed a space.  The prevChar variable
          //       may differ from fText[prevPos]
@@ -3410,6 +2985,21 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
          }
  
  
+        // LB 14a Break before an IS that begins a number and follows a space
+        if (nextPos < fText->length()) {
+            // note: UnicodeString::char32At(length) returns ffff, not distinguishable
+            //       from a legit ffff character. So test length separately.
+            UChar32 nextChar = fText->char32At(nextPos);
+            if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
+                break;
+            }
+        }
+
+        // LB 14b Do not break before numeric separators, even after spaces.
+        if (fIS->contains(thisChar)) {
+            continue;
+        }
+
          // LB 15    QU SP* x OP
          if (fOP->contains(thisChar)) {
              // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
@@ -3478,6 +3068,15 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
              break;
          }
  
+        // LB 20.09  Don't break between Hyphens and letters if a break precedes the hyphen.
+        //           Formerly this was a Finnish tailoring.
+        //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
+        //    ^($HY | $HH) $AL;
+        if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
+                prevPosX2 == -1) {
+            continue;
+        }
+
          // LB 21
          if (fBA->contains(thisChar) ||
              fHY->contains(thisChar) ||
@@ -3522,11 +3121,11 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
          // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
          //      PR x (ID | EB | EM)
          //     (ID | EB | EM) x PO
-        if (fPR->contains(prevChar) && 
+        if (fPR->contains(prevChar) &&
                  (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
              continue;
          }
-        if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && 
+        if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
                  fPO->contains(thisChar)) {
              continue;
          }
@@ -3543,32 +3142,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
              continue;
          }
  
-
-
-        // LB 25    Numbers
-        if (fNumberMatcher->lookingAt(prevPos, status)) {
-            if (U_FAILURE(status)) {
-                break;
-            }
-            // Matched a number.  But could have been just a single digit, which would
-            //    not represent a "no break here" between prevChar and thisChar
-            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
-            if (numEndIdx > pos) {
-                // Number match includes at least our two chars being checked
-                if (numEndIdx > nextPos) {
-                    // Number match includes additional chars.  Update pos and nextPos
-                    //   so that next loop iteration will continue at the end of the number,
-                    //   checking for breaks between last char in number & whatever follows.
-                    pos = nextPos = numEndIdx;
-                    do {
-                        pos = fText->moveIndex32(pos, -1);
-                        thisChar = fText->char32At(pos);
-                    } while (fCM->contains(thisChar));
-                }
-                continue;
-            }
-        }
-
+        // LB 25 number matching was moved up; it is now tested before LB 8a.
  
          // LB 26 Do not break a Korean syllable.
          if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
@@ -3626,12 +3200,16 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
              continue;
          }
  
-        // LB30a    RI RI <break> RI
-        //             RI    x    RI
+        // LB30a    RI RI  ÷  RI
+        //             RI  x  RI
          if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
              break;
          }
          if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
+            // Two Regional Indicators have been paired.
+            // Overwrite the trailing one (thisChar) to prevent it from forming another pair with a
+            // following RI. This is a hack.
+            thisChar = -1;
              continue;
          }
  
@@ -3670,6 +3248,7 @@ RBBILineMonkey::~RBBILineMonkey() {
      delete fB2;
      delete fBA;
      delete fBB;
+    delete fHH;
      delete fHY;
      delete fH2;
      delete fH3;
@@ -3698,9 +3277,7 @@ RBBILineMonkey::~RBBILineMonkey() {
      delete fXX;
      delete fEB;
      delete fEM;
-    delete fZJ;
-    delete fExtendedPict;
-    delete fEmojiNRK;
+    delete fZWJ;
  
      delete fCharBI;
      delete fNumberMatcher;
@@ -3762,16 +3339,16 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
      for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
          forward[count] = i;
          if (count < expectedcount && expected[count] != i) {
-            test->errln("break forward test failed: expected %d but got %d",
-                        expected[count], i);
+            test->errln("%s:%d break forward test failed: expected %d but got %d",
+                        __FILE__, __LINE__, expected[count], i);
              break;
          }
          count ++;
      }
      if (count != expectedcount) {
          printStringBreaks(ustr, expected, expectedcount);
-        test->errln("break forward test failed: missed %d match",
-                    expectedcount - count);
+        test->errln("%s:%d break forward test failed: missed %d match",
+                    __FILE__, __LINE__, expectedcount - count);
          return;
      }
      // testing boundaries
@@ -3779,13 +3356,15 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
          int j = expected[i - 1];
          if (!bi->isBoundary(j)) {
              printStringBreaks(ustr, expected, expectedcount);
-            test->errln("isBoundary() failed.  Expected boundary at position %d", j);
+            test->errln("%s:%d isBoundary() failed.  Expected boundary at position %d",
+                    __FILE__, __LINE__, j);
              return;
          }
          for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
              if (bi->isBoundary(j)) {
                  printStringBreaks(ustr, expected, expectedcount);
-                test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
+                test->errln("%s:%d isBoundary() failed.  Not expecting boundary at position %d",
+                    __FILE__, __LINE__, j);
                  return;
              }
          }
@@ -3795,8 +3374,8 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
          count --;
          if (forward[count] != i) {
              printStringBreaks(ustr, expected, expectedcount);
-            test->errln("happy break test previous() failed: expected %d but got %d",
-                        forward[count], i);
+            test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
+                        __FILE__, __LINE__, forward[count], i);
              break;
          }
      }
@@ -3811,9 +3390,12 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
          // int j = expected[i] + 1;
          int j = ustr.moveIndex32(expected[i], 1);
          for (; j <= expected[i + 1]; j ++) {
-            if (bi->preceding(j) != expected[i]) {
+            int32_t expectedPreceding = expected[i];
+            int32_t actualPreceding = bi->preceding(j);
+            if (actualPreceding != expectedPreceding) {
                  printStringBreaks(ustr, expected, expectedcount);
-                test->errln("preceding(): Not expecting boundary at position %d", j);
+                test->errln("%s:%d preceding(%d): expected %d, got %d",
+                        __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
                  return;
              }
          }
@@ -3905,7 +3487,12 @@ void RBBITest::TestWordBoundary(void)
      Locale        locale("en");
      UErrorCode    status = U_ZERO_ERROR;
      // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
-    BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
+    LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
+    if (U_FAILURE(status)) {
+        errcheckln(status, "%s:%d Creation of break iterator failed %s",
+                __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
      UChar         str[50];
      static const char *strlist[] =
      {
@@ -3940,43 +3527,44 @@ void RBBITest::TestWordBoundary(void)
      "\\u003b\\u0027\\u00b7\\u47a3",
      };
      int loop;
-    if (U_FAILURE(status)) {
-        errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
-        return;
-    }
      for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
-        // printf("looping %d\n", loop);
-        u_unescape(strlist[loop], str, 20);
+        u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
          UnicodeString ustr(str);
          int forward[50];
          int count = 0;
  
          bi->setText(ustr);
-        int prev = 0;
-        int i;
-        for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
-            forward[count ++] = i;
-            if (i > prev) {
-                int j;
-                for (j = prev + 1; j < i; j ++) {
-                    if (bi->isBoundary(j)) {
-                        printStringBreaks(ustr, forward, count);
-                        errln("happy boundary test failed: expected %d not a boundary",
-                               j);
-                        return;
-                    }
+        int prev = -1;
+        for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
+            ++count;
+            if (count >= UPRV_LENGTHOF(forward)) {
+                errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
+                        __FILE__, __LINE__, loop, count, boundary);
+                return;
+            }
+            forward[count] = boundary;
+            if (boundary <= prev) {
+                errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
+                        __FILE__, __LINE__, loop, prev, boundary);
+                break;
+            }
+            for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
+                if (bi->isBoundary(nonBoundary)) {
+                    printStringBreaks(ustr, forward, count);
+                    errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
+                           __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
+                    return;
                  }
              }
-            if (!bi->isBoundary(i)) {
+            if (!bi->isBoundary(boundary)) {
                  printStringBreaks(ustr, forward, count);
-                errln("happy boundary test failed: expected %d a boundary",
-                       i);
+                errln("%s:%d happy boundary test failed: expected %d a boundary",
+                       __FILE__, __LINE__, boundary);
                  return;
              }
-            prev = i;
+            prev = boundary;
          }
      }
-    delete bi;
  }
  
  void RBBITest::TestLineBreaks(void)
@@ -4709,6 +4297,7 @@ void RBBITest::TestBug12932() {
  // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
  //             remain undevided by ICU char, word and line break.
  void RBBITest::TestEmoji() {
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS
      UErrorCode  status = U_ZERO_ERROR;
  
      CharString testFileName;
@@ -4789,15 +4378,255 @@ void RBBITest::TestEmoji() {
              }
          }
      }
+#endif
+}
+
+
+// TestBug12519  -  Correct handling of Locales by assignment / copy / clone
+
+void RBBITest::TestBug12519() {
+    UErrorCode status = U_ZERO_ERROR;
+    LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
+    LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
+    if (!assertSuccess(WHERE, status)) {
+        dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+    assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
+
+    assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
+    assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
+
+    LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone());
+    assertTrue(WHERE, *biEn == *cloneEn);
+    assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
+
+    LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone());
+    assertTrue(WHERE, *biFr == *cloneFr);
+    assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
+
+    LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
+    UnicodeString text("Hallo Welt");
+    biDe->setText(text);
+    assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
+    *biDe = *biFr;
+    assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
+}
+
+void RBBITest::TestBug12677() {
+    // Check that stripping of comments from rules for getRules() is not confused by
+    // the presence of '#' characters in the rules that do not introduce comments.
+    UnicodeString rules(u"!!forward; \n"
+                         "$x = [ab#];  # a set with a # literal. \n"
+                         " # .;        # a comment that looks sort of like a rule.   \n"
+                         " '#' '?';    # a rule with a quoted #   \n"
+                       );
+
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError pe;
+    RuleBasedBreakIterator bi(rules, pe, status);
+    assertSuccess(WHERE, status);
+    UnicodeString rtRules = bi.getRules();
+    assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "),  rtRules);
+}
+
+
+void RBBITest::TestTableRedundancies() {
+    UErrorCode status = U_ZERO_ERROR;
+
+    LocalPointer<RuleBasedBreakIterator> bi (
+        (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
+    assertSuccess(WHERE, status);
+    if (U_FAILURE(status)) return;
+
+    RBBIDataWrapper *dw = bi->fData;
+    const RBBIStateTable *fwtbl = dw->fForwardTable;
+    int32_t numCharClasses = dw->fHeader->fCatCount;
+    // printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
+
+    // Check for duplicate columns (character categories)
+
+    std::vector<UnicodeString> columns;
+    for (int32_t column = 0; column < numCharClasses; column++) {
+        UnicodeString s;
+        for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
+            RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
+            s.append(row->fNextState[column]);
+        }
+        columns.push_back(s);
+    }
+    // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
+    for (int c1=1; c1<numCharClasses; c1++) {
+        for (int c2 = c1+1; c2 < numCharClasses; c2++) {
+            if (columns.at(c1) == columns.at(c2)) {
+                errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
+                goto out;
+            }
+        }
+    }
+  out:
+
+    // Check for duplicate states
+    std::vector<UnicodeString> rows;
+    for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
+        UnicodeString s;
+        RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
+        assertTrue(WHERE, row->fAccepting >= -1);
+        s.append(row->fAccepting + 1);   // values of -1 are expected.
+        s.append(row->fLookAhead);
+        s.append(row->fTagIdx);
+        for (int32_t column = 0; column < numCharClasses; column++) {
+            s.append(row->fNextState[column]);
+        }
+        rows.push_back(s);
+    }
+    for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
+        for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
+            if (rows.at(r1) == rows.at(r2)) {
+                errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
+                return;
+            }
+        }
+    }
+}
+
+// Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
+//            even after next() has returned DONE.
+
+void RBBITest::TestBug13447() {
+    UErrorCode status = U_ZERO_ERROR;
+    LocalPointer<RuleBasedBreakIterator> bi(
+        (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
+    assertSuccess(WHERE, status);
+    if (U_FAILURE(status)) return;
+    UnicodeString data(u"1234");
+    bi->setText(data);
+    assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
+    assertEquals(WHERE, 4, bi->next());
+    assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
+    assertEquals(WHERE, UBRK_DONE, bi->next());
+    assertEquals(WHERE, 4, bi->current());
+    assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
+}
+
+//  TestReverse exercises both the synthesized safe reverse rules and the logic
+//  for filling the break iterator cache when starting from random positions
+//  in the text.
+//
+//  It's a monkey test, working on random data, with the expected data obtained
+//  from forward iteration (no safe rules involved), comparing with results
+//  when indexing into the interior of the string (safe rules needed).
+
+void RBBITest::TestReverse() {
+    UErrorCode status = U_ZERO_ERROR;
+
+    TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
+            BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
+    assertSuccess(WHERE, status, true);
+    status = U_ZERO_ERROR;
+    TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
+            BreakIterator::createWordInstance(Locale::getEnglish(), status)));
+    assertSuccess(WHERE, status, true);
+    status = U_ZERO_ERROR;
+    TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
+            BreakIterator::createLineInstance(Locale::getEnglish(), status)));
+    assertSuccess(WHERE, status, true);
+    status = U_ZERO_ERROR;
+    TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
+            BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
+    assertSuccess(WHERE, status, true);
+}
+
+void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
+    if (!bi) {
+        return;
+    }
+
+    // From the mapping trie in the break iterator's internal data, create a
+    // vector of UnicodeStrings, one for each character category, containing
+    // all of the code points that map to that category. Unicode planes 0 and 1 only,
+    // to avoid an excess of unassigned code points.
+
+    RBBIDataWrapper *data = bi->fData;
+    int32_t categoryCount = data->fHeader->fCatCount;
+    UTrie2  *trie = data->fTrie;
+
+    std::vector<UnicodeString> strings(categoryCount, UnicodeString());
+    for (int cp=0; cp<0x1fff0; ++cp) {
+        int cat = utrie2_get32(trie, cp);
+        cat &= ~0x4000;    // And off the dictionary bit from the category.
+        assertTrue(WHERE, cat < categoryCount && cat >= 0);
+        if (cat < 0 || cat >= categoryCount) return;
+        strings[cat].append(cp);
+    }
+
+    icu_rand randomGen;
+    const int testStringLength = 10000;
+    UnicodeString testString;
+
+    for (int i=0; i<testStringLength; ++i) {
+        int charClass = randomGen() % categoryCount;
+        if (strings[charClass].length() > 0) {
+            int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
+            testString.append(cp);
+        }
+    }
+
+    typedef std::pair<UBool, int32_t> Result;
+    std::vector<Result> expectedResults;
+    bi->setText(testString);
+    for (int i=0; i<testString.length(); ++i) {
+        bool isboundary = bi->isBoundary(i);
+        int  ruleStatus = bi->getRuleStatus();
+        expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
+    }
+
+    for (int i=testString.length()-1; i>=0; --i) {
+        bi->setText(testString);   // clears the internal break cache
+        Result expected = expectedResults[i];
+        assertEquals(WHERE, expected.first, bi->isBoundary(i));
+        assertEquals(WHERE, expected.second, bi->getRuleStatus());
+    }
  }
  
  
+// Ticket 13692 - finding word boundaries in very large numbers or words could
+//                be very time consuming. When the problem was present, this test
+//                would run for more than fifteen minutes, which is to say, the failure was noticeable.
+
+void RBBITest::TestBug13692() {
+    UErrorCode status = U_ZERO_ERROR;
+    LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
+            BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
+    if (!assertSuccess(WHERE, status, true)) {
+        return;
+    }
+    constexpr int32_t LENGTH = 1000000;
+    UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
+    for (int i=0; i<20; i+=2) {
+        longNumber.setCharAt(i, u' ');
+    }
+    bi->setText(longNumber);
+    assertFalse(WHERE, bi->isBoundary(LENGTH-5));
+    assertSuccess(WHERE, status);
+}
+
  //
  //  TestDebug    -  A place-holder test for debugging purposes.
  //                  For putting in fragments of other tests that can be invoked
  //                  for tracing  without a lot of unwanted extra stuff happening.
  //
  void RBBITest::TestDebug(void) {
+    UErrorCode status = U_ZERO_ERROR;
+    LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
+            BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
+    if (!assertSuccess(WHERE, status, true)) {
+        return;
+    }
+    const UnicodeString &rules = bi->getRules();
+    UParseError pe;
+    LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
+    assertSuccess(WHERE, status);
  }
  
  void RBBITest::TestProperties() {
@@ -4811,4 +4640,4 @@ void RBBITest::TestProperties() {
      }
  }
  
-#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
+#endif // #if !UCONFIG_NO_BREAK_ITERATION