ICU-66108.tar.gz

[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
diff --git a/icuSources/test/intltest/rbbitst.cpp b/icuSources/test/intltest/rbbitst.cpp

index ebd123640f669d458c3ded7de93088d854ddfc52..7bb8d4786feaeb880d39a5e813db0a3c8fdd1fc6 100644 (file)
--- a/icuSources/test/intltest/rbbitst.cpp
+++ b/icuSources/test/intltest/rbbitst.cpp
@@ -46,16 +46,31 @@
  #include "uvector.h"
  #include "uvectr32.h"
  
+// Needed for Apple perf tests <rdar://problem/51193810>
+#include <unistd.h>
+#include <mach/mach_time.h>
+
  
  #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
  #include "unicode/filteredbrk.h"
  #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
  
-#define TEST_ASSERT(x) {if (!(x)) { \
-    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
-
-#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
-    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
+#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { /* When x is false, log this file and line through errln(). */ \
+    if (!(x)) { \
+        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
+    } \
+} UPRV_BLOCK_MACRO_END
+// TEST_ASSERT_SUCCESS(errcode): if U_FAILURE(errcode), log file, line and u_errorName(errcode) via errcheckln().
+#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
+    if (U_FAILURE(errcode)) { \
+        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
+    } \
+} UPRV_BLOCK_MACRO_END
+// MONKEY_ERROR: log msg and index via IntlTest::gTest->errln(), plus the type=/seed= parameters to reproduce.
+#define MONKEY_ERROR(msg, fRuleFileName, index, seed) { /* NOTE(review): plain braces, unlike the UPRV_BLOCK_MACRO_* wrappers of the TEST_ASSERT macros - confirm intentional. */ \
+    IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
+                    __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
+}
  
  //---------------------------------------------
  // runIndexedTest
@@ -736,7 +751,13 @@ void RBBITest::TestExtended() {
      int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
  
      UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
-    int32_t             rulesFirstLine;  // Line number of the start of current <rules> block
+    int32_t             rulesFirstLine = 0;  // Line number of the start of current <rules> block
+
+    // <rdar://problem/51193810>
+    mach_timebase_info_data_t info;
+    uint64_t start, durationOpen = 0.0, durationUse = 0.0;
+    mach_timebase_info(&info);
+    UBool isLine = FALSE;
  
      for (charIdx = 0; charIdx < len; ) {
          status = U_ZERO_ERROR;
@@ -775,6 +796,7 @@ void RBBITest::TestExtended() {
                  tp.bi = BreakIterator::createWordInstance(locale,  status);
                  skipTest = false;
                  charIdx += 5;
+                isLine = FALSE;
                  break;
              }
              if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
@@ -782,13 +804,17 @@ void RBBITest::TestExtended() {
                  tp.bi = BreakIterator::createCharacterInstance(locale,  status);
                  skipTest = false;
                  charIdx += 5;
+                isLine = FALSE;
                  break;
              }
              if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
                  delete tp.bi;
+                start = mach_absolute_time(); // <rdar://problem/51193810>
                  tp.bi = BreakIterator::createLineInstance(locale,  status);
+                durationOpen += (((mach_absolute_time() - start) * info.numer)/info.denom);
                  skipTest = false;
                  charIdx += 5;
+                isLine = TRUE;
                  break;
              }
              if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
@@ -802,6 +828,7 @@ void RBBITest::TestExtended() {
                  delete tp.bi;
                  tp.bi = BreakIterator::createTitleInstance(locale,  status);
                  charIdx += 6;
+                isLine = FALSE;
                  break;
              }
  
@@ -811,6 +838,7 @@ void RBBITest::TestExtended() {
                  parseState = PARSE_RULES;
                  rules.remove();
                  rulesFirstLine = lineNum;
+                isLine = FALSE;
                  break;
              }
  
@@ -895,7 +923,11 @@ void RBBITest::TestExtended() {
                      // RUN THE TEST!
                      status = U_ZERO_ERROR;
                      tp.setUTF16(status);
+                    start = mach_absolute_time(); // <rdar://problem/51193810>
                      executeTest(&tp, status);
+                    if (isLine) {
+                        durationUse += (((mach_absolute_time() - start) * info.numer)/info.denom);
+                    }
                      TEST_ASSERT_SUCCESS(status);
  
                      // Run again, this time with UTF-8 text wrapped in a UText.
@@ -1078,6 +1110,10 @@ void RBBITest::TestExtended() {
          errln("rbbitst.txt:%d <data> block not closed.", lineNum);
      }
  
+    //
+    infoln("TestExtended total time in createLineInstance     (nsec):\t%llu\n", durationOpen);
+    infoln("TestExtended total time in linebreak test execute (nsec):\t%llu\n", durationUse);
+
  
  end_test:
      delete [] testFile;
@@ -1237,7 +1273,7 @@ cleanUpAndReturn:
          delete []retPtr;
          retPtr = 0;
          ulen   = 0;
-    };
+    }
      return retPtr;
  }
  
@@ -1398,14 +1434,14 @@ void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *
      //
      int spin = 0;
      while (tokenMatcher.find()) {
-       if(tokenMatcher.hitEnd()) {
+        if(tokenMatcher.hitEnd()) {
            /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
               This occurred when the text file was corrupt (wasn't marked as UTF-8)
               and caused an infinite loop here on EBCDIC systems!
            */
            fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
-          //      return;
-       }
+          //       return;
+        }
          if (tokenMatcher.start(1, status) >= 0) {
              // Scanned a divide sign, indicating a break position in the test data.
              if (testString.length()>0) {
@@ -1546,14 +1582,30 @@ public:
      // Return -1 after reaching end of string.
      virtual  int32_t   next(int32_t i) = 0;
  
+    // Name of each character class, parallel with charClasses. Used for debugging output
+    // of characters.
+    virtual  std::vector<std::string>&     characterClassNames();
+
+    void setAppliedRule(int32_t position, const char* value);
+
+    std::string getAppliedRule(int32_t position);
+
      virtual ~RBBIMonkeyKind();
-    UErrorCode       deferredStatus;
+    UErrorCode deferredStatus;
  
+    std::string classNameFromCodepoint(const UChar32 c);
+    unsigned int maxClassNameSize();
  
-protected:
-    RBBIMonkeyKind();
+ protected:
+     RBBIMonkeyKind();
+     std::vector<std::string> classNames;
+     std::vector<std::string> appliedRules;
+
+    // Clear `appliedRules` and fill it with empty strings in the size of test text.
+    void prepareAppliedRules(int32_t size );
+
+ private:
  
-private:
  };
  
  RBBIMonkeyKind::RBBIMonkeyKind() {
@@ -1563,6 +1615,45 @@ RBBIMonkeyKind::RBBIMonkeyKind() {
  RBBIMonkeyKind::~RBBIMonkeyKind() {
  }
  
+std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
+    return classNames;
+}
+// Discard every recorded rule and leave appliedRules holding size + 1 empty strings,
+// i.e. one slot for each index 0..size of the test text.
+void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
+    // assign() replaces the previous contents in a single step.
+    appliedRules.assign(size + 1, std::string());
+}
+// Store value as the rule text for index position; position indexes appliedRules without a bounds check.
+void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
+    appliedRules[position] = value;
+}
+// Return a copy of the rule text stored at index position; no bounds check is performed here.
+std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
+    return appliedRules[position];
+}
+
+std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
+    // Simply iterate through charClasses to find character's class
+    for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
+        UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
+        if (classSet->contains(c)) {
+            return classNames[aClassNum];
+        }
+    }
+    U_ASSERT(FALSE);  // This should not happen.
+    return "bad class name";
+}
+// Return the length of the longest name among the first charClasses()->size() entries of classNames.
+unsigned int RBBIMonkeyKind::maxClassNameSize() {
+    unsigned int longest = 0;  // Stays 0 when there are no classes.
+    for (int32_t idx = 0; idx < charClasses()->size(); ++idx) {
+        // Compare at the string's own size type; narrow only when storing the new maximum.
+        const std::string::size_type nameLen = classNames[idx].size();
+        longest = (nameLen > longest) ? static_cast<unsigned int>(nameLen) : longest;
+    }
+    return longest;
+}
  
  //----------------------------------------------------------------------------------------
  //
@@ -1611,6 +1702,9 @@ private:
      UnicodeSet  *fLVTSet;
      UnicodeSet  *fHangulSet;
      UnicodeSet  *fExtendedPictSet;
+    UnicodeSet  *fViramaSet;
+    UnicodeSet  *fLinkingConsonantSet;
+    UnicodeSet  *fExtCccZwjSet;
      UnicodeSet  *fAnySet;
  
      const UnicodeString *fText;
@@ -1643,21 +1737,34 @@ RBBICharMonkey::RBBICharMonkey() {
      fHangulSet->addAll(*fLVTSet);
  
      fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
+    fViramaSet        = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
+                                        "\\p{Indic_Syllabic_Category=Virama}]", status);
+    fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
+                                        "\\p{Indic_Syllabic_Category=Consonant}]", status);
+    fExtCccZwjSet     = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
      fAnySet           = new UnicodeSet(0, 0x10ffff);
  
+    // Create sets of characters, and add the names of the above character sets.
+    // In each new ICU release, add new names corresponding to the sets above.
      fSets             = new UVector(status);
-    fSets->addElement(fCRLFSet,    status);
-    fSets->addElement(fControlSet, status);
-    fSets->addElement(fExtendSet,  status);
-    fSets->addElement(fRegionalIndicatorSet, status);
+
+    // Important: Keep class names the same as the class contents.
+    fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
+    fSets->addElement(fControlSet, status); classNames.push_back("Control");
+    fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
+    fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
      if (!fPrependSet->isEmpty()) {
-        fSets->addElement(fPrependSet, status);
+        fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
      }
-    fSets->addElement(fSpacingSet, status);
-    fSets->addElement(fHangulSet,  status);
-    fSets->addElement(fAnySet,     status);
-    fSets->addElement(fZWJSet,     status);
-    fSets->addElement(fExtendedPictSet, status);
+    fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
+    fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
+    fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
+    fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
+    fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
+    fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
+    fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
+    fSets->addElement(fAnySet, status); classNames.push_back("Any");
+
      if (U_FAILURE(status)) {
          deferredStatus = status;
      }
@@ -1666,6 +1773,7 @@ RBBICharMonkey::RBBICharMonkey() {
  
  void RBBICharMonkey::setText(const UnicodeString &s) {
      fText = &s;
+    prepareAppliedRules(s.length());  // Reset the applied-rule table to s.length() + 1 empty entries.
  }
  
  
@@ -1688,6 +1796,7 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
      if (prevPos >= fText->length()) {
          return -1;
      }
+
      p0 = p1 = p2 = p3 = prevPos;
      c3 =  fText->char32At(prevPos);
      c0 = c1 = c2 = cBase = 0;
@@ -1701,7 +1810,7 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
          p1 = p2;  c1 = c2;
          p2 = p3;  c2 = c3;
  
-        // Advancd p3 by one codepoint
+        // Advance p3 by one codepoint
          p3 = fText->moveIndex32(p3, 1);
          c3 = fText->char32At(p3);
  
@@ -1709,93 +1818,109 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
              // Still warming up the loop.  (won't work with zero length strings, but we don't care)
              continue;
          }
+
          if (p2 == fText->length()) {
-            // Reached end of string.  Always a break position.
+            setAppliedRule(p2, "End of String");
              break;
          }
  
-        // Rule  GB3   CR x LF
          //     No Extend or Format characters may appear between the CR and LF,
          //     which requires the additional check for p2 immediately following p1.
          //
          if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
-            continue;
+          setAppliedRule(p2, "GB3   CR x LF");
+          continue;
          }
  
-        // Rule (GB4).   ( Control | CR | LF ) <break>
          if (fControlSet->contains(c1) ||
              c1 == 0x0D ||
              c1 == 0x0A)  {
-            break;
+          setAppliedRule(p2, "GB4   ( Control | CR | LF ) <break>");
+          break;
          }
  
-        // Rule (GB5)    <break>  ( Control | CR | LF )
-        //
          if (fControlSet->contains(c2) ||
              c2 == 0x0D ||
              c2 == 0x0A)  {
+            setAppliedRule(p2, "GB5   <break>  ( Control | CR | LF )");
              break;
          }
  
-
-        // Rule (GB6)  L x ( L | V | LV | LVT )
          if (fLSet->contains(c1) &&
                 (fLSet->contains(c2)  ||
                  fVSet->contains(c2)  ||
                  fLVSet->contains(c2) ||
                  fLVTSet->contains(c2))) {
+            setAppliedRule(p2, "GB6   L x ( L | V | LV | LVT )");
              continue;
          }
  
-        // Rule (GB7)    ( LV | V )  x  ( V | T )
          if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
              (fVSet->contains(c2) || fTSet->contains(c2)))  {
+            setAppliedRule(p2, "GB7    ( LV | V )  x  ( V | T )");
              continue;
          }
  
-        // Rule (GB8)    ( LVT | T)  x T
          if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
              fTSet->contains(c2))  {
+            setAppliedRule(p2, "GB8   ( LVT | T)  x T");
              continue;
          }
  
-        // Rule (GB9)    x (Extend | ZWJ)
          if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
              if (!fExtendSet->contains(c1)) {
                  cBase = c1;
              }
+            setAppliedRule(p2, "GB9   x (Extend | ZWJ)");
              continue;
          }
  
-        // Rule (GB9a)   x  SpacingMark
          if (fSpacingSet->contains(c2)) {
+            setAppliedRule(p2, "GB9a  x  SpacingMark");
              continue;
          }
  
-        // Rule (GB9b)   Prepend x
          if (fPrependSet->contains(c1)) {
+            setAppliedRule(p2, "GB9b  Prepend x");
              continue;
          }
  
-        // Rule (GB11)   Extended_Pictographic Extend * ZWJ x Extended_Pictographic
+        //   Note: Viramas are also included in the ExtCccZwj class.
+        if (fLinkingConsonantSet->contains(c2)) {
+            int pi = p1;
+            bool sawVirama = false;
+            while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
+                if (fViramaSet->contains(fText->char32At(pi))) {
+                    sawVirama = true;
+                }
+                pi = fText->moveIndex32(pi, -1);
+            }
+            if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
+              setAppliedRule(p2, "GB9.3  LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
+              continue;
+            }
+        }
+
          if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
-            continue;
+          setAppliedRule(p2, "GB11  Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
+          continue;
          }
  
-        // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
          //                   Note: The first if condition is a little tricky. We only need to force
          //                      a break if there are three or more contiguous RIs. If there are
          //                      only two, a break following will occur via other rules, and will include
          //                      any trailing extend characters, which is needed behavior.
          if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
                  && fRegionalIndicatorSet->contains(c2)) {
-            break;
+          setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
+          break;
          }
          if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
-            continue;
+          setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
+          continue;
          }
  
-        // Rule (GB999)  Any  <break>  Any
+        setAppliedRule(p2, "GB999 Any <break> Any");
          break;
      }
  
@@ -1809,7 +1934,6 @@ UVector  *RBBICharMonkey::charClasses() {
      return fSets;
  }
  
-
  RBBICharMonkey::~RBBICharMonkey() {
      delete fSets;
      delete fCRLFSet;
@@ -1827,6 +1951,9 @@ RBBICharMonkey::~RBBICharMonkey() {
      delete fAnySet;
      delete fZWJSet;
      delete fExtendedPictSet;
+    delete fViramaSet;
+    delete fLinkingConsonantSet;
+    delete fExtCccZwjSet;
  }
  
  //------------------------------------------------------------------------------------------
@@ -1889,10 +2016,15 @@ RBBIWordMonkey::RBBIWordMonkey()
      fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
      fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]",    status);
      fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
-    fNumericSet       = new UnicodeSet(u"[[\\p{Word_Break = Numeric}][\\uff10-\\uff19]]", status);
+    fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
      fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
      fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
-    fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}]",       status);
+    // There are some sc=Hani characters with WB=Extend.
+    // The break rules need to pick one or the other because
+    // Extend overlapping with something else is messy.
+    // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
+    // in $Han (for $dictionary) and out of $Extend.
+    fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
      fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);
  
      fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
@@ -1934,29 +2066,32 @@ RBBIWordMonkey::RBBIWordMonkey()
      // Inhibit dictionary characters from being tested at all.
      fOtherSet->removeAll(*fDictionarySet);
  
-    fSets->addElement(fCRSet,                status);
-    fSets->addElement(fLFSet,                status);
-    fSets->addElement(fNewlineSet,           status);
-    fSets->addElement(fRegionalIndicatorSet, status);
-    fSets->addElement(fHebrew_LetterSet,     status);
-    fSets->addElement(fALetterSet,           status);
-    fSets->addElement(fSingle_QuoteSet,      status);
-    fSets->addElement(fDouble_QuoteSet,      status);
-    //fSets->addElement(fKatakanaSet,          status); // Omit Katakana from fSets, which omits Katakana characters
-                                                        // from the test data. They are all in the dictionary set,
-                                                        // which this (old, to be retired) monkey test cannot handle.
-    fSets->addElement(fMidLetterSet,         status);
-    fSets->addElement(fMidNumLetSet,         status);
-    fSets->addElement(fMidNumSet,            status);
-    fSets->addElement(fNumericSet,           status);
-    fSets->addElement(fFormatSet,            status);
-    fSets->addElement(fExtendSet,            status);
-    fSets->addElement(fOtherSet,             status);
-    fSets->addElement(fExtendNumLetSet,      status);
-    fSets->addElement(fWSegSpaceSet,         status);
-
-    fSets->addElement(fZWJSet,               status);
-    fSets->addElement(fExtendedPictSet,      status);
+    // Add classes and their names
+    fSets->addElement(fCRSet, status); classNames.push_back("CR");
+    fSets->addElement(fLFSet, status); classNames.push_back("LF");
+    fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
+    fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
+    fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
+    fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
+    fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
+    fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
+    // Omit Katakana from fSets, which omits Katakana characters
+    // from the test data. They are all in the dictionary set,
+    // which this (old, to be retired) monkey test cannot handle.
+    //fSets->addElement(fKatakanaSet, status);
+
+    fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
+    fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
+    fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
+    fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
+    fSets->addElement(fFormatSet, status); classNames.push_back("Format");
+    fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
+    fSets->addElement(fOtherSet, status); classNames.push_back("Other");
+    fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
+    fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
+
+    fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
+    fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
  
      if (U_FAILURE(status)) {
          deferredStatus = status;
@@ -1965,6 +2100,7 @@ RBBIWordMonkey::RBBIWordMonkey()
  
  void RBBIWordMonkey::setText(const UnicodeString &s) {
      fText       = &s;
+    prepareAppliedRules(s.length());  // Reset the applied-rule table to s.length() + 1 empty entries.
  }
  
  
@@ -1997,14 +2133,14 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
          p1 = p2;  c1 = c2;
          p2 = p3;  c2 = c3;
  
-        // Advancd p3 by    X(Extend | Format)*   Rule 4
+        // Advance p3 by    X(Extend | Format)*   Rule 4
          //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
          do {
              p3 = fText->moveIndex32(p3, 1);
              c3 = fText->char32At(p3);
              if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
                 break;
-            };
+            }
          }
          while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
  
@@ -2013,141 +2149,145 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
              // Still warming up the loop.  (won't work with zero length strings, but we don't care)
              continue;
          }
+
          if (p2 == fText->length()) {
              // Reached end of string.  Always a break position.
              break;
          }
  
-        // Rule  (3)   CR x LF
          //     No Extend or Format characters may appear between the CR and LF,
          //     which requires the additional check for p2 immediately following p1.
          //
          if (c1==0x0D && c2==0x0A) {
-            continue;
+          setAppliedRule(p2, "WB3   CR x LF");
+          continue;
          }
  
-        // Rule (3a)  Break before and after newlines (including CR and LF)
-        //
          if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
+            setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
              break;
-        };
+        }
          if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
+            setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
              break;
-        };
+        }
  
-        // Rule (3c)    ZWJ x Extended_Pictographic
          //              Not ignoring extend chars, so peek into input text to
          //              get the potential ZWJ, the character immediately preceding c2.
          //              Sloppy UChar32 indexing: p2-1 may reference trail half
          //              but char32At will get the full code point.
-        if (fZWJSet->contains(fText->char32At(p2-1)) && fExtendedPictSet->contains(c2)) {
+        if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
+            setAppliedRule(p2, "WB3c  ZWJ x Extended_Pictographic");
              continue;
          }
  
-        // Rule (3d)    Keep horizontal whitespace together.
          if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
+            setAppliedRule(p2, "WB3d  Keep horizontal whitespace together.");
              continue;
          }
  
-        // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
          if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
              (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
+            setAppliedRule(p2, "WB5   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");  // Rule 5; rule 4 is the Extend/Format skipping done when advancing p3.
              continue;
          }
  
-        // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
-        //
          if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
               (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
               (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
+            setAppliedRule(p2,  // Rule text names both letter classes after the mid-letter, matching the condition above.
+                           "WB6   (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)");
              continue;
          }
  
-        // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
          if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
              (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
              (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
+            setAppliedRule(p2,
+                           "WB7   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)");
              continue;
          }
  
-        // Rule (7a)     Hebrew_Letter x Single_Quote
          if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
+            setAppliedRule(p2, "WB7a  Hebrew_Letter x Single_Quote");
              continue;
          }
  
-        // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
-        if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
+          if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
+            setAppliedRule(p2, "WB7b  Hebrew_Letter x Double_Quote Hebrew_Letter");
              continue;
          }
  
-        // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
          if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
+            setAppliedRule(p2, "WB7c  Hebrew_Letter Double_Quote x Hebrew_Letter");
              continue;
          }
  
-        // Rule (8)    Numeric x Numeric
          if (fNumericSet->contains(c1) &&
-            fNumericSet->contains(c2))  {
+            fNumericSet->contains(c2)) {
+            setAppliedRule(p2, "WB8   Numeric x Numeric");
              continue;
          }
  
-        // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
          if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
-            fNumericSet->contains(c2))  {
+            fNumericSet->contains(c2)) {
+            setAppliedRule(p2, "WB9   (ALetter | Hebrew_Letter) x Numeric");
              continue;
          }
  
-        // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
          if (fNumericSet->contains(c1) &&
              (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
+            setAppliedRule(p2, "WB10   Numeric x (ALetter | Hebrew_Letter)");
              continue;
          }
  
-        // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
-        if (fNumericSet->contains(c0) &&
+          if (fNumericSet->contains(c0) &&
              (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
              fNumericSet->contains(c2)) {
+            setAppliedRule(p2, "WB11  Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric");
              continue;
          }
  
-        // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
          if (fNumericSet->contains(c1) &&
              (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
              fNumericSet->contains(c3)) {
+            setAppliedRule(p2, "WB12  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
              continue;
          }
  
-        // Rule (13)  Katakana x Katakana
          //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
          //                  all Katakana are handled by the dictionary breaker.
          if (fKatakanaSet->contains(c1) &&
              fKatakanaSet->contains(c2))  {
+            setAppliedRule(p2, "WB13  Katakana x Katakana");
              continue;
          }
  
-        // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
          if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
               fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
               fExtendNumLetSet->contains(c2)) {
-                continue;
+            setAppliedRule(p2,
+                           "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
+            continue;
          }
  
-        // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
          if (fExtendNumLetSet->contains(c1) &&
                  (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
                   fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
+            setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
              continue;
          }
  
-        // Rule 15 - 17   Group pairs of Regional Indicators.
          if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
+            setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
              break;
          }
          if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
+            setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
              continue;
          }
  
-        // Rule 999.  Break found here.
+        setAppliedRule(p2, "WB999");
          break;
      }
  
@@ -2160,7 +2300,6 @@ UVector  *RBBIWordMonkey::charClasses() {
      return fSets;
  }
  
-
  RBBIWordMonkey::~RBBIWordMonkey() {
      delete fSets;
      delete fCRSet;
@@ -2224,7 +2363,6 @@ private:
      UnicodeSet  *fExtendSet;
  
      const UnicodeString  *fText;
-
  };
  
  RBBISentMonkey::RBBISentMonkey()
@@ -2269,19 +2407,19 @@ RBBISentMonkey::RBBISentMonkey()
      fOtherSet->removeAll(*fCloseSet);
      fOtherSet->removeAll(*fExtendSet);
  
-    fSets->addElement(fSepSet,       status);
-    fSets->addElement(fFormatSet,    status);
-    fSets->addElement(fSpSet,        status);
-    fSets->addElement(fLowerSet,     status);
-    fSets->addElement(fUpperSet,     status);
-    fSets->addElement(fOLetterSet,   status);
-    fSets->addElement(fNumericSet,   status);
-    fSets->addElement(fATermSet,     status);
-    fSets->addElement(fSContinueSet, status);
-    fSets->addElement(fSTermSet,     status);
-    fSets->addElement(fCloseSet,     status);
-    fSets->addElement(fOtherSet,     status);
-    fSets->addElement(fExtendSet,    status);
+    fSets->addElement(fSepSet, status); classNames.push_back("Sep");
+    fSets->addElement(fFormatSet, status); classNames.push_back("Format");
+    fSets->addElement(fSpSet, status); classNames.push_back("Sp");
+    fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
+    fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
+    fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
+    fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
+    fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
+    fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
+    fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
+    fSets->addElement(fCloseSet, status); classNames.push_back("Close");
+    fSets->addElement(fOtherSet, status); classNames.push_back("Other");
+    fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
  
      if (U_FAILURE(status)) {
          deferredStatus = status;
@@ -2292,13 +2430,13 @@ RBBISentMonkey::RBBISentMonkey()
  
  void RBBISentMonkey::setText(const UnicodeString &s) {
      fText       = &s;
+    prepareAppliedRules(s.length());
  }
  
  UVector  *RBBISentMonkey::charClasses() {
      return fSets;
  }
  
-
  //  moveBack()   Find the "significant" code point preceding the index i.
  //               Skips over ($Extend | $Format)* .
  //
@@ -2370,43 +2508,45 @@ int32_t RBBISentMonkey::next(int32_t prevPos) {
          p1 = p2;  c1 = c2;
          p2 = p3;  c2 = c3;
  
-        // Advancd p3 by    X(Extend | Format)*   Rule 4
+        // Advance p3 by    X(Extend | Format)*   Rule 4
          p3 = moveForward(p3);
          c3 = cAt(p3);
  
-        // Rule (3)  CR x LF
          if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
+            setAppliedRule(p2, "SB3   CR x LF");
              continue;
          }
  
-        // Rule (4).   Sep  <break>
          if (fSepSet->contains(c1)) {
              p2 = p1+1;   // Separators don't combine with Extend or Format.
+
+            setAppliedRule(p2, "SB4   Sep  <break>");
              break;
          }
  
          if (p2 >= fText->length()) {
              // Reached end of string.  Always a break position.
+            setAppliedRule(p2, "SB4   Sep  <break>");
              break;
          }
  
          if (p2 == prevPos) {
              // Still warming up the loop.  (won't work with zero length strings, but we don't care)
+            setAppliedRule(p2, "SB4   Sep  <break>");
              continue;
          }
  
-        // Rule (6).   ATerm x Numeric
          if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
+            setAppliedRule(p2, "SB6   ATerm x Numeric");
              continue;
          }
  
-        // Rule (7).  (Upper | Lower) ATerm  x  Uppper
          if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
                  fATermSet->contains(c1) && fUpperSet->contains(c2)) {
+            setAppliedRule(p2, "SB7   (Upper | Lower) ATerm  x  Uppper");
              continue;
          }
  
-        // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
          //           Note:  STerm | ATerm are added to the negated part of the expression by a
          //                  note to the Unicode 5.0 documents.
          int p8 = p1;
@@ -2423,16 +2563,21 @@ int32_t RBBISentMonkey::next(int32_t prevPos) {
                  if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
                      fLowerSet->contains(c) || fSepSet->contains(c) ||
                      fATermSet->contains(c) || fSTermSet->contains(c))  {
+
+                    setAppliedRule(p2,
+                                   "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
                      break;
                  }
                  p8 = moveForward(p8);
              }
              if (fLowerSet->contains(cAt(p8))) {
+
+                setAppliedRule(p2,
+                               "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
                  continue;
              }
          }
  
-        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
          if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
              p8 = p1;
              while (fSpSet->contains(cAt(p8))) {
@@ -2443,11 +2588,11 @@ int32_t RBBISentMonkey::next(int32_t prevPos) {
              }
              c = cAt(p8);
              if (fSTermSet->contains(c) || fATermSet->contains(c)) {
+                setAppliedRule(p2, "SB8a  (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
                  continue;
              }
          }
  
-        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
          int p9 = p1;
          while (fCloseSet->contains(cAt(p9))) {
              p9 = moveBack(p9);
@@ -2455,11 +2600,12 @@ int32_t RBBISentMonkey::next(int32_t prevPos) {
          c = cAt(p9);
          if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
              if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
+
+                setAppliedRule(p2, "SB9  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)");
                  continue;
              }
          }
  
-        // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
          int p10 = p1;
          while (fSpSet->contains(cAt(p10))) {
              p10 = moveBack(p10);
@@ -2469,11 +2615,11 @@ int32_t RBBISentMonkey::next(int32_t prevPos) {
          }
          if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
              if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
+                setAppliedRule(p2, "SB10  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)");
                  continue;
              }
          }
  
-        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
          int p11 = p1;
          if (fSepSet->contains(cAt(p11))) {
              p11 = moveBack(p11);
@@ -2485,12 +2631,14 @@ int32_t RBBISentMonkey::next(int32_t prevPos) {
              p11 = moveBack(p11);
          }
          if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
+          setAppliedRule(p2, "SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>");
              break;
          }
  
-        //  Rule (12)  Any x Any
+        setAppliedRule(p2, "SB12  Any x Any");
          continue;
      }
+
      breakPos = p2;
      return breakPos;
  }
@@ -2574,6 +2722,8 @@ private:
      UnicodeSet  *fEB;
      UnicodeSet  *fEM;
      UnicodeSet  *fZWJ;
+    UnicodeSet  *fOP30;
+    UnicodeSet  *fCP30;
  
      BreakIterator        *fCharBI;
      const UnicodeString  *fText;
@@ -2614,7 +2764,7 @@ RBBILineMonkey::RBBILineMonkey() :
      fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
      fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
      fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
-    fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
+    fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=CL}] [\\u201D]]"), status); // en adjustments for rdar://problem/51193810
      fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
      fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
      fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
@@ -2622,8 +2772,8 @@ RBBILineMonkey::RBBILineMonkey() :
      fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
      fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
      fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
-    fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
-    fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
+    fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=OP}] [\\u201C\\u2018]]"), status); // en adjustments
+    fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=QU}]-[\\u201C\\u2018\\u201D]]"), status); // en adjustments
      fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
      fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
      fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
@@ -2639,7 +2789,9 @@ RBBILineMonkey::RBBILineMonkey() :
      fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
      fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
      fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
-    fZWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
+    fZWJ   = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
+    fOP30  = new UnicodeSet(u"[[\\p{Line_break=OP} [\\u201C\\u2018]]-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status); // en adjustments
+    fCP30  = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
  
      if (U_FAILURE(status)) {
          deferredStatus = status;
@@ -2655,48 +2807,51 @@ RBBILineMonkey::RBBILineMonkey() :
  
      fHH->add(u'\u2010');   // Hyphen, '‐'
  
-    fSets->addElement(fBK, status);
-    fSets->addElement(fCR, status);
-    fSets->addElement(fLF, status);
-    fSets->addElement(fCM, status);
-    fSets->addElement(fNL, status);
-    fSets->addElement(fWJ, status);
-    fSets->addElement(fZW, status);
-    fSets->addElement(fGL, status);
-    fSets->addElement(fCB, status);
-    fSets->addElement(fSP, status);
-    fSets->addElement(fB2, status);
-    fSets->addElement(fBA, status);
-    fSets->addElement(fBB, status);
-    fSets->addElement(fHY, status);
-    fSets->addElement(fH2, status);
-    fSets->addElement(fH3, status);
-    fSets->addElement(fCL, status);
-    fSets->addElement(fCP, status);
-    fSets->addElement(fEX, status);
-    fSets->addElement(fIN, status);
-    fSets->addElement(fJL, status);
-    fSets->addElement(fJT, status);
-    fSets->addElement(fJV, status);
-    fSets->addElement(fNS, status);
-    fSets->addElement(fOP, status);
-    fSets->addElement(fQU, status);
-    fSets->addElement(fIS, status);
-    fSets->addElement(fNU, status);
-    fSets->addElement(fPO, status);
-    fSets->addElement(fPR, status);
-    fSets->addElement(fSY, status);
-    fSets->addElement(fAI, status);
-    fSets->addElement(fAL, status);
-    fSets->addElement(fHL, status);
-    fSets->addElement(fID, status);
-    fSets->addElement(fWJ, status);
-    fSets->addElement(fRI, status);
-    fSets->addElement(fSG, status);
-    fSets->addElement(fEB, status);
-    fSets->addElement(fEM, status);
-    fSets->addElement(fZWJ, status);
-
+    // Sets and names.
+    fSets->addElement(fBK, status); classNames.push_back("fBK");
+    fSets->addElement(fCR, status); classNames.push_back("fCR");
+    fSets->addElement(fLF, status); classNames.push_back("fLF");
+    fSets->addElement(fCM, status); classNames.push_back("fCM");
+    fSets->addElement(fNL, status); classNames.push_back("fNL");
+    fSets->addElement(fWJ, status); classNames.push_back("fWJ");
+    fSets->addElement(fZW, status); classNames.push_back("fZW");
+    fSets->addElement(fGL, status); classNames.push_back("fGL");
+    fSets->addElement(fCB, status); classNames.push_back("fCB");
+    fSets->addElement(fSP, status); classNames.push_back("fSP");
+    fSets->addElement(fB2, status); classNames.push_back("fB2");
+    fSets->addElement(fBA, status); classNames.push_back("fBA");
+    fSets->addElement(fBB, status); classNames.push_back("fBB");
+    fSets->addElement(fHY, status); classNames.push_back("fHY");
+    fSets->addElement(fH2, status); classNames.push_back("fH2");
+    fSets->addElement(fH3, status); classNames.push_back("fH3");
+    fSets->addElement(fCL, status); classNames.push_back("fCL");
+    fSets->addElement(fCP, status); classNames.push_back("fCP");
+    fSets->addElement(fEX, status); classNames.push_back("fEX");
+    fSets->addElement(fIN, status); classNames.push_back("fIN");
+    fSets->addElement(fJL, status); classNames.push_back("fJL");
+    fSets->addElement(fJT, status); classNames.push_back("fJT");
+    fSets->addElement(fJV, status); classNames.push_back("fJV");
+    fSets->addElement(fNS, status); classNames.push_back("fNS");
+    fSets->addElement(fOP, status); classNames.push_back("fOP");
+    fSets->addElement(fQU, status); classNames.push_back("fQU");
+    fSets->addElement(fIS, status); classNames.push_back("fIS");
+    fSets->addElement(fNU, status); classNames.push_back("fNU");
+    fSets->addElement(fPO, status); classNames.push_back("fPO");
+    fSets->addElement(fPR, status); classNames.push_back("fPR");
+    fSets->addElement(fSY, status); classNames.push_back("fSY");
+    fSets->addElement(fAI, status); classNames.push_back("fAI");
+    fSets->addElement(fAL, status); classNames.push_back("fAL");
+    fSets->addElement(fHL, status); classNames.push_back("fHL");
+    fSets->addElement(fID, status); classNames.push_back("fID");
+    fSets->addElement(fWJ, status); classNames.push_back("fWJ");
+    fSets->addElement(fRI, status); classNames.push_back("fRI");
+    fSets->addElement(fSG, status); classNames.push_back("fSG");
+    fSets->addElement(fEB, status); classNames.push_back("fEB");
+    fSets->addElement(fEM, status); classNames.push_back("fEM");
+    fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
+    // TODO: fOP30 & fCP30 overlap with plain fOP & fCP. Probably OK, but OP/CP chars will be over-represented.
+    fSets->addElement(fOP30, status); classNames.push_back("fOP30");
+    fSets->addElement(fCP30, status); classNames.push_back("fCP30");
  
      const char *rules =
              "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
@@ -2715,12 +2870,14 @@ RBBILineMonkey::RBBILineMonkey() :
      if (U_FAILURE(status)) {
          deferredStatus = status;
      }
+
  }
  
  
  void RBBILineMonkey::setText(const UnicodeString &s) {
      fText       = &s;
      fCharBI->setText(s);
+    prepareAppliedRules(s.length());
      fNumberMatcher->reset(s);
  }
  
@@ -2744,8 +2901,8 @@ void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos
      int32_t  nPos = *nextPos;
  
      // LB 9  Keep combining sequences together.
-    //  advance over any CM class chars.  Note that Line Break CM is different
-    //  from the normal Grapheme Extend property.
+    // advance over any CM class chars.  Note that Line Break CM is different
+    // from the normal Grapheme Extend property.
      if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
            *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
          for (;;) {
@@ -2827,59 +2984,67 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
          nextCPPos = fText->moveIndex32(pos, 1);
          nextPos   = nextCPPos;
  
-        // Rule LB2 - Break at end of text.
+
          if (pos >= fText->length()) {
+            setAppliedRule(pos, "LB2 - Break at end of text.");
              break;
          }
  
-        // Rule LB 9 - adjust for combining sequences.
+
          //             We do this one out-of-order because the adjustment does not change anything
          //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
          //             be applied.
-        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
+        rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
          nextCPPos = nextPos = fText->moveIndex32(pos, 1);
          c = fText->char32At(nextPos);
-        rule9Adjust(pos,     &thisChar, &nextPos, &c);
+        rule9Adjust(pos, &thisChar, &nextPos, &c);
  
          // If the loop is still warming up - if we haven't shifted the initial
          //   -1 positions out of prevPos yet - loop back to advance the
          //    position in the input without any further looking for breaks.
          if (prevPos == -1) {
+          setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
              continue;
          }
  
-        // LB 4  Always break after hard line breaks,
+
          if (fBK->contains(prevChar)) {
+            setAppliedRule(pos, "LB 4  Always break after hard line breaks");
              break;
          }
  
-        // LB 5  Break after CR, LF, NL, but not inside CR LF
+
          if (prevChar == 0x0d && thisChar == 0x0a) {
+            setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
              continue;
          }
          if (prevChar == 0x0d ||
              prevChar == 0x0a ||
              prevChar == 0x85)  {
+            setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
              break;
          }
  
-        // LB 6  Don't break before hard line breaks
+
          if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
              fBK->contains(thisChar)) {
-                continue;
+            setAppliedRule(pos, "LB 6  Don't break before hard line breaks");
+            continue;
          }
  
  
-        // LB 7  Don't break before spaces or zero-width space.
          if (fSP->contains(thisChar)) {
+            setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
              continue;
          }
  
+        // LB 7 covers both "x SP" and "x ZW" in UAX #14, so the same label applies here.
          if (fZW->contains(thisChar)) {
+            setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
              continue;
          }
  
-        // LB 8  Break after zero width space
+
          //       ZW SP* ÷
          //       Scan backwards from prevChar for SP* ZW
          tPos = prevPos;
@@ -2887,14 +3052,16 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
              tPos = fText->moveIndex32(tPos, -1);
          }
          if (fZW->contains(fText->char32At(tPos))) {
+            setAppliedRule(pos, "LB 8  Break after zero width space");
              break;
          }
  
-        // LB 25    Numbers
+
          //          Move this test up, before LB8a, because numbers can match a longer sequence that would
          //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
          if (fNumberMatcher->lookingAt(prevPos, status)) {
              if (U_FAILURE(status)) {
+                setAppliedRule(pos, "LB 25 Numbers");
                  break;
              }
              // Matched a number.  But could have been just a single digit, which would
@@ -2912,11 +3079,12 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
                          thisChar = fText->char32At(pos);
                      } while (fCM->contains(thisChar));
                  }
+                setAppliedRule(pos, "LB 25 Numbers");
                  continue;
              }
          }
  
-        // LB 8a ZWJ x
+
          //       The monkey test's way of ignoring combining characters doesn't work
          //       for this rule. ZJ is also a CM. Need to get the actual character
          //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
@@ -2924,46 +3092,48 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
              int32_t prevIdx = fText->moveIndex32(pos, -1);
              UChar32 prevC = fText->char32At(prevIdx);
              if (fZWJ->contains(prevC)) {
+                setAppliedRule(pos, "LB 8a ZWJ x");
                  continue;
              }
          }
  
-        // LB 9, 10  Already done, at top of loop.
+
+        // LB 9, 10  Already done, at top of loop.
          //
  
  
-        // LB 11  Do not break before or after WORD JOINER and related characters.
          //    x  WJ
          //    WJ  x
          //
          if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
+            setAppliedRule(pos, "LB 11  Do not break before or after WORD JOINER and related characters.");
              continue;
          }
  
-        // LB 12
-        //    GL  x
+
          if (fGL->contains(prevChar)) {
+            setAppliedRule(pos, "LB 12  GL  x");
              continue;
          }
  
-        // LB 12a
-        //    [^SP BA HY] x GL
+
          if (!(fSP->contains(prevChar) ||
                fBA->contains(prevChar) ||
                fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
+            setAppliedRule(pos, "LB 12a  [^SP BA HY] x GL");
              continue;
          }
  
-        // LB 13  Don't break before closings.
-        //
+
          if (fCL->contains(thisChar) ||
                  fCP->contains(thisChar) ||
                  fEX->contains(thisChar) ||
                  fSY->contains(thisChar)) {
+            setAppliedRule(pos, "LB 13  Don't break before closings.");
              continue;
          }
  
-        // LB 14 Don't break after OP SP*
+
          //       Scan backwards, checking for this sequence.
          //       The OP char could include combining marks, so we actually check for
          //           OP CM* SP*
@@ -2981,26 +3151,28 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
              tPos=fText->moveIndex32(tPos, -1);
          }
          if (fOP->contains(fText->char32At(tPos))) {
+            setAppliedRule(pos, "LB 14 Don't break after OP SP*");
              continue;
          }
  
  
-        // LB 14a Break before an IS that begins a number and follows a space
          if (nextPos < fText->length()) {
              // note: UnicodeString::char32At(length) returns ffff, not distinguishable
              //       from a legit ffff character. So test length separately.
              UChar32 nextChar = fText->char32At(nextPos);
              if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
+                setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
                  break;
              }
          }
  
-        // LB14b Do not break before numeric separators, even after spaces.
+
          if (fIS->contains(thisChar)) {
+            setAppliedRule(pos, "LB 14b  Do not break before numeric separators, even after spaces.");
              continue;
          }
  
-        // LB 15    QU SP* x OP
+
          if (fOP->contains(thisChar)) {
              // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
              int tPos = prevPos;
@@ -3011,13 +3183,12 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
                  tPos = fText->moveIndex32(tPos, -1);
              }
              if (fQU->contains(fText->char32At(tPos))) {
+                setAppliedRule(pos, "LB 15    QU SP* x OP");
                  continue;
              }
          }
  
  
-
-        // LB 16   (CL | CP) SP* x NS
          //    Scan backwards for SP* CM* (CL | CP)
          if (fNS->contains(thisChar)) {
              int tPos = prevPos;
@@ -3028,12 +3199,12 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
                  tPos = fText->moveIndex32(tPos, -1);
              }
              if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
+                setAppliedRule(pos, "LB 16   (CL | CP) SP* x NS");
                  continue;
              }
          }
  
  
-        // LB 17        B2 SP* x B2
          if (fB2->contains(thisChar)) {
              //  Scan backwards, checking for the B2 CM* SP* sequence.
              tPos = prevPos;
@@ -3046,163 +3217,169 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
                  tPos=fText->moveIndex32(tPos, -1);
              }
              if (fB2->contains(fText->char32At(tPos))) {
+                setAppliedRule(pos, "LB 17   B2 SP* x B2");
                  continue;
              }
          }
  
  
-        // LB 18    break after space
          if (fSP->contains(prevChar)) {
+            setAppliedRule(pos, "LB 18    break after space");
              break;
          }
  
-        // LB 19
          //    x   QU
          //    QU  x
          if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
+            setAppliedRule(pos, "LB 19");
              continue;
          }
  
-        // LB 20  Break around a CB
          if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
+            setAppliedRule(pos, "LB 20  Break around a CB");
              break;
          }
  
-        // LB 20.09  Don't break between Hyphens and letters if a break precedes the hyphen.
+        //           Don't break between Hyphens and letters if a break precedes the hyphen.
          //           Formerly this was a Finnish tailoring.
          //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
-        //    ^($HY | $HH) $AL;
+        //           ^($HY | $HH) $AL;
          if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
                  prevPosX2 == -1) {
+            setAppliedRule(pos, "LB 20.09");
              continue;
          }
  
-        // LB 21
          if (fBA->contains(thisChar) ||
              fHY->contains(thisChar) ||
              fNS->contains(thisChar) ||
              fBB->contains(prevChar) )   {
+            setAppliedRule(pos, "LB 21");
              continue;
          }
  
-        // LB 21a
-        //   HL (HY | BA) x
          if (fHL->contains(prevCharX2) &&
                  (fHY->contains(prevChar) || fBA->contains(prevChar))) {
+            setAppliedRule(pos, "LB 21a   HL (HY | BA) x");
              continue;
          }
  
-        // LB 21b
-        //   SY x HL
          if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
+            setAppliedRule(pos, "LB 21b SY x HL");
              continue;
          }
  
-        // LB 22
-        if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
-            (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
-            (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
-            ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
-            (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
-            (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
+        if (fIN->contains(thisChar))   {
+            setAppliedRule(pos, "LB 22");
              continue;
          }
  
  
-        // LB 23    (AL | HL) x NU
+        //          (AL | HL) x NU
          //          NU x (AL | HL)
          if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
+            setAppliedRule(pos, "LB 23");
              continue;
          }
          if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
+            setAppliedRule(pos, "LB 23");
              continue;
          }
  
-        // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
+        // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
          //      PR x (ID | EB | EM)
          //     (ID | EB | EM) x PO
          if (fPR->contains(prevChar) &&
                  (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
+            setAppliedRule(pos, "LB 23a");
              continue;
          }
          if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
                  fPO->contains(thisChar)) {
+            setAppliedRule(pos, "LB 23a");
              continue;
          }
  
-        // LB 24  Do not break between prefix and letters or ideographs.
+        //   Do not break between prefix and letters or ideographs.
          //         (PR | PO) x (AL | HL)
          //         (AL | HL) x (PR | PO)
          if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
                  (fAL->contains(thisChar) || fHL->contains(thisChar))) {
+            setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
              continue;
          }
          if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
                  (fPR->contains(thisChar) || fPO->contains(thisChar))) {
+            setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
              continue;
          }
  
-        // LB 25 numbers match, moved up, before LB 8a,
+        // LB 25  Numbers: already handled above (moved up, before LB 8a).
  
-        // LB 26 Do not break a Korean syllable.
          if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
                                          fJV->contains(thisChar) ||
                                          fH2->contains(thisChar) ||
                                          fH3->contains(thisChar))) {
-                                            continue;
+            setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
+            continue;
                                          }
  
          if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
              (fJV->contains(thisChar) || fJT->contains(thisChar))) {
-                continue;
+            setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
+            continue;
          }
  
          if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
              fJT->contains(thisChar)) {
-                continue;
+            setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
+            continue;
          }
  
-        // LB 27 Treat a Korean Syllable Block the same as ID.
          if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
              fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
              fIN->contains(thisChar)) {
-                continue;
+            setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
+            continue;
              }
          if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
              fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
              fPO->contains(thisChar)) {
-                continue;
+            setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
+            continue;
              }
          if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
              fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
-                continue;
+            setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
+            continue;
              }
  
  
  
-        // LB 28  Do not break between alphabetics ("at").
          if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
+            setAppliedRule(pos, "LB 28  Do not break between alphabetics (\"at\").");
              continue;
          }
  
-        // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
          if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
+            setAppliedRule(pos, "LB 29  Do not break between numeric punctuation and alphabetics (\"e.g.\").");
              continue;
          }
  
-        // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
          //          (AL | NU) x OP
          //          CP x (AL | NU)
-        if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
+        if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
+            setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
              continue;
          }
-        if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
+        if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
+            setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
              continue;
          }
  
-        // LB30a    RI RI  ÷  RI
          //             RI  x  RI
          if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
+            setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
              break;
          }
          if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
@@ -3210,17 +3387,17 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
              // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
              // following RI. This is a hack.
              thisChar = -1;
+            setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
              continue;
          }
  
-        // LB30b    Emoji Base x Emoji Modifier
          if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
+            setAppliedRule(pos, "LB30b    Emoji Base x Emoji Modifier");
              continue;
          }
  
-        // LB 31    Break everywhere else
+        setAppliedRule(pos, "LB 31    Break everywhere else");
          break;
-
      }
  
      return pos;
@@ -3278,6 +3455,8 @@ RBBILineMonkey::~RBBILineMonkey() {
      delete fEB;
      delete fEM;
      delete fZWJ;
+    delete fOP30;
+    delete fCP30;
  
      delete fCharBI;
      delete fNumberMatcher;
@@ -3315,7 +3494,7 @@ static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t d
              paramLength = (int32_t)(sizeof(valString)-2);
          }
          params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
-        val = strtol(valString,  NULL, 10);
+        val = strtol(valString, NULL, 10);
  
          // Delete this parameter from the params string.
          m.reset();
@@ -3643,6 +3822,7 @@ void RBBITest::TestLineBreaks(void)
          int expectedcount = 0;
  
          monkey.setText(ustr);
+
          int i;
          for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
              if (expectedcount >= EXPECTEDSIZE) {
@@ -3707,6 +3887,7 @@ void RBBITest::TestSentBreaks(void)
          int expectedcount = 0;
  
          monkey.setText(ustr);
+
          int i;
          for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
              if (expectedcount >= EXPECTEDSIZE) {
@@ -3822,7 +4003,7 @@ void RBBITest::TestMonkey() {
              loopCount = loopCount / 10;   // Sentence runs slower than the other break types
          }
          if (U_SUCCESS(status)) {
-            RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
+            RunMonkey(bi, m, "sent", seed, loopCount, useUText);
          }
          else {
              errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
@@ -3851,7 +4032,6 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
      UnicodeString    testText;
      int32_t          numCharClasses;
      UVector          *chClasses;
-    int              expected[TESTSTRINGLEN*2 + 1];
      int              expectedCount = 0;
      char             expectedBreaks[TESTSTRINGLEN*2 + 1];
      char             forwardBreaks[TESTSTRINGLEN*2 + 1];
@@ -3862,6 +4042,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
      int              i;
      int              loopCount = 0;
  
+
      m_seed = seed;
  
      numCharClasses = mk.charClasses()->size();
@@ -3884,6 +4065,9 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
          }
      }
  
+    // For minimizing width of class name output.
+    int classNameSize = mk.maxClassNameSize();
+
      while (loopCount < numIterations || numIterations == -1) {
          if (numIterations == -1 && loopCount % 10 == 0) {
              // If test is running in an infinite loop, display a periodic tic so
@@ -3914,8 +4098,9 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
              testText.append(c);
          }
  
-        // Calculate the expected results for this test string.
+        // Calculate the expected results for this test string and reset applied rules.
          mk.setText(testText);
+
          memset(expectedBreaks, 0, sizeof(expectedBreaks));
          expectedBreaks[0] = 1;
          int32_t breakPos = 0;
@@ -3930,9 +4115,6 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
              }
              expectedBreaks[breakPos] = 1;
              U_ASSERT(expectedCount<testText.length());
-            expected[expectedCount ++] = breakPos;
-            (void)expected;   // Set but not used warning.
-                              // TODO (andy): check it out.
          }
  
          // Find the break positions using forward iteration
@@ -4025,25 +4207,31 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
          // Compare the expected and actual results.
          for (i=0; i<=testText.length(); i++) {
              const char *errorType = NULL;
+            const char* currentBreakData = NULL;
              if  (forwardBreaks[i] != expectedBreaks[i]) {
                  errorType = "next()";
+                currentBreakData = forwardBreaks;
              } else if (reverseBreaks[i] != forwardBreaks[i]) {
                  errorType = "previous()";
-            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
+                currentBreakData = reverseBreaks;
+           } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
                  errorType = "isBoundary()";
+                currentBreakData = isBoundaryBreaks;
              } else if (followingBreaks[i] != expectedBreaks[i]) {
                  errorType = "following()";
+                currentBreakData = followingBreaks;
              } else if (precedingBreaks[i] != expectedBreaks[i]) {
                  errorType = "preceding()";
+                currentBreakData = precedingBreaks;
              }
  
-
              if (errorType != NULL) {
                  // Format a range of the test text that includes the failure as
                  //  a data item that can be included in the rbbi test data file.
  
                  // Start of the range is the last point where expected and actual results
-                //   both agreed that there was a break position.
+                //  both agreed that there was a break position.
+
                  int startContext = i;
                  int32_t count = 0;
                  for (;;) {
@@ -4069,53 +4257,62 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
                      }
                  }
  
-                // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
-                UnicodeString errorText = "<data>";
-                /***if (strcmp(errorType, "next()") == 0) {
-                    startContext = 0;
-                    endContext = testText.length();
+                // Formatting of each line includes:
+                //   character code
+                //   reference break: '|' -> a break, '.' -> no break
+                //   actual break:    '|' -> a break, '.' -> no break
+                //   (name of character class)
+                //   Unicode name of character
+                //   '-->' indicates location of the difference.
  
-                    printStringBreaks(testText, expected, expectedCount);
-                }***/
+                MONKEY_ERROR(
+                    (expectedBreaks[i] ? "Break expected but not found" :
+                       "Break found but not expected"),
+                    name, i, seed);
  
-                for (ci=startContext; ci<endContext;) {
-                    UnicodeString hexChars("0123456789abcdef");
+                for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
                      UChar32  c;
-                    int      bn;
                      c = testText.char32At(ci);
+
+                    std::string currentLineFlag = "   ";
                      if (ci == i) {
-                        // This is the location of the error.
-                        errorText.append("<?>");
-                    } else if (expectedBreaks[ci] != 0) {
-                        // This a non-error expected break position.
-                        errorText.append("\\");
+                        currentLineFlag = "-->";  // Error position
                      }
-                    if (c < 0x10000) {
-                        errorText.append("\\u");
-                        for (bn=12; bn>=0; bn-=4) {
-                            errorText.append(hexChars.charAt((c>>bn)&0xf));
-                        }
+
+                    // BMP or SMP character in hex
+                    char hexCodePoint[12];
+                    std::string format = "    \\u%04x";
+                    if (c >= 0x10000) {
+                        format = "\\U%08x";
+                    }
+                    sprintf(hexCodePoint, format.c_str(), c);
+
+                    // Get the class name and character name for the character.
+                    char cName[200];
+                    UErrorCode status = U_ZERO_ERROR;
+                    u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
+
+                    char buffer[200];
+                    snprintf(buffer, 200,
+                             "%4s %3i :  %1s  %1s  %10s  %-*s  %-40s  %-40s",
+                             currentLineFlag.c_str(),
+                             ci,
+                             expectedBreaks[ci] == 0 ? "." : "|",  // Reference break
+                             currentBreakData[ci] == 0 ? "." : "|",  // Actual break
+                             hexCodePoint,
+                             classNameSize,
+                             mk.classNameFromCodepoint(c).c_str(),
+                             mk.getAppliedRule(ci).c_str(), cName);
+
+                    // Output the error
+                    if (ci == i) {
+                        errln(buffer);
                      } else {
-                        errorText.append("\\U");
-                        for (bn=28; bn>=0; bn-=4) {
-                            errorText.append(hexChars.charAt((c>>bn)&0xf));
-                        }
+                        infoln(buffer);
                      }
-                    ci = testText.moveIndex32(ci, 1);
+
+                    if (ci >= endContext) { break; }
                  }
-                errorText.append("\\");
-                errorText.append("</data>\n");
-
-                // Output the error
-                char  charErrorTxt[500];
-                UErrorCode status = U_ZERO_ERROR;
-                errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
-                charErrorTxt[sizeof(charErrorTxt)-1] = 0;
-                const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
-
-                errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
-                    name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
-                    errorType, seed, i, charErrorTxt);
                  break;
              }
          }
@@ -4397,11 +4594,11 @@ void RBBITest::TestBug12519() {
      assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
      assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
  
-    LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone());
+    LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
      assertTrue(WHERE, *biEn == *cloneEn);
      assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
  
-    LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone());
+    LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
      assertTrue(WHERE, *biFr == *cloneFr);
      assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));