X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/a01113dcd0f39d5da295ef82785beff9ed86fe38..340931cb2e044a2141d11567dd0f782524e32994:/icuSources/test/intltest/rbbitst.cpp?ds=inline diff --git a/icuSources/test/intltest/rbbitst.cpp b/icuSources/test/intltest/rbbitst.cpp index ebd12364..7bb8d478 100644 --- a/icuSources/test/intltest/rbbitst.cpp +++ b/icuSources/test/intltest/rbbitst.cpp @@ -46,16 +46,31 @@ #include "uvector.h" #include "uvectr32.h" +// Needed for Apple perf tests +#include +#include + #if !UCONFIG_NO_FILTERED_BREAK_ITERATION #include "unicode/filteredbrk.h" #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION -#define TEST_ASSERT(x) {if (!(x)) { \ - errln("Failure in file %s, line %d", __FILE__, __LINE__);}} - -#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ - errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} +#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \ + if (!(x)) { \ + errln("Failure in file %s, line %d", __FILE__, __LINE__); \ + } \ +} UPRV_BLOCK_MACRO_END + +#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \ + if (U_FAILURE(errcode)) { \ + errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \ + } \ +} UPRV_BLOCK_MACRO_END + +#define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \ + IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \ + __FILE__, __LINE__, msg, index, fRuleFileName, seed); \ +} //--------------------------------------------- // runIndexedTest @@ -736,7 +751,13 @@ void RBBITest::TestExtended() { int32_t tagValue = 0; // The numeric value of a tag. UnicodeString rules; // Holds rules from a ... block - int32_t rulesFirstLine; // Line number of the start of current block + int32_t rulesFirstLine = 0; // Line number of the start of current block + + // + mach_timebase_info_data_t info; + uint64_t start, durationOpen = 0.0, durationUse = 0.0; + mach_timebase_info(&info); + UBool isLine = FALSE; for (charIdx = 0; charIdx < len; ) { status = U_ZERO_ERROR; @@ -775,6 +796,7 @@ void RBBITest::TestExtended() { tp.bi = BreakIterator::createWordInstance(locale, status); skipTest = false; charIdx += 5; + isLine = FALSE; break; } if (testString.compare(charIdx-1, 6, u"") == 0) { @@ -782,13 +804,17 @@ void RBBITest::TestExtended() { tp.bi = BreakIterator::createCharacterInstance(locale, status); skipTest = false; charIdx += 5; + isLine = FALSE; break; } if (testString.compare(charIdx-1, 6, u"") == 0) { delete tp.bi; + start = mach_absolute_time(); // tp.bi = BreakIterator::createLineInstance(locale, status); + durationOpen += (((mach_absolute_time() - start) * info.numer)/info.denom); skipTest = false; charIdx += 5; + isLine = TRUE; break; } if (testString.compare(charIdx-1, 6, u"") == 0) { @@ -802,6 +828,7 @@ void RBBITest::TestExtended() { delete tp.bi; tp.bi = BreakIterator::createTitleInstance(locale, status); charIdx += 6; + isLine = FALSE; break; } @@ -811,6 +838,7 @@ void RBBITest::TestExtended() { parseState = PARSE_RULES; rules.remove(); rulesFirstLine = lineNum; + isLine = FALSE; break; } @@ -895,7 +923,11 @@ void RBBITest::TestExtended() { // RUN THE TEST! status = U_ZERO_ERROR; tp.setUTF16(status); + start = mach_absolute_time(); // executeTest(&tp, status); + if (isLine) { + durationUse += (((mach_absolute_time() - start) * info.numer)/info.denom); + } TEST_ASSERT_SUCCESS(status); // Run again, this time with UTF-8 text wrapped in a UText. @@ -1078,6 +1110,10 @@ void RBBITest::TestExtended() { errln("rbbitst.txt:%d block not closed.", lineNum); } + // + infoln("TestExtended total time in createLineInstance (nsec):\t%llu\n", durationOpen); + infoln("TestExtended total time in linebreak test execute (nsec):\t%llu\n", durationUse); + end_test: delete [] testFile; @@ -1237,7 +1273,7 @@ cleanUpAndReturn: delete []retPtr; retPtr = 0; ulen = 0; - }; + } return retPtr; } @@ -1398,14 +1434,14 @@ void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator * // int spin = 0; while (tokenMatcher.find()) { - if(tokenMatcher.hitEnd()) { + if(tokenMatcher.hitEnd()) { /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for. This occurred when the text file was corrupt (wasn't marked as UTF-8) and caused an infinite loop here on EBCDIC systems! */ fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin); - // return; - } + // return; + } if (tokenMatcher.start(1, status) >= 0) { // Scanned a divide sign, indicating a break position in the test data. if (testString.length()>0) { @@ -1546,14 +1582,30 @@ public: // Return -1 after reaching end of string. virtual int32_t next(int32_t i) = 0; + // Name of each character class, parallel with charClasses. Used for debugging output + // of characters. + virtual std::vector& characterClassNames(); + + void setAppliedRule(int32_t position, const char* value); + + std::string getAppliedRule(int32_t position); + virtual ~RBBIMonkeyKind(); - UErrorCode deferredStatus; + UErrorCode deferredStatus; + std::string classNameFromCodepoint(const UChar32 c); + unsigned int maxClassNameSize(); -protected: - RBBIMonkeyKind(); + protected: + RBBIMonkeyKind(); + std::vector classNames; + std::vector appliedRules; + + // Clear `appliedRules` and fill it with empty strings in the size of test text. + void prepareAppliedRules(int32_t size ); + + private: -private: }; RBBIMonkeyKind::RBBIMonkeyKind() { @@ -1563,6 +1615,45 @@ RBBIMonkeyKind::RBBIMonkeyKind() { RBBIMonkeyKind::~RBBIMonkeyKind() { } +std::vector& RBBIMonkeyKind::characterClassNames() { + return classNames; +} + +void RBBIMonkeyKind::prepareAppliedRules(int32_t size) { + // Remove all the information in the `appliedRules`. + appliedRules.clear(); + appliedRules.resize(size + 1); +} + +void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) { + appliedRules[position] = value; +} + +std::string RBBIMonkeyKind::getAppliedRule(int32_t position){ + return appliedRules[position]; +} + +std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) { + // Simply iterate through charClasses to find character's class + for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) { + UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum); + if (classSet->contains(c)) { + return classNames[aClassNum]; + } + } + U_ASSERT(FALSE); // This should not happen. + return "bad class name"; +} + +unsigned int RBBIMonkeyKind::maxClassNameSize() { + unsigned int maxSize = 0; + for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) { + if (classNames[aClassNum].size() > maxSize) { + maxSize = classNames[aClassNum].size(); + } + } + return maxSize; +} //---------------------------------------------------------------------------------------- // @@ -1611,6 +1702,9 @@ private: UnicodeSet *fLVTSet; UnicodeSet *fHangulSet; UnicodeSet *fExtendedPictSet; + UnicodeSet *fViramaSet; + UnicodeSet *fLinkingConsonantSet; + UnicodeSet *fExtCccZwjSet; UnicodeSet *fAnySet; const UnicodeString *fText; @@ -1643,21 +1737,34 @@ RBBICharMonkey::RBBICharMonkey() { fHangulSet->addAll(*fLVTSet); fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status); + fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&" + "\\p{Indic_Syllabic_Category=Virama}]", status); + fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&" + "\\p{Indic_Syllabic_Category=Consonant}]", status); + fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status); fAnySet = new UnicodeSet(0, 0x10ffff); + // Create sets of characters, and add the names of the above character sets. + // In each new ICU release, add new names corresponding to the sets above. fSets = new UVector(status); - fSets->addElement(fCRLFSet, status); - fSets->addElement(fControlSet, status); - fSets->addElement(fExtendSet, status); - fSets->addElement(fRegionalIndicatorSet, status); + + // Important: Keep class names the same as the class contents. + fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF"); + fSets->addElement(fControlSet, status); classNames.push_back("Control"); + fSets->addElement(fExtendSet, status); classNames.push_back("Extended"); + fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator"); if (!fPrependSet->isEmpty()) { - fSets->addElement(fPrependSet, status); + fSets->addElement(fPrependSet, status); classNames.push_back("Prepend"); } - fSets->addElement(fSpacingSet, status); - fSets->addElement(fHangulSet, status); - fSets->addElement(fAnySet, status); - fSets->addElement(fZWJSet, status); - fSets->addElement(fExtendedPictSet, status); + fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing"); + fSets->addElement(fHangulSet, status); classNames.push_back("Hangul"); + fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ"); + fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict"); + fSets->addElement(fViramaSet, status); classNames.push_back("Virama"); + fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant"); + fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj"); + fSets->addElement(fAnySet, status); classNames.push_back("Any"); + if (U_FAILURE(status)) { deferredStatus = status; } @@ -1666,6 +1773,7 @@ RBBICharMonkey::RBBICharMonkey() { void RBBICharMonkey::setText(const UnicodeString &s) { fText = &s; + prepareAppliedRules(s.length()); } @@ -1688,6 +1796,7 @@ int32_t RBBICharMonkey::next(int32_t prevPos) { if (prevPos >= fText->length()) { return -1; } + p0 = p1 = p2 = p3 = prevPos; c3 = fText->char32At(prevPos); c0 = c1 = c2 = cBase = 0; @@ -1701,7 +1810,7 @@ int32_t RBBICharMonkey::next(int32_t prevPos) { p1 = p2; c1 = c2; p2 = p3; c2 = c3; - // Advancd p3 by one codepoint + // Advance p3 by one codepoint p3 = fText->moveIndex32(p3, 1); c3 = fText->char32At(p3); @@ -1709,93 +1818,109 @@ int32_t RBBICharMonkey::next(int32_t prevPos) { // Still warming up the loop. (won't work with zero length strings, but we don't care) continue; } + if (p2 == fText->length()) { - // Reached end of string. Always a break position. + setAppliedRule(p2, "End of String"); break; } - // Rule GB3 CR x LF // No Extend or Format characters may appear between the CR and LF, // which requires the additional check for p2 immediately following p1. // if (c1==0x0D && c2==0x0A && p1==(p2-1)) { - continue; + setAppliedRule(p2, "GB3 CR x LF"); + continue; } - // Rule (GB4). ( Control | CR | LF ) if (fControlSet->contains(c1) || c1 == 0x0D || c1 == 0x0A) { - break; + setAppliedRule(p2, "GB4 ( Control | CR | LF ) "); + break; } - // Rule (GB5) ( Control | CR | LF ) - // if (fControlSet->contains(c2) || c2 == 0x0D || c2 == 0x0A) { + setAppliedRule(p2, "GB5 ( Control | CR | LF )"); break; } - - // Rule (GB6) L x ( L | V | LV | LVT ) if (fLSet->contains(c1) && (fLSet->contains(c2) || fVSet->contains(c2) || fLVSet->contains(c2) || fLVTSet->contains(c2))) { + setAppliedRule(p2, "GB6 L x ( L | V | LV | LVT )"); continue; } - // Rule (GB7) ( LV | V ) x ( V | T ) if ((fLVSet->contains(c1) || fVSet->contains(c1)) && (fVSet->contains(c2) || fTSet->contains(c2))) { + setAppliedRule(p2, "GB7 ( LV | V ) x ( V | T )"); continue; } - // Rule (GB8) ( LVT | T) x T if ((fLVTSet->contains(c1) || fTSet->contains(c1)) && fTSet->contains(c2)) { + setAppliedRule(p2, "GB8 ( LVT | T) x T"); continue; } - // Rule (GB9) x (Extend | ZWJ) if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) { if (!fExtendSet->contains(c1)) { cBase = c1; } + setAppliedRule(p2, "GB9 x (Extend | ZWJ)"); continue; } - // Rule (GB9a) x SpacingMark if (fSpacingSet->contains(c2)) { + setAppliedRule(p2, "GB9a x SpacingMark"); continue; } - // Rule (GB9b) Prepend x if (fPrependSet->contains(c1)) { + setAppliedRule(p2, "GB9b Prepend x"); continue; } - // Rule (GB11) Extended_Pictographic Extend * ZWJ x Extended_Pictographic + // Note: Viramas are also included in the ExtCccZwj class. + if (fLinkingConsonantSet->contains(c2)) { + int pi = p1; + bool sawVirama = false; + while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) { + if (fViramaSet->contains(fText->char32At(pi))) { + sawVirama = true; + } + pi = fText->moveIndex32(pi, -1); + } + if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) { + setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant"); + continue; + } + } + if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) { - continue; + setAppliedRule(p2, "GB11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic"); + continue; } - // Rule (GB12-13) Regional_Indicator x Regional_Indicator // Note: The first if condition is a little tricky. We only need to force // a break if there are three or more contiguous RIs. If there are // only two, a break following will occur via other rules, and will include // any trailing extend characters, which is needed behavior. if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { - break; + setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator"); + break; } if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { - continue; + setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator"); + continue; } - // Rule (GB999) Any Any + setAppliedRule(p2, "GB999 Any Any"); break; } @@ -1809,7 +1934,6 @@ UVector *RBBICharMonkey::charClasses() { return fSets; } - RBBICharMonkey::~RBBICharMonkey() { delete fSets; delete fCRLFSet; @@ -1827,6 +1951,9 @@ RBBICharMonkey::~RBBICharMonkey() { delete fAnySet; delete fZWJSet; delete fExtendedPictSet; + delete fViramaSet; + delete fLinkingConsonantSet; + delete fExtCccZwjSet; } //------------------------------------------------------------------------------------------ @@ -1889,10 +2016,15 @@ RBBIWordMonkey::RBBIWordMonkey() fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status); fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]", status); fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status); - fNumericSet = new UnicodeSet(u"[[\\p{Word_Break = Numeric}][\\uff10-\\uff19]]", status); + fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status); fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status); fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status); - fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}]", status); + // There are some sc=Hani characters with WB=Extend. + // The break rules need to pick one or the other because + // Extend overlapping with something else is messy. + // For Unicode 13, we chose to keep U+16FF0 & U+16FF1 + // in $Han (for $dictionary) and out of $Extend. + fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status); fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status); fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status); @@ -1934,29 +2066,32 @@ RBBIWordMonkey::RBBIWordMonkey() // Inhibit dictionary characters from being tested at all. fOtherSet->removeAll(*fDictionarySet); - fSets->addElement(fCRSet, status); - fSets->addElement(fLFSet, status); - fSets->addElement(fNewlineSet, status); - fSets->addElement(fRegionalIndicatorSet, status); - fSets->addElement(fHebrew_LetterSet, status); - fSets->addElement(fALetterSet, status); - fSets->addElement(fSingle_QuoteSet, status); - fSets->addElement(fDouble_QuoteSet, status); - //fSets->addElement(fKatakanaSet, status); // Omit Katakana from fSets, which omits Katakana characters - // from the test data. They are all in the dictionary set, - // which this (old, to be retired) monkey test cannot handle. - fSets->addElement(fMidLetterSet, status); - fSets->addElement(fMidNumLetSet, status); - fSets->addElement(fMidNumSet, status); - fSets->addElement(fNumericSet, status); - fSets->addElement(fFormatSet, status); - fSets->addElement(fExtendSet, status); - fSets->addElement(fOtherSet, status); - fSets->addElement(fExtendNumLetSet, status); - fSets->addElement(fWSegSpaceSet, status); - - fSets->addElement(fZWJSet, status); - fSets->addElement(fExtendedPictSet, status); + // Add classes and their names + fSets->addElement(fCRSet, status); classNames.push_back("CR"); + fSets->addElement(fLFSet, status); classNames.push_back("LF"); + fSets->addElement(fNewlineSet, status); classNames.push_back("Newline"); + fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator"); + fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew"); + fSets->addElement(fALetterSet, status); classNames.push_back("ALetter"); + fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote"); + fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote"); + // Omit Katakana from fSets, which omits Katakana characters + // from the test data. They are all in the dictionary set, + // which this (old, to be retired) monkey test cannot handle. + //fSets->addElement(fKatakanaSet, status); + + fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter"); + fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet"); + fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum"); + fSets->addElement(fNumericSet, status); classNames.push_back("Numeric"); + fSets->addElement(fFormatSet, status); classNames.push_back("Format"); + fSets->addElement(fExtendSet, status); classNames.push_back("Extend"); + fSets->addElement(fOtherSet, status); classNames.push_back("Other"); + fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet"); + fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace"); + + fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ"); + fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict"); if (U_FAILURE(status)) { deferredStatus = status; @@ -1965,6 +2100,7 @@ RBBIWordMonkey::RBBIWordMonkey() void RBBIWordMonkey::setText(const UnicodeString &s) { fText = &s; + prepareAppliedRules(s.length()); } @@ -1997,14 +2133,14 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) { p1 = p2; c1 = c2; p2 = p3; c2 = c3; - // Advancd p3 by X(Extend | Format)* Rule 4 + // Advance p3 by X(Extend | Format)* Rule 4 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change) do { p3 = fText->moveIndex32(p3, 1); c3 = fText->char32At(p3); if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { break; - }; + } } while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3)); @@ -2013,141 +2149,145 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) { // Still warming up the loop. (won't work with zero length strings, but we don't care) continue; } + if (p2 == fText->length()) { // Reached end of string. Always a break position. break; } - // Rule (3) CR x LF // No Extend or Format characters may appear between the CR and LF, // which requires the additional check for p2 immediately following p1. // if (c1==0x0D && c2==0x0A) { - continue; + setAppliedRule(p2, "WB3 CR x LF"); + continue; } - // Rule (3a) Break before and after newlines (including CR and LF) - // if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { + setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)"); break; - }; + } if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { + setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)"); break; - }; + } - // Rule (3c) ZWJ x Extended_Pictographic // Not ignoring extend chars, so peek into input text to // get the potential ZWJ, the character immediately preceding c2. // Sloppy UChar32 indexing: p2-1 may reference trail half // but char32At will get the full code point. - if (fZWJSet->contains(fText->char32At(p2-1)) && fExtendedPictSet->contains(c2)) { + if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){ + setAppliedRule(p2, "WB3c ZWJ x Extended_Pictographic"); continue; } - // Rule (3d) Keep horizontal whitespace together. if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) { + setAppliedRule(p2, "WB3d Keep horizontal whitespace together."); continue; } - // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter) if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { + setAppliedRule(p2, "WB4 (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)"); continue; } - // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter) - // if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) { + setAppliedRule(p2, + "WB6 (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)"); continue; } - // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter) if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) && (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { + setAppliedRule(p2, + "WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)"); continue; } - // Rule (7a) Hebrew_Letter x Single_Quote if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) { + setAppliedRule(p2, "WB7a Hebrew_Letter x Single_Quote"); continue; } - // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter - if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) { + if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) { + setAppliedRule(p2, "WB7b Hebrew_Letter x Double_Quote Hebrew_Letter"); continue; } - // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) { + setAppliedRule(p2, "WB7c Hebrew_Letter Double_Quote x Hebrew_Letter"); continue; } - // Rule (8) Numeric x Numeric if (fNumericSet->contains(c1) && - fNumericSet->contains(c2)) { + fNumericSet->contains(c2)) { + setAppliedRule(p2, "WB8 Numeric x Numeric"); continue; } - // Rule (9) (ALetter | Hebrew_Letter) x Numeric if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && - fNumericSet->contains(c2)) { + fNumericSet->contains(c2)) { + setAppliedRule(p2, "WB9 (ALetter | Hebrew_Letter) x Numeric"); continue; } - // Rule (10) Numeric x (ALetter | Hebrew_Letter) if (fNumericSet->contains(c1) && (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { + setAppliedRule(p2, "WB10 Numeric x (ALetter | Hebrew_Letter)"); continue; } - // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric - if (fNumericSet->contains(c0) && + if (fNumericSet->contains(c0) && (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && fNumericSet->contains(c2)) { + setAppliedRule(p2, "WB11 Numeric (MidNum | MidNumLet | Single_Quote) x Numeric"); continue; } - // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric if (fNumericSet->contains(c1) && (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && fNumericSet->contains(c3)) { + setAppliedRule(p2, "WB12 Numeric x (MidNum | MidNumLet | SingleQuote) Numeric"); continue; } - // Rule (13) Katakana x Katakana // Note: matches UAX 29 rules, but doesn't come into play for ICU because // all Katakana are handled by the dictionary breaker. if (fKatakanaSet->contains(c1) && fKatakanaSet->contains(c2)) { + setAppliedRule(p2, "WB13 Katakana x Katakana"); continue; } - // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) || fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && fExtendNumLetSet->contains(c2)) { - continue; + setAppliedRule(p2, + "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet"); + continue; } - // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana) if (fExtendNumLetSet->contains(c1) && (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) || fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) { + setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)"); continue; } - // Rule 15 - 17 Group pairs of Regional Indicators. if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) { + setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators."); break; } if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { + setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators."); continue; } - // Rule 999. Break found here. + setAppliedRule(p2, "WB999"); break; } @@ -2160,7 +2300,6 @@ UVector *RBBIWordMonkey::charClasses() { return fSets; } - RBBIWordMonkey::~RBBIWordMonkey() { delete fSets; delete fCRSet; @@ -2224,7 +2363,6 @@ private: UnicodeSet *fExtendSet; const UnicodeString *fText; - }; RBBISentMonkey::RBBISentMonkey() @@ -2269,19 +2407,19 @@ RBBISentMonkey::RBBISentMonkey() fOtherSet->removeAll(*fCloseSet); fOtherSet->removeAll(*fExtendSet); - fSets->addElement(fSepSet, status); - fSets->addElement(fFormatSet, status); - fSets->addElement(fSpSet, status); - fSets->addElement(fLowerSet, status); - fSets->addElement(fUpperSet, status); - fSets->addElement(fOLetterSet, status); - fSets->addElement(fNumericSet, status); - fSets->addElement(fATermSet, status); - fSets->addElement(fSContinueSet, status); - fSets->addElement(fSTermSet, status); - fSets->addElement(fCloseSet, status); - fSets->addElement(fOtherSet, status); - fSets->addElement(fExtendSet, status); + fSets->addElement(fSepSet, status); classNames.push_back("Sep"); + fSets->addElement(fFormatSet, status); classNames.push_back("Format"); + fSets->addElement(fSpSet, status); classNames.push_back("Sp"); + fSets->addElement(fLowerSet, status); classNames.push_back("Lower"); + fSets->addElement(fUpperSet, status); classNames.push_back("Upper"); + fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter"); + fSets->addElement(fNumericSet, status); classNames.push_back("Numeric"); + fSets->addElement(fATermSet, status); classNames.push_back("ATerm"); + fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue"); + fSets->addElement(fSTermSet, status); classNames.push_back("STerm"); + fSets->addElement(fCloseSet, status); classNames.push_back("Close"); + fSets->addElement(fOtherSet, status); classNames.push_back("Other"); + fSets->addElement(fExtendSet, status); classNames.push_back("Extend"); if (U_FAILURE(status)) { deferredStatus = status; @@ -2292,13 +2430,13 @@ RBBISentMonkey::RBBISentMonkey() void RBBISentMonkey::setText(const UnicodeString &s) { fText = &s; + prepareAppliedRules(s.length()); } UVector *RBBISentMonkey::charClasses() { return fSets; } - // moveBack() Find the "significant" code point preceding the index i. // Skips over ($Extend | $Format)* . // @@ -2370,43 +2508,45 @@ int32_t RBBISentMonkey::next(int32_t prevPos) { p1 = p2; c1 = c2; p2 = p3; c2 = c3; - // Advancd p3 by X(Extend | Format)* Rule 4 + // Advance p3 by X(Extend | Format)* Rule 4 p3 = moveForward(p3); c3 = cAt(p3); - // Rule (3) CR x LF if (c1==0x0d && c2==0x0a && p2==(p1+1)) { + setAppliedRule(p2, "SB3 CR x LF"); continue; } - // Rule (4). Sep if (fSepSet->contains(c1)) { p2 = p1+1; // Separators don't combine with Extend or Format. + + setAppliedRule(p2, "SB4 Sep "); break; } if (p2 >= fText->length()) { // Reached end of string. Always a break position. + setAppliedRule(p2, "SB4 Sep "); break; } if (p2 == prevPos) { // Still warming up the loop. (won't work with zero length strings, but we don't care) + setAppliedRule(p2, "SB4 Sep "); continue; } - // Rule (6). ATerm x Numeric if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { + setAppliedRule(p2, "SB6 ATerm x Numeric"); continue; } - // Rule (7). (Upper | Lower) ATerm x Uppper - if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) && + if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) && fATermSet->contains(c1) && fUpperSet->contains(c2)) { + setAppliedRule(p2, "SB7 (Upper | Lower) ATerm x Uppper"); continue; } - // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower // Note: STerm | ATerm are added to the negated part of the expression by a // note to the Unicode 5.0 documents. int p8 = p1; @@ -2423,16 +2563,21 @@ int32_t RBBISentMonkey::next(int32_t prevPos) { if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || fLowerSet->contains(c) || fSepSet->contains(c) || fATermSet->contains(c) || fSTermSet->contains(c)) { + + setAppliedRule(p2, + "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* "); break; } p8 = moveForward(p8); } if (fLowerSet->contains(cAt(p8))) { + + setAppliedRule(p2, + "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* "); continue; } } - // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { p8 = p1; while (fSpSet->contains(cAt(p8))) { @@ -2443,11 +2588,11 @@ int32_t RBBISentMonkey::next(int32_t prevPos) { } c = cAt(p8); if (fSTermSet->contains(c) || fATermSet->contains(c)) { + setAppliedRule(p2, "SB8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)"); continue; } } - // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) int p9 = p1; while (fCloseSet->contains(cAt(p9))) { p9 = moveBack(p9); @@ -2455,11 +2600,12 @@ int32_t RBBISentMonkey::next(int32_t prevPos) { c = cAt(p9); if ((fSTermSet->contains(c) || fATermSet->contains(c))) { if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { + + setAppliedRule(p2, "SB9 (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)"); continue; } } - // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) int p10 = p1; while (fSpSet->contains(cAt(p10))) { p10 = moveBack(p10); @@ -2469,11 +2615,11 @@ int32_t RBBISentMonkey::next(int32_t prevPos) { } if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { if (fSpSet->contains(c2) || fSepSet->contains(c2)) { + setAppliedRule(p2, "SB10 (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)"); continue; } } - // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? int p11 = p1; if (fSepSet->contains(cAt(p11))) { p11 = moveBack(p11); @@ -2485,12 +2631,14 @@ int32_t RBBISentMonkey::next(int32_t prevPos) { p11 = moveBack(p11); } if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { + setAppliedRule(p2, "SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? "); break; } - // Rule (12) Any x Any + setAppliedRule(p2, "SB12 Any x Any"); continue; } + breakPos = p2; return breakPos; } @@ -2574,6 +2722,8 @@ private: UnicodeSet *fEB; UnicodeSet *fEM; UnicodeSet *fZWJ; + UnicodeSet *fOP30; + UnicodeSet *fCP30; BreakIterator *fCharBI; const UnicodeString *fText; @@ -2614,7 +2764,7 @@ RBBILineMonkey::RBBILineMonkey() : fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); - fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status); + fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=CL}] [\\u201D]]"), status); // en adjustments for rdar://problem/51193810 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status); fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); @@ -2622,8 +2772,8 @@ RBBILineMonkey::RBBILineMonkey() : fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); - fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); - fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); + fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=OP}] [\\u201C\\u2018]]"), status); // en adjustments + fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=QU}]-[\\u201C\\u2018\\u201D]]"), status); // en adjustments fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); @@ -2639,7 +2789,9 @@ RBBILineMonkey::RBBILineMonkey() : fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status); fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status); - fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status); + fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status); + fOP30 = new UnicodeSet(u"[[\\p{Line_break=OP} [\\u201C\\u2018]]-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status); // en adjustments + fCP30 = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status); if (U_FAILURE(status)) { deferredStatus = status; @@ -2655,48 +2807,51 @@ RBBILineMonkey::RBBILineMonkey() : fHH->add(u'\u2010'); // Hyphen, '‐' - fSets->addElement(fBK, status); - fSets->addElement(fCR, status); - fSets->addElement(fLF, status); - fSets->addElement(fCM, status); - fSets->addElement(fNL, status); - fSets->addElement(fWJ, status); - fSets->addElement(fZW, status); - fSets->addElement(fGL, status); - fSets->addElement(fCB, status); - fSets->addElement(fSP, status); - fSets->addElement(fB2, status); - fSets->addElement(fBA, status); - fSets->addElement(fBB, status); - fSets->addElement(fHY, status); - fSets->addElement(fH2, status); - fSets->addElement(fH3, status); - fSets->addElement(fCL, status); - fSets->addElement(fCP, status); - fSets->addElement(fEX, status); - fSets->addElement(fIN, status); - fSets->addElement(fJL, status); - fSets->addElement(fJT, status); - fSets->addElement(fJV, status); - fSets->addElement(fNS, status); - fSets->addElement(fOP, status); - fSets->addElement(fQU, status); - fSets->addElement(fIS, status); - fSets->addElement(fNU, status); - fSets->addElement(fPO, status); - fSets->addElement(fPR, status); - fSets->addElement(fSY, status); - fSets->addElement(fAI, status); - fSets->addElement(fAL, status); - fSets->addElement(fHL, status); - fSets->addElement(fID, status); - fSets->addElement(fWJ, status); - fSets->addElement(fRI, status); - fSets->addElement(fSG, status); - fSets->addElement(fEB, status); - fSets->addElement(fEM, status); - fSets->addElement(fZWJ, status); - + // Sets and names. + fSets->addElement(fBK, status); classNames.push_back("fBK"); + fSets->addElement(fCR, status); classNames.push_back("fCR"); + fSets->addElement(fLF, status); classNames.push_back("fLF"); + fSets->addElement(fCM, status); classNames.push_back("fCM"); + fSets->addElement(fNL, status); classNames.push_back("fNL"); + fSets->addElement(fWJ, status); classNames.push_back("fWJ"); + fSets->addElement(fZW, status); classNames.push_back("fZW"); + fSets->addElement(fGL, status); classNames.push_back("fGL"); + fSets->addElement(fCB, status); classNames.push_back("fCB"); + fSets->addElement(fSP, status); classNames.push_back("fSP"); + fSets->addElement(fB2, status); classNames.push_back("fB2"); + fSets->addElement(fBA, status); classNames.push_back("fBA"); + fSets->addElement(fBB, status); classNames.push_back("fBB"); + fSets->addElement(fHY, status); classNames.push_back("fHY"); + fSets->addElement(fH2, status); classNames.push_back("fH2"); + fSets->addElement(fH3, status); classNames.push_back("fH3"); + fSets->addElement(fCL, status); classNames.push_back("fCL"); + fSets->addElement(fCP, status); classNames.push_back("fCP"); + fSets->addElement(fEX, status); classNames.push_back("fEX"); + fSets->addElement(fIN, status); classNames.push_back("fIN"); + fSets->addElement(fJL, status); classNames.push_back("fJL"); + fSets->addElement(fJT, status); classNames.push_back("fJT"); + fSets->addElement(fJV, status); classNames.push_back("fJV"); + fSets->addElement(fNS, status); classNames.push_back("fNS"); + fSets->addElement(fOP, status); classNames.push_back("fOP"); + fSets->addElement(fQU, status); classNames.push_back("fQU"); + fSets->addElement(fIS, status); classNames.push_back("fIS"); + fSets->addElement(fNU, status); classNames.push_back("fNU"); + fSets->addElement(fPO, status); classNames.push_back("fPO"); + fSets->addElement(fPR, status); classNames.push_back("fPR"); + fSets->addElement(fSY, status); classNames.push_back("fSY"); + fSets->addElement(fAI, status); classNames.push_back("fAI"); + fSets->addElement(fAL, status); classNames.push_back("fAL"); + fSets->addElement(fHL, status); classNames.push_back("fHL"); + fSets->addElement(fID, status); classNames.push_back("fID"); + fSets->addElement(fWJ, status); classNames.push_back("fWJ"); + fSets->addElement(fRI, status); classNames.push_back("fRI"); + fSets->addElement(fSG, status); classNames.push_back("fSG"); + fSets->addElement(fEB, status); classNames.push_back("fEB"); + fSets->addElement(fEM, status); classNames.push_back("fEM"); + fSets->addElement(fZWJ, status); classNames.push_back("fZWJ"); + // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented. + fSets->addElement(fOP30, status); classNames.push_back("fOP30"); + fSets->addElement(fCP30, status); classNames.push_back("fCP30"); const char *rules = "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?" @@ -2715,12 +2870,14 @@ RBBILineMonkey::RBBILineMonkey() : if (U_FAILURE(status)) { deferredStatus = status; } + } void RBBILineMonkey::setText(const UnicodeString &s) { fText = &s; fCharBI->setText(s); + prepareAppliedRules(s.length()); fNumberMatcher->reset(s); } @@ -2744,8 +2901,8 @@ void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos int32_t nPos = *nextPos; // LB 9 Keep combining sequences together. - // advance over any CM class chars. Note that Line Break CM is different - // from the normal Grapheme Extend property. + // advance over any CM class chars. Note that Line Break CM is different + // from the normal Grapheme Extend property. if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d || *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) { for (;;) { @@ -2827,59 +2984,67 @@ int32_t RBBILineMonkey::next(int32_t startPos) { nextCPPos = fText->moveIndex32(pos, 1); nextPos = nextCPPos; - // Rule LB2 - Break at end of text. + if (pos >= fText->length()) { + setAppliedRule(pos, "LB2 - Break at end of text."); break; } - // Rule LB 9 - adjust for combining sequences. + // We do this one out-of-order because the adjustment does not change anything // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to // be applied. - rule9Adjust(prevPos, &prevChar, &pos, &thisChar); + rule9Adjust(prevPos, &prevChar, &pos, &thisChar); nextCPPos = nextPos = fText->moveIndex32(pos, 1); c = fText->char32At(nextPos); - rule9Adjust(pos, &thisChar, &nextPos, &c); + rule9Adjust(pos, &thisChar, &nextPos, &c); // If the loop is still warming up - if we haven't shifted the initial // -1 positions out of prevPos yet - loop back to advance the // position in the input without any further looking for breaks. if (prevPos == -1) { + setAppliedRule(pos, "LB 9 - adjust for combining sequences."); continue; } - // LB 4 Always break after hard line breaks, + if (fBK->contains(prevChar)) { + setAppliedRule(pos, "LB 4 Always break after hard line breaks"); break; } - // LB 5 Break after CR, LF, NL, but not inside CR LF + if (prevChar == 0x0d && thisChar == 0x0a) { + setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF"); continue; } if (prevChar == 0x0d || prevChar == 0x0a || prevChar == 0x85) { + setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF"); break; } - // LB 6 Don't break before hard line breaks + if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || fBK->contains(thisChar)) { - continue; + setAppliedRule(pos, "LB 6 Don't break before hard line breaks"); + continue; } - // LB 7 Don't break before spaces or zero-width space. if (fSP->contains(thisChar)) { + setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space."); continue; } + // !!! ??? Is this the right text for the applied rule? if (fZW->contains(thisChar)) { + setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space."); continue; } - // LB 8 Break after zero width space + // ZW SP* ÷ // Scan backwards from prevChar for SP* ZW tPos = prevPos; @@ -2887,14 +3052,16 @@ int32_t RBBILineMonkey::next(int32_t startPos) { tPos = fText->moveIndex32(tPos, -1); } if (fZW->contains(fText->char32At(tPos))) { + setAppliedRule(pos, "LB 8 Break after zero width space"); break; } - // LB 25 Numbers + // Move this test up, before LB8a, because numbers can match a longer sequence that would // also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM) if (fNumberMatcher->lookingAt(prevPos, status)) { if (U_FAILURE(status)) { + setAppliedRule(pos, "LB 25 Numbers"); break; } // Matched a number. But could have been just a single digit, which would @@ -2912,11 +3079,12 @@ int32_t RBBILineMonkey::next(int32_t startPos) { thisChar = fText->char32At(pos); } while (fCM->contains(thisChar)); } + setAppliedRule(pos, "LB 25 Numbers"); continue; } } - // LB 8a ZWJ x + // The monkey test's way of ignoring combining characters doesn't work // for this rule. ZJ is also a CM. Need to get the actual character // preceding "thisChar", not ignoring combining marks, possibly ZJ. @@ -2924,46 +3092,48 @@ int32_t RBBILineMonkey::next(int32_t startPos) { int32_t prevIdx = fText->moveIndex32(pos, -1); UChar32 prevC = fText->char32At(prevIdx); if (fZWJ->contains(prevC)) { + setAppliedRule(pos, "LB 8a ZWJ x"); continue; } } - // LB 9, 10 Already done, at top of loop. + + // appliedRule: "LB 9, 10"; // Already done, at top of loop."; // - // LB 11 Do not break before or after WORD JOINER and related characters. // x WJ // WJ x // if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) { + setAppliedRule(pos, "LB 11 Do not break before or after WORD JOINER and related characters."); continue; } - // LB 12 - // GL x + if (fGL->contains(prevChar)) { + setAppliedRule(pos, "LB 12 GL x"); continue; } - // LB 12a - // [^SP BA HY] x GL - if (!(fSP->contains(prevChar) || + + if (!(fSP->contains(prevChar) || fBA->contains(prevChar) || fHY->contains(prevChar) ) && fGL->contains(thisChar)) { - continue; + setAppliedRule(pos, "LB 12a [^SP BA HY] x GL"); + continue; } - // LB 13 Don't break before closings. - // + if (fCL->contains(thisChar) || fCP->contains(thisChar) || fEX->contains(thisChar) || fSY->contains(thisChar)) { + setAppliedRule(pos, "LB 13 Don't break before closings."); continue; } - // LB 14 Don't break after OP SP* + // Scan backwards, checking for this sequence. // The OP char could include combining marks, so we actually check for // OP CM* SP* @@ -2981,26 +3151,28 @@ int32_t RBBILineMonkey::next(int32_t startPos) { tPos=fText->moveIndex32(tPos, -1); } if (fOP->contains(fText->char32At(tPos))) { + setAppliedRule(pos, "LB 14 Don't break after OP SP*"); continue; } - // LB 14a Break before an IS that begins a number and follows a space if (nextPos < fText->length()) { // note: UnicodeString::char32At(length) returns ffff, not distinguishable // from a legit ffff character. So test length separately. UChar32 nextChar = fText->char32At(nextPos); if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) { + setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space"); break; } } - // LB14b Do not break before numeric separators, even after spaces. - if (fIS->contains(thisChar)) { - continue; + + if (fIS->contains(thisChar)) { + setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces."); + continue; } - // LB 15 QU SP* x OP + if (fOP->contains(thisChar)) { // Scan backwards from prevChar to see if it is preceded by QU CM* SP* int tPos = prevPos; @@ -3011,13 +3183,12 @@ int32_t RBBILineMonkey::next(int32_t startPos) { tPos = fText->moveIndex32(tPos, -1); } if (fQU->contains(fText->char32At(tPos))) { + setAppliedRule(pos, "LB 15 QU SP* x OP"); continue; } } - - // LB 16 (CL | CP) SP* x NS // Scan backwards for SP* CM* (CL | CP) if (fNS->contains(thisChar)) { int tPos = prevPos; @@ -3028,12 +3199,12 @@ int32_t RBBILineMonkey::next(int32_t startPos) { tPos = fText->moveIndex32(tPos, -1); } if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) { + setAppliedRule(pos, "LB 16 (CL | CP) SP* x NS"); continue; } } - // LB 17 B2 SP* x B2 if (fB2->contains(thisChar)) { // Scan backwards, checking for the B2 CM* SP* sequence. tPos = prevPos; @@ -3046,163 +3217,169 @@ int32_t RBBILineMonkey::next(int32_t startPos) { tPos=fText->moveIndex32(tPos, -1); } if (fB2->contains(fText->char32At(tPos))) { + setAppliedRule(pos, "LB 17 B2 SP* x B2"); continue; } } - // LB 18 break after space if (fSP->contains(prevChar)) { + setAppliedRule(pos, "LB 18 break after space"); break; } - // LB 19 // x QU // QU x if (fQU->contains(thisChar) || fQU->contains(prevChar)) { + setAppliedRule(pos, "LB 19"); continue; } - // LB 20 Break around a CB if (fCB->contains(thisChar) || fCB->contains(prevChar)) { + setAppliedRule(pos, "LB 20 Break around a CB"); break; } - // LB 20.09 Don't break between Hyphens and letters if a break precedes the hyphen. + // Don't break between Hyphens and letters if a break precedes the hyphen. // Formerly this was a Finnish tailoring. // Moved to root in ICU 63. This is an ICU customization, not in UAX-14. - // ^($HY | $HH) $AL; + // ^($HY | $HH) $AL; if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) && prevPosX2 == -1) { + setAppliedRule(pos, "LB 20.09"); continue; } - // LB 21 if (fBA->contains(thisChar) || fHY->contains(thisChar) || fNS->contains(thisChar) || fBB->contains(prevChar) ) { + setAppliedRule(pos, "LB 21"); continue; } - // LB 21a - // HL (HY | BA) x if (fHL->contains(prevCharX2) && (fHY->contains(prevChar) || fBA->contains(prevChar))) { + setAppliedRule(pos, "LB 21a HL (HY | BA) x"); continue; } - // LB 21b - // SY x HL if (fSY->contains(prevChar) && fHL->contains(thisChar)) { + setAppliedRule(pos, "LB 21b SY x HL"); continue; } - // LB 22 - if ((fAL->contains(prevChar) && fIN->contains(thisChar)) || - (fEX->contains(prevChar) && fIN->contains(thisChar)) || - (fHL->contains(prevChar) && fIN->contains(thisChar)) || - ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) || - (fIN->contains(prevChar) && fIN->contains(thisChar)) || - (fNU->contains(prevChar) && fIN->contains(thisChar)) ) { + if (fIN->contains(thisChar)) { + setAppliedRule(pos, "LB 22"); continue; } - // LB 23 (AL | HL) x NU + // (AL | HL) x NU // NU x (AL | HL) if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) { + setAppliedRule(pos, "LB 23"); continue; } if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { + setAppliedRule(pos, "LB 23"); continue; } - // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes. + // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes. // PR x (ID | EB | EM) // (ID | EB | EM) x PO if (fPR->contains(prevChar) && (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) { + setAppliedRule(pos, "LB 23a"); continue; } if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fPO->contains(thisChar)) { + setAppliedRule(pos, "LB 23a"); continue; } - // LB 24 Do not break between prefix and letters or ideographs. + // Do not break between prefix and letters or ideographs. // (PR | PO) x (AL | HL) // (AL | HL) x (PR | PO) if ((fPR->contains(prevChar) || fPO->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { + setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs"); continue; } if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fPR->contains(thisChar) || fPO->contains(thisChar))) { + setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs"); continue; } - // LB 25 numbers match, moved up, before LB 8a, + // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a, - // LB 26 Do not break a Korean syllable. if (fJL->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) { - continue; + setAppliedRule(pos, "LB 26 Do not break a Korean syllable."); + continue; } if ((fJV->contains(prevChar) || fH2->contains(prevChar)) && (fJV->contains(thisChar) || fJT->contains(thisChar))) { - continue; + setAppliedRule(pos, "LB 26 Do not break a Korean syllable."); + continue; } if ((fJT->contains(prevChar) || fH3->contains(prevChar)) && fJT->contains(thisChar)) { - continue; + setAppliedRule(pos, "LB 26 Do not break a Korean syllable."); + continue; } - // LB 27 Treat a Korean Syllable Block the same as ID. if ((fJL->contains(prevChar) || fJV->contains(prevChar) || fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && fIN->contains(thisChar)) { - continue; + setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID."); + continue; } if ((fJL->contains(prevChar) || fJV->contains(prevChar) || fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && fPO->contains(thisChar)) { - continue; + setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID."); + continue; } if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) || fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) { - continue; + setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID."); + continue; } - // LB 28 Do not break between alphabetics ("at"). if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { + setAppliedRule(pos, "LB 28 Do not break between alphabetics (\"at\")."); continue; } - // LB 29 Do not break between numeric punctuation and alphabetics ("e.g."). - if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { - continue; + if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { + setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\")."); + continue; } - // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. // (AL | NU) x OP // CP x (AL | NU) - if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) { + if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) { + setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation."); continue; } - if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) { + if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) { + setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation."); continue; } - // LB30a RI RI ÷ RI // RI x RI if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) { + setAppliedRule(pos, "LB30a RI RI ÷ RI"); break; } if (fRI->contains(prevChar) && fRI->contains(thisChar)) { @@ -3210,17 +3387,17 @@ int32_t RBBILineMonkey::next(int32_t startPos) { // Over-write the trailing one (thisChar) to prevent it from forming another pair with a // following RI. This is a hack. thisChar = -1; + setAppliedRule(pos, "LB30a RI RI ÷ RI"); continue; } - // LB30b Emoji Base x Emoji Modifier if (fEB->contains(prevChar) && fEM->contains(thisChar)) { + setAppliedRule(pos, "LB30b Emoji Base x Emoji Modifier"); continue; } - // LB 31 Break everywhere else + setAppliedRule(pos, "LB 31 Break everywhere else"); break; - } return pos; @@ -3278,6 +3455,8 @@ RBBILineMonkey::~RBBILineMonkey() { delete fEB; delete fEM; delete fZWJ; + delete fOP30; + delete fCP30; delete fCharBI; delete fNumberMatcher; @@ -3315,7 +3494,7 @@ static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t d paramLength = (int32_t)(sizeof(valString)-2); } params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); - val = strtol(valString, NULL, 10); + val = strtol(valString, NULL, 10); // Delete this parameter from the params string. m.reset(); @@ -3643,6 +3822,7 @@ void RBBITest::TestLineBreaks(void) int expectedcount = 0; monkey.setText(ustr); + int i; for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { if (expectedcount >= EXPECTEDSIZE) { @@ -3707,6 +3887,7 @@ void RBBITest::TestSentBreaks(void) int expectedcount = 0; monkey.setText(ustr); + int i; for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { if (expectedcount >= EXPECTEDSIZE) { @@ -3822,7 +4003,7 @@ void RBBITest::TestMonkey() { loopCount = loopCount / 10; // Sentence runs slower than the other break types } if (U_SUCCESS(status)) { - RunMonkey(bi, m, "sentence", seed, loopCount, useUText); + RunMonkey(bi, m, "sent", seed, loopCount, useUText); } else { errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); @@ -3851,7 +4032,6 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name UnicodeString testText; int32_t numCharClasses; UVector *chClasses; - int expected[TESTSTRINGLEN*2 + 1]; int expectedCount = 0; char expectedBreaks[TESTSTRINGLEN*2 + 1]; char forwardBreaks[TESTSTRINGLEN*2 + 1]; @@ -3862,6 +4042,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name int i; int loopCount = 0; + m_seed = seed; numCharClasses = mk.charClasses()->size(); @@ -3884,6 +4065,9 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name } } + // For minimizing width of class name output. + int classNameSize = mk.maxClassNameSize(); + while (loopCount < numIterations || numIterations == -1) { if (numIterations == -1 && loopCount % 10 == 0) { // If test is running in an infinite loop, display a periodic tic so @@ -3914,8 +4098,9 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name testText.append(c); } - // Calculate the expected results for this test string. + // Calculate the expected results for this test string and reset applied rules. mk.setText(testText); + memset(expectedBreaks, 0, sizeof(expectedBreaks)); expectedBreaks[0] = 1; int32_t breakPos = 0; @@ -3930,9 +4115,6 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name } expectedBreaks[breakPos] = 1; U_ASSERT(expectedCount\\\uabcd\uabcd\\\U0001abcd..." - UnicodeString errorText = ""; - /***if (strcmp(errorType, "next()") == 0) { - startContext = 0; - endContext = testText.length(); + // Formatting of each line includes: + // character code + // reference break: '|' -> a break, '.' -> no break + // actual break: '|' -> a break, '.' -> no break + // (name of character clase) + // Unicode name of character + // '-->' indicates location of the difference. - printStringBreaks(testText, expected, expectedCount); - }***/ + MONKEY_ERROR( + (expectedBreaks[i] ? "Break expected but not found" : + "Break found but not expected"), + name, i, seed); - for (ci=startContext; ci"); - } else if (expectedBreaks[ci] != 0) { - // This a non-error expected break position. - errorText.append("\\"); + currentLineFlag = "-->"; // Error position } - if (c < 0x10000) { - errorText.append("\\u"); - for (bn=12; bn>=0; bn-=4) { - errorText.append(hexChars.charAt((c>>bn)&0xf)); - } + + // BMP or SMP character in hex + char hexCodePoint[12]; + std::string format = " \\u%04x"; + if (c >= 0x10000) { + format = "\\U%08x"; + } + sprintf(hexCodePoint, format.c_str(), c); + + // Get the class name and character name for the character. + char cName[200]; + UErrorCode status = U_ZERO_ERROR; + u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status); + + char buffer[200]; + snprintf(buffer, 200, + "%4s %3i : %1s %1s %10s %-*s %-40s %-40s", + currentLineFlag.c_str(), + ci, + expectedBreaks[ci] == 0 ? "." : "|", // Reference break + currentBreakData[ci] == 0 ? "." : "|", // Actual break + hexCodePoint, + classNameSize, + mk.classNameFromCodepoint(c).c_str(), + mk.getAppliedRule(ci).c_str(), cName); + + // Output the error + if (ci == i) { + errln(buffer); } else { - errorText.append("\\U"); - for (bn=28; bn>=0; bn-=4) { - errorText.append(hexChars.charAt((c>>bn)&0xf)); - } + infoln(buffer); } - ci = testText.moveIndex32(ci, 1); + + if (ci >= endContext) { break; } } - errorText.append("\\"); - errorText.append("\n"); - - // Output the error - char charErrorTxt[500]; - UErrorCode status = U_ZERO_ERROR; - errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); - charErrorTxt[sizeof(charErrorTxt)-1] = 0; - const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status); - - errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s", - name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), - errorType, seed, i, charErrorTxt); break; } } @@ -4397,11 +4594,11 @@ void RBBITest::TestBug12519() { assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status)); assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr); - LocalPointercloneEn((RuleBasedBreakIterator *)biEn->clone()); + LocalPointercloneEn(biEn->clone()); assertTrue(WHERE, *biEn == *cloneEn); assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status)); - LocalPointercloneFr((RuleBasedBreakIterator *)biFr->clone()); + LocalPointercloneFr(biFr->clone()); assertTrue(WHERE, *biFr == *cloneFr); assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));