#include "uvector.h"
#include "uvectr32.h"
+// Needed for Apple perf tests <rdar://problem/51193810>
+#include <unistd.h>
+#include <mach/mach_time.h>
+
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
#include "unicode/filteredbrk.h"
#endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
-#define TEST_ASSERT(x) {if (!(x)) { \
- errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
-
-#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
- errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
+#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { /* Logs a failure through errln() when x evaluates to false. */ \
+ if (!(x)) { \
+ errln("Failure in file %s, line %d", __FILE__, __LINE__); /* __FILE__ and __LINE__ expand at the use site */ \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { /* Logs through errcheckln() when errcode is a failure code. */ \
+ if (U_FAILURE(errcode)) { \
+ errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); /* message carries the symbolic status name */ \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+// Reports a monkey-test failure through IntlTest::gTest->errln(). The message
+// contains the source location, msg, the failing index, and the
+// "type=... seed=... loop=1" parameters built from fRuleFileName and seed.
+// The body is wrapped in UPRV_BLOCK_MACRO_BEGIN/END, like TEST_ASSERT, so the
+// expansion is a single statement that is safe inside an unbraced if/else.
+// Invoke it with a trailing semicolon.
+#define MONKEY_ERROR(msg, fRuleFileName, index, seed) UPRV_BLOCK_MACRO_BEGIN { \
+    IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
+                    __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
+} UPRV_BLOCK_MACRO_END
//---------------------------------------------
// runIndexedTest
int32_t tagValue = 0; // The numeric value of a <nnn> tag.
UnicodeString rules; // Holds rules from a <rules> ... </rules> block
- int32_t rulesFirstLine; // Line number of the start of current <rules> block
+ int32_t rulesFirstLine = 0; // Line number of the start of current <rules> block
+
+    // <rdar://problem/51193810> Apple perf instrumentation: accumulates the time
+    // spent in createLineInstance and in executing the line-break tests.
+    mach_timebase_info_data_t info;
+    // The counters are uint64_t, so they take integer zero initializers rather
+    // than 0.0; start is initialized as well so it never holds an indeterminate value.
+    uint64_t start = 0, durationOpen = 0, durationUse = 0;
+    mach_timebase_info(&info);  // numer/denom scale mach_absolute_time() deltas below
+    UBool isLine = FALSE;       // TRUE while the current test uses a line break iterator
for (charIdx = 0; charIdx < len; ) {
status = U_ZERO_ERROR;
tp.bi = BreakIterator::createWordInstance(locale, status);
skipTest = false;
charIdx += 5;
+ isLine = FALSE;
break;
}
if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
tp.bi = BreakIterator::createCharacterInstance(locale, status);
skipTest = false;
charIdx += 5;
+ isLine = FALSE;
break;
}
if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
delete tp.bi;
+ start = mach_absolute_time(); // <rdar://problem/51193810>
tp.bi = BreakIterator::createLineInstance(locale, status);
+ durationOpen += (((mach_absolute_time() - start) * info.numer)/info.denom);
skipTest = false;
charIdx += 5;
+ isLine = TRUE;
break;
}
if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createTitleInstance(locale, status);
charIdx += 6;
+ isLine = FALSE;
break;
}
parseState = PARSE_RULES;
rules.remove();
rulesFirstLine = lineNum;
+ isLine = FALSE;
break;
}
// RUN THE TEST!
status = U_ZERO_ERROR;
tp.setUTF16(status);
+ start = mach_absolute_time(); // <rdar://problem/51193810>
executeTest(&tp, status);
+ if (isLine) {
+ durationUse += (((mach_absolute_time() - start) * info.numer)/info.denom);
+ }
TEST_ASSERT_SUCCESS(status);
// Run again, this time with UTF-8 text wrapped in a UText.
errln("rbbitst.txt:%d <data> block not closed.", lineNum);
}
+ //
+ infoln("TestExtended total time in createLineInstance (nsec):\t%llu\n", durationOpen);
+ infoln("TestExtended total time in linebreak test execute (nsec):\t%llu\n", durationUse);
+
end_test:
delete [] testFile;
delete []retPtr;
retPtr = 0;
ulen = 0;
- };
+ }
return retPtr;
}
//
int spin = 0;
while (tokenMatcher.find()) {
- if(tokenMatcher.hitEnd()) {
+ if(tokenMatcher.hitEnd()) {
/* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
This occurred when the text file was corrupt (wasn't marked as UTF-8)
and caused an infinite loop here on EBCDIC systems!
*/
fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
- // return;
- }
+ // return;
+ }
if (tokenMatcher.start(1, status) >= 0) {
// Scanned a divide sign, indicating a break position in the test data.
if (testString.length()>0) {
// Return -1 after reaching end of string.
virtual int32_t next(int32_t i) = 0;
+ // Name of each character class, parallel with charClasses. Used for debugging output
+ // of characters.
+ virtual std::vector<std::string>& characterClassNames();
+
+ void setAppliedRule(int32_t position, const char* value);
+
+ std::string getAppliedRule(int32_t position);
+
virtual ~RBBIMonkeyKind();
- UErrorCode deferredStatus;
+ UErrorCode deferredStatus;
+ std::string classNameFromCodepoint(const UChar32 c);
+ unsigned int maxClassNameSize();
-protected:
- RBBIMonkeyKind();
+ protected:
+ RBBIMonkeyKind();
+ std::vector<std::string> classNames;
+ std::vector<std::string> appliedRules;
+
+ // Clear `appliedRules` and fill it with empty strings in the size of test text.
+ void prepareAppliedRules(int32_t size );
+
+ private:
-private:
};
RBBIMonkeyKind::RBBIMonkeyKind() {
RBBIMonkeyKind::~RBBIMonkeyKind() {
}
+// Accessor for the list of character class names that is kept parallel with
+// charClasses(). Returns a mutable reference to the member vector, not a copy.
+std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
+    std::vector<std::string> &names = classNames;
+    return names;
+}
+
+// Resets the per-position rule log. Afterwards appliedRules holds exactly
+// size + 1 empty strings, so every index from 0 through size is addressable.
+void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
+    // Discards any earlier contents and refills with empty strings in one step.
+    appliedRules.assign(size + 1, std::string());
+}
+
+// Stores value as the applied-rule text for the given text position.
+// The index is not range-checked: position must refer to a slot created by
+// prepareAppliedRules().
+void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
+    std::string &slot = appliedRules[position];
+    slot.assign(value);
+}
+
+// Returns a copy of the rule text stored for the given text position.
+// The index is not range-checked.
+std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
+    const std::string &stored = appliedRules[position];
+    return stored;
+}
+
+// Returns the display name of the first set in charClasses() that contains c.
+// classNames is maintained by hand in parallel with charClasses(), so the index
+// is checked against classNames before it is used. A code point that matches
+// no set, or a matching set that has no name, trips the assertion and yields
+// "bad class name".
+std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
+    UVector *sets = charClasses();
+    const int32_t setCount = sets->size();
+    for (int32_t classIdx = 0; classIdx < setCount; classIdx++) {
+        UnicodeSet *classSet = static_cast<UnicodeSet *>(sets->elementAt(classIdx));
+        if (classSet->contains(c)) {
+            if (static_cast<size_t>(classIdx) < classNames.size()) {
+                return classNames[classIdx];
+            }
+            break;  // the matching set has no registered name; reported below
+        }
+    }
+    U_ASSERT(FALSE);  // This should not happen.
+    return "bad class name";
+}
+
+// Returns the length of the longest entry in classNames (0 when it is empty).
+// Iterates classNames itself instead of indexing it by charClasses()->size(),
+// so a mismatch between the two lists cannot read out of range.
+unsigned int RBBIMonkeyKind::maxClassNameSize() {
+    std::string::size_type longest = 0;
+    for (const std::string &name : classNames) {
+        if (name.size() > longest) {
+            longest = name.size();
+        }
+    }
+    return static_cast<unsigned int>(longest);
+}
//----------------------------------------------------------------------------------------
//
UnicodeSet *fLVTSet;
UnicodeSet *fHangulSet;
UnicodeSet *fExtendedPictSet;
+ UnicodeSet *fViramaSet;
+ UnicodeSet *fLinkingConsonantSet;
+ UnicodeSet *fExtCccZwjSet;
UnicodeSet *fAnySet;
const UnicodeString *fText;
fHangulSet->addAll(*fLVTSet);
fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
+ fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
+ "\\p{Indic_Syllabic_Category=Virama}]", status);
+ fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
+ "\\p{Indic_Syllabic_Category=Consonant}]", status);
+ fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
fAnySet = new UnicodeSet(0, 0x10ffff);
+ // Create sets of characters, and add the names of the above character sets.
+ // In each new ICU release, add new names corresponding to the sets above.
fSets = new UVector(status);
- fSets->addElement(fCRLFSet, status);
- fSets->addElement(fControlSet, status);
- fSets->addElement(fExtendSet, status);
- fSets->addElement(fRegionalIndicatorSet, status);
+
+ // Important: Keep class names the same as the class contents.
+ fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
+ fSets->addElement(fControlSet, status); classNames.push_back("Control");
+ fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
+ fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
if (!fPrependSet->isEmpty()) {
- fSets->addElement(fPrependSet, status);
+ fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
}
- fSets->addElement(fSpacingSet, status);
- fSets->addElement(fHangulSet, status);
- fSets->addElement(fAnySet, status);
- fSets->addElement(fZWJSet, status);
- fSets->addElement(fExtendedPictSet, status);
+ fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
+ fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
+ fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
+ fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
+ fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
+ fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
+ fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCccZwj"); // name spelled to match fExtCccZwjSet
+ fSets->addElement(fAnySet, status); classNames.push_back("Any");
+
if (U_FAILURE(status)) {
deferredStatus = status;
}
void RBBICharMonkey::setText(const UnicodeString &s) {
fText = &s;
+ prepareAppliedRules(s.length()); // reset the applied-rule log to s.length() + 1 empty entries for the new text
}
if (prevPos >= fText->length()) {
return -1;
}
+
p0 = p1 = p2 = p3 = prevPos;
c3 = fText->char32At(prevPos);
c0 = c1 = c2 = cBase = 0;
p1 = p2; c1 = c2;
p2 = p3; c2 = c3;
- // Advancd p3 by one codepoint
+ // Advance p3 by one codepoint
p3 = fText->moveIndex32(p3, 1);
c3 = fText->char32At(p3);
// Still warming up the loop. (won't work with zero length strings, but we don't care)
continue;
}
+
if (p2 == fText->length()) {
- // Reached end of string. Always a break position.
+ setAppliedRule(p2, "End of String");
break;
}
- // Rule GB3 CR x LF
// No Extend or Format characters may appear between the CR and LF,
// which requires the additional check for p2 immediately following p1.
//
if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
- continue;
+ setAppliedRule(p2, "GB3 CR x LF");
+ continue;
}
- // Rule (GB4). ( Control | CR | LF ) <break>
if (fControlSet->contains(c1) ||
c1 == 0x0D ||
c1 == 0x0A) {
- break;
+ setAppliedRule(p2, "GB4 ( Control | CR | LF ) <break>");
+ break;
}
- // Rule (GB5) <break> ( Control | CR | LF )
- //
if (fControlSet->contains(c2) ||
c2 == 0x0D ||
c2 == 0x0A) {
+ setAppliedRule(p2, "GB5 <break> ( Control | CR | LF )");
break;
}
-
- // Rule (GB6) L x ( L | V | LV | LVT )
if (fLSet->contains(c1) &&
(fLSet->contains(c2) ||
fVSet->contains(c2) ||
fLVSet->contains(c2) ||
fLVTSet->contains(c2))) {
+ setAppliedRule(p2, "GB6 L x ( L | V | LV | LVT )");
continue;
}
- // Rule (GB7) ( LV | V ) x ( V | T )
if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
(fVSet->contains(c2) || fTSet->contains(c2))) {
+ setAppliedRule(p2, "GB7 ( LV | V ) x ( V | T )");
continue;
}
- // Rule (GB8) ( LVT | T) x T
if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
fTSet->contains(c2)) {
+ setAppliedRule(p2, "GB8 ( LVT | T) x T");
continue;
}
- // Rule (GB9) x (Extend | ZWJ)
if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
if (!fExtendSet->contains(c1)) {
cBase = c1;
}
+ setAppliedRule(p2, "GB9 x (Extend | ZWJ)");
continue;
}
- // Rule (GB9a) x SpacingMark
if (fSpacingSet->contains(c2)) {
+ setAppliedRule(p2, "GB9a x SpacingMark");
continue;
}
- // Rule (GB9b) Prepend x
if (fPrependSet->contains(c1)) {
+ setAppliedRule(p2, "GB9b Prepend x");
continue;
}
- // Rule (GB11) Extended_Pictographic Extend * ZWJ x Extended_Pictographic
+ // Note: Viramas are also included in the ExtCccZwj class.
+ if (fLinkingConsonantSet->contains(c2)) {
+ int pi = p1;
+ bool sawVirama = false;
+ while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
+ if (fViramaSet->contains(fText->char32At(pi))) {
+ sawVirama = true;
+ }
+ pi = fText->moveIndex32(pi, -1);
+ }
+ if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
+ setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
+ continue;
+ }
+ }
+
if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
- continue;
+ setAppliedRule(p2, "GB11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
+ continue;
}
- // Rule (GB12-13) Regional_Indicator x Regional_Indicator
// Note: The first if condition is a little tricky. We only need to force
// a break if there are three or more contiguous RIs. If there are
// only two, a break following will occur via other rules, and will include
// any trailing extend characters, which is needed behavior.
if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
&& fRegionalIndicatorSet->contains(c2)) {
- break;
+ setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
+ break;
}
if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
- continue;
+ setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
+ continue;
}
- // Rule (GB999) Any <break> Any
+ setAppliedRule(p2, "GB999 Any <break> Any");
break;
}
return fSets;
}
-
RBBICharMonkey::~RBBICharMonkey() {
delete fSets;
delete fCRLFSet;
delete fAnySet;
delete fZWJSet;
delete fExtendedPictSet;
+ delete fViramaSet;
+ delete fLinkingConsonantSet;
+ delete fExtCccZwjSet;
}
//------------------------------------------------------------------------------------------
fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]", status);
fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
- fNumericSet = new UnicodeSet(u"[[\\p{Word_Break = Numeric}][\\uff10-\\uff19]]", status);
+ fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
- fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}]", status);
+ // There are some sc=Hani characters with WB=Extend.
+ // The break rules need to pick one or the other because
+ // Extend overlapping with something else is messy.
+ // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
+ // in $Han (for $dictionary) and out of $Extend.
+ fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
// Inhibit dictionary characters from being tested at all.
fOtherSet->removeAll(*fDictionarySet);
- fSets->addElement(fCRSet, status);
- fSets->addElement(fLFSet, status);
- fSets->addElement(fNewlineSet, status);
- fSets->addElement(fRegionalIndicatorSet, status);
- fSets->addElement(fHebrew_LetterSet, status);
- fSets->addElement(fALetterSet, status);
- fSets->addElement(fSingle_QuoteSet, status);
- fSets->addElement(fDouble_QuoteSet, status);
- //fSets->addElement(fKatakanaSet, status); // Omit Katakana from fSets, which omits Katakana characters
- // from the test data. They are all in the dictionary set,
- // which this (old, to be retired) monkey test cannot handle.
- fSets->addElement(fMidLetterSet, status);
- fSets->addElement(fMidNumLetSet, status);
- fSets->addElement(fMidNumSet, status);
- fSets->addElement(fNumericSet, status);
- fSets->addElement(fFormatSet, status);
- fSets->addElement(fExtendSet, status);
- fSets->addElement(fOtherSet, status);
- fSets->addElement(fExtendNumLetSet, status);
- fSets->addElement(fWSegSpaceSet, status);
-
- fSets->addElement(fZWJSet, status);
- fSets->addElement(fExtendedPictSet, status);
+ // Add classes and their names
+ fSets->addElement(fCRSet, status); classNames.push_back("CR");
+ fSets->addElement(fLFSet, status); classNames.push_back("LF");
+ fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
+ fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
+ fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
+ fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
+ fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
+ fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
+ // Omit Katakana from fSets, which omits Katakana characters
+ // from the test data. They are all in the dictionary set,
+ // which this (old, to be retired) monkey test cannot handle.
+ //fSets->addElement(fKatakanaSet, status);
+
+ fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
+ fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
+ fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
+ fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
+ fSets->addElement(fFormatSet, status); classNames.push_back("Format");
+ fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
+ fSets->addElement(fOtherSet, status); classNames.push_back("Other");
+ fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
+ fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
+
+ fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
+ fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
if (U_FAILURE(status)) {
deferredStatus = status;
void RBBIWordMonkey::setText(const UnicodeString &s) {
fText = &s;
+ prepareAppliedRules(s.length()); // reset the applied-rule log to s.length() + 1 empty entries for the new text
}
p1 = p2; c1 = c2;
p2 = p3; c2 = c3;
- // Advancd p3 by X(Extend | Format)* Rule 4
+ // Advance p3 by X(Extend | Format)* Rule 4
// But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
do {
p3 = fText->moveIndex32(p3, 1);
c3 = fText->char32At(p3);
if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
break;
- };
+ }
}
while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
// Still warming up the loop. (won't work with zero length strings, but we don't care)
continue;
}
+
if (p2 == fText->length()) {
// Reached end of string. Always a break position.
break;
}
- // Rule (3) CR x LF
// No Extend or Format characters may appear between the CR and LF,
// which requires the additional check for p2 immediately following p1.
//
if (c1==0x0D && c2==0x0A) {
- continue;
+ setAppliedRule(p2, "WB3 CR x LF");
+ continue;
}
- // Rule (3a) Break before and after newlines (including CR and LF)
- //
if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
+ setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
break;
- };
+ }
if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
+ setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
break;
- };
+ }
- // Rule (3c) ZWJ x Extended_Pictographic
// Not ignoring extend chars, so peek into input text to
// get the potential ZWJ, the character immediately preceding c2.
// Sloppy UChar32 indexing: p2-1 may reference trail half
// but char32At will get the full code point.
- if (fZWJSet->contains(fText->char32At(p2-1)) && fExtendedPictSet->contains(c2)) {
+ if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
+ setAppliedRule(p2, "WB3c ZWJ x Extended_Pictographic");
continue;
}
- // Rule (3d) Keep horizontal whitespace together.
if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
+ setAppliedRule(p2, "WB3d Keep horizontal whitespace together.");
continue;
}
- // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
(fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
+ setAppliedRule(p2, "WB5 (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)"); // letter x letter is rule 5; rule 4 is the Extend/Format skipping done when advancing p3
continue;
}
- // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
- //
if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
(fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
(fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
+ setAppliedRule(p2, // label mirrors the three-part condition above (c1, c2, c3)
+ "WB6 (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)");
continue;
}
- // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
(fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
(fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
+ setAppliedRule(p2,
+ "WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)");
continue;
}
- // Rule (7a) Hebrew_Letter x Single_Quote
if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
+ setAppliedRule(p2, "WB7a Hebrew_Letter x Single_Quote");
continue;
}
- // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
- if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
+ if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
+ setAppliedRule(p2, "WB7b Hebrew_Letter x Double_Quote Hebrew_Letter");
continue;
}
- // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
+ setAppliedRule(p2, "WB7c Hebrew_Letter Double_Quote x Hebrew_Letter");
continue;
}
- // Rule (8) Numeric x Numeric
if (fNumericSet->contains(c1) &&
- fNumericSet->contains(c2)) {
+ fNumericSet->contains(c2)) {
+ setAppliedRule(p2, "WB8 Numeric x Numeric");
continue;
}
- // Rule (9) (ALetter | Hebrew_Letter) x Numeric
if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
- fNumericSet->contains(c2)) {
+ fNumericSet->contains(c2)) {
+ setAppliedRule(p2, "WB9 (ALetter | Hebrew_Letter) x Numeric");
continue;
}
- // Rule (10) Numeric x (ALetter | Hebrew_Letter)
if (fNumericSet->contains(c1) &&
(fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
+ setAppliedRule(p2, "WB10 Numeric x (ALetter | Hebrew_Letter)");
continue;
}
- // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
- if (fNumericSet->contains(c0) &&
+ if (fNumericSet->contains(c0) &&
(fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
fNumericSet->contains(c2)) {
+ setAppliedRule(p2, "WB11 Numeric (MidNum | MidNumLet | Single_Quote) x Numeric");
continue;
}
- // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
if (fNumericSet->contains(c1) &&
(fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
fNumericSet->contains(c3)) {
+ setAppliedRule(p2, "WB12 Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
continue;
}
- // Rule (13) Katakana x Katakana
// Note: matches UAX 29 rules, but doesn't come into play for ICU because
// all Katakana are handled by the dictionary breaker.
if (fKatakanaSet->contains(c1) &&
fKatakanaSet->contains(c2)) {
+ setAppliedRule(p2, "WB13 Katakana x Katakana");
continue;
}
- // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
fExtendNumLetSet->contains(c2)) {
- continue;
+ setAppliedRule(p2,
+ "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
+ continue;
}
- // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
if (fExtendNumLetSet->contains(c1) &&
(fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
+ setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
continue;
}
- // Rule 15 - 17 Group pairs of Regional Indicators.
if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
+ setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
break;
}
if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
+ setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
continue;
}
- // Rule 999. Break found here.
+ setAppliedRule(p2, "WB999");
break;
}
return fSets;
}
-
RBBIWordMonkey::~RBBIWordMonkey() {
delete fSets;
delete fCRSet;
UnicodeSet *fExtendSet;
const UnicodeString *fText;
-
};
RBBISentMonkey::RBBISentMonkey()
fOtherSet->removeAll(*fCloseSet);
fOtherSet->removeAll(*fExtendSet);
- fSets->addElement(fSepSet, status);
- fSets->addElement(fFormatSet, status);
- fSets->addElement(fSpSet, status);
- fSets->addElement(fLowerSet, status);
- fSets->addElement(fUpperSet, status);
- fSets->addElement(fOLetterSet, status);
- fSets->addElement(fNumericSet, status);
- fSets->addElement(fATermSet, status);
- fSets->addElement(fSContinueSet, status);
- fSets->addElement(fSTermSet, status);
- fSets->addElement(fCloseSet, status);
- fSets->addElement(fOtherSet, status);
- fSets->addElement(fExtendSet, status);
+ fSets->addElement(fSepSet, status); classNames.push_back("Sep");
+ fSets->addElement(fFormatSet, status); classNames.push_back("Format");
+ fSets->addElement(fSpSet, status); classNames.push_back("Sp");
+ fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
+ fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
+ fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
+ fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
+ fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
+ fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
+ fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
+ fSets->addElement(fCloseSet, status); classNames.push_back("Close");
+ fSets->addElement(fOtherSet, status); classNames.push_back("Other");
+ fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
if (U_FAILURE(status)) {
deferredStatus = status;
void RBBISentMonkey::setText(const UnicodeString &s) {
fText = &s;
+ prepareAppliedRules(s.length()); // reset the applied-rule log to s.length() + 1 empty entries for the new text
}
UVector *RBBISentMonkey::charClasses() {
return fSets;
}
-
// moveBack() Find the "significant" code point preceding the index i.
// Skips over ($Extend | $Format)* .
//
p1 = p2; c1 = c2;
p2 = p3; c2 = c3;
- // Advancd p3 by X(Extend | Format)* Rule 4
+ // Advance p3 by X(Extend | Format)* Rule 4
p3 = moveForward(p3);
c3 = cAt(p3);
- // Rule (3) CR x LF
if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
+ setAppliedRule(p2, "SB3 CR x LF");
continue;
}
- // Rule (4). Sep <break>
if (fSepSet->contains(c1)) {
p2 = p1+1; // Separators don't combine with Extend or Format.
+
+ setAppliedRule(p2, "SB4 Sep <break>");
break;
}
if (p2 >= fText->length()) {
// Reached end of string. Always a break position.
+ setAppliedRule(p2, "End of String"); // this branch is the end-of-text break, not the SB4 separator rule
break;
}
if (p2 == prevPos) {
// Still warming up the loop. (won't work with zero length strings, but we don't care)
+ // No rule is recorded while warming up: p2 == prevPos here, so writing a label would replace whatever is already stored for that position.
continue;
}
- // Rule (6). ATerm x Numeric
if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
+ setAppliedRule(p2, "SB6 ATerm x Numeric");
continue;
}
- // Rule (7). (Upper | Lower) ATerm x Uppper
- if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
+ if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
fATermSet->contains(c1) && fUpperSet->contains(c2)) {
+ setAppliedRule(p2, "SB7 (Upper | Lower) ATerm x Uppper");
continue;
}
- // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
// Note: STerm | ATerm are added to the negated part of the expression by a
// note to the Unicode 5.0 documents.
int p8 = p1;
if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
fLowerSet->contains(c) || fSepSet->contains(c) ||
fATermSet->contains(c) || fSTermSet->contains(c)) {
+
+ setAppliedRule(p2,
+ "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
break;
}
p8 = moveForward(p8);
}
if (fLowerSet->contains(cAt(p8))) {
+
+ setAppliedRule(p2,
+ "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
continue;
}
}
- // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
p8 = p1;
while (fSpSet->contains(cAt(p8))) {
}
c = cAt(p8);
if (fSTermSet->contains(c) || fATermSet->contains(c)) {
+ setAppliedRule(p2, "SB8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
continue;
}
}
- // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
int p9 = p1;
while (fCloseSet->contains(cAt(p9))) {
p9 = moveBack(p9);
c = cAt(p9);
if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
+
+ setAppliedRule(p2, "SB9 (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)");
continue;
}
}
- // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
int p10 = p1;
while (fSpSet->contains(cAt(p10))) {
p10 = moveBack(p10);
}
if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
+ setAppliedRule(p2, "SB10 (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)");
continue;
}
}
- // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
int p11 = p1;
if (fSepSet->contains(cAt(p11))) {
p11 = moveBack(p11);
p11 = moveBack(p11);
}
if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
+ setAppliedRule(p2, "SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>");
break;
}
- // Rule (12) Any x Any
+ setAppliedRule(p2, "SB12 Any x Any");
continue;
}
+
breakPos = p2;
return breakPos;
}
UnicodeSet *fEB;
UnicodeSet *fEM;
UnicodeSet *fZWJ;
+ UnicodeSet *fOP30;
+ UnicodeSet *fCP30;
BreakIterator *fCharBI;
const UnicodeString *fText;
fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
- fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
+ fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=CL}] [\\u201D]]"), status); // en adjustments for rdar://problem/51193810
fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
- fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
- fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
+ fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=OP}] [\\u201C\\u2018]]"), status); // en adjustments
+ fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=QU}]-[\\u201C\\u2018\\u201D]]"), status); // en adjustments
fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
- fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
+ fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
+ fOP30 = new UnicodeSet(u"[[\\p{Line_break=OP} [\\u201C\\u2018]]-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status); // en adjustments
+ fCP30 = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
if (U_FAILURE(status)) {
deferredStatus = status;
fHH->add(u'\u2010'); // Hyphen, '‐'
- fSets->addElement(fBK, status);
- fSets->addElement(fCR, status);
- fSets->addElement(fLF, status);
- fSets->addElement(fCM, status);
- fSets->addElement(fNL, status);
- fSets->addElement(fWJ, status);
- fSets->addElement(fZW, status);
- fSets->addElement(fGL, status);
- fSets->addElement(fCB, status);
- fSets->addElement(fSP, status);
- fSets->addElement(fB2, status);
- fSets->addElement(fBA, status);
- fSets->addElement(fBB, status);
- fSets->addElement(fHY, status);
- fSets->addElement(fH2, status);
- fSets->addElement(fH3, status);
- fSets->addElement(fCL, status);
- fSets->addElement(fCP, status);
- fSets->addElement(fEX, status);
- fSets->addElement(fIN, status);
- fSets->addElement(fJL, status);
- fSets->addElement(fJT, status);
- fSets->addElement(fJV, status);
- fSets->addElement(fNS, status);
- fSets->addElement(fOP, status);
- fSets->addElement(fQU, status);
- fSets->addElement(fIS, status);
- fSets->addElement(fNU, status);
- fSets->addElement(fPO, status);
- fSets->addElement(fPR, status);
- fSets->addElement(fSY, status);
- fSets->addElement(fAI, status);
- fSets->addElement(fAL, status);
- fSets->addElement(fHL, status);
- fSets->addElement(fID, status);
- fSets->addElement(fWJ, status);
- fSets->addElement(fRI, status);
- fSets->addElement(fSG, status);
- fSets->addElement(fEB, status);
- fSets->addElement(fEM, status);
- fSets->addElement(fZWJ, status);
-
+ // Sets and names.
+ fSets->addElement(fBK, status); classNames.push_back("fBK");
+ fSets->addElement(fCR, status); classNames.push_back("fCR");
+ fSets->addElement(fLF, status); classNames.push_back("fLF");
+ fSets->addElement(fCM, status); classNames.push_back("fCM");
+ fSets->addElement(fNL, status); classNames.push_back("fNL");
+ fSets->addElement(fWJ, status); classNames.push_back("fWJ");
+ fSets->addElement(fZW, status); classNames.push_back("fZW");
+ fSets->addElement(fGL, status); classNames.push_back("fGL");
+ fSets->addElement(fCB, status); classNames.push_back("fCB");
+ fSets->addElement(fSP, status); classNames.push_back("fSP");
+ fSets->addElement(fB2, status); classNames.push_back("fB2");
+ fSets->addElement(fBA, status); classNames.push_back("fBA");
+ fSets->addElement(fBB, status); classNames.push_back("fBB");
+ fSets->addElement(fHY, status); classNames.push_back("fHY");
+ fSets->addElement(fH2, status); classNames.push_back("fH2");
+ fSets->addElement(fH3, status); classNames.push_back("fH3");
+ fSets->addElement(fCL, status); classNames.push_back("fCL");
+ fSets->addElement(fCP, status); classNames.push_back("fCP");
+ fSets->addElement(fEX, status); classNames.push_back("fEX");
+ fSets->addElement(fIN, status); classNames.push_back("fIN");
+ fSets->addElement(fJL, status); classNames.push_back("fJL");
+ fSets->addElement(fJT, status); classNames.push_back("fJT");
+ fSets->addElement(fJV, status); classNames.push_back("fJV");
+ fSets->addElement(fNS, status); classNames.push_back("fNS");
+ fSets->addElement(fOP, status); classNames.push_back("fOP");
+ fSets->addElement(fQU, status); classNames.push_back("fQU");
+ fSets->addElement(fIS, status); classNames.push_back("fIS");
+ fSets->addElement(fNU, status); classNames.push_back("fNU");
+ fSets->addElement(fPO, status); classNames.push_back("fPO");
+ fSets->addElement(fPR, status); classNames.push_back("fPR");
+ fSets->addElement(fSY, status); classNames.push_back("fSY");
+ fSets->addElement(fAI, status); classNames.push_back("fAI");
+ fSets->addElement(fAL, status); classNames.push_back("fAL");
+ fSets->addElement(fHL, status); classNames.push_back("fHL");
+ fSets->addElement(fID, status); classNames.push_back("fID");
+ fSets->addElement(fWJ, status); classNames.push_back("fWJ");
+ fSets->addElement(fRI, status); classNames.push_back("fRI");
+ fSets->addElement(fSG, status); classNames.push_back("fSG");
+ fSets->addElement(fEB, status); classNames.push_back("fEB");
+ fSets->addElement(fEM, status); classNames.push_back("fEM");
+ fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
+ // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
+ fSets->addElement(fOP30, status); classNames.push_back("fOP30");
+ fSets->addElement(fCP30, status); classNames.push_back("fCP30");
const char *rules =
"((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
if (U_FAILURE(status)) {
deferredStatus = status;
}
+
}
// Point this line-break monkey at a new piece of test text.
//
// Only the address of `s` is kept in fText (no copy is made here), so the
// caller must keep `s` alive for as long as the monkey is used with it.
// The character break iterator and the number matcher are re-targeted at the
// same text.
void RBBILineMonkey::setText(const UnicodeString &s) {
fText = &s;
fCharBI->setText(s);
// NOTE(review): presumably (re)initializes the per-position applied-rule
// records for a text of this length - confirm against prepareAppliedRules().
+ prepareAppliedRules(s.length());
fNumberMatcher->reset(s);
}
int32_t nPos = *nextPos;
// LB 9 Keep combining sequences together.
- // advance over any CM class chars. Note that Line Break CM is different
- // from the normal Grapheme Extend property.
+ // advance over any CM class chars. Note that Line Break CM is different
+ // from the normal Grapheme Extend property.
if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
*posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
for (;;) {
nextCPPos = fText->moveIndex32(pos, 1);
nextPos = nextCPPos;
- // Rule LB2 - Break at end of text.
+
if (pos >= fText->length()) {
+ setAppliedRule(pos, "LB2 - Break at end of text.");
break;
}
- // Rule LB 9 - adjust for combining sequences.
+
// We do this one out-of-order because the adjustment does not change anything
// that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
// be applied.
- rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
+ rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
nextCPPos = nextPos = fText->moveIndex32(pos, 1);
c = fText->char32At(nextPos);
- rule9Adjust(pos, &thisChar, &nextPos, &c);
+ rule9Adjust(pos, &thisChar, &nextPos, &c);
// If the loop is still warming up - if we haven't shifted the initial
// -1 positions out of prevPos yet - loop back to advance the
// position in the input without any further looking for breaks.
if (prevPos == -1) {
+ setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
continue;
}
- // LB 4 Always break after hard line breaks,
+
if (fBK->contains(prevChar)) {
+ setAppliedRule(pos, "LB 4 Always break after hard line breaks");
break;
}
- // LB 5 Break after CR, LF, NL, but not inside CR LF
+
if (prevChar == 0x0d && thisChar == 0x0a) {
+ setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
continue;
}
if (prevChar == 0x0d ||
prevChar == 0x0a ||
prevChar == 0x85) {
+ setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
break;
}
- // LB 6 Don't break before hard line breaks
+
if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
fBK->contains(thisChar)) {
- continue;
+ setAppliedRule(pos, "LB 6 Don't break before hard line breaks");
+ continue;
}
- // LB 7 Don't break before spaces or zero-width space.
if (fSP->contains(thisChar)) {
+ setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
continue;
}
+ // TODO: confirm the applied-rule text. This branch tests ZW but reports the same "LB 7" message as the SP branch above.
if (fZW->contains(thisChar)) {
+ setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
continue;
}
- // LB 8 Break after zero width space
+
// ZW SP* ÷
// Scan backwards from prevChar for SP* ZW
tPos = prevPos;
tPos = fText->moveIndex32(tPos, -1);
}
if (fZW->contains(fText->char32At(tPos))) {
+ setAppliedRule(pos, "LB 8 Break after zero width space");
break;
}
- // LB 25 Numbers
+
// Move this test up, before LB8a, because numbers can match a longer sequence that would
// also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
if (fNumberMatcher->lookingAt(prevPos, status)) {
if (U_FAILURE(status)) {
+ setAppliedRule(pos, "LB 25 Numbers");
break;
}
// Matched a number. But could have been just a single digit, which would
thisChar = fText->char32At(pos);
} while (fCM->contains(thisChar));
}
+ setAppliedRule(pos, "LB 25 Numbers");
continue;
}
}
- // LB 8a ZWJ x
+
// The monkey test's way of ignoring combining characters doesn't work
// for this rule. ZJ is also a CM. Need to get the actual character
// preceding "thisChar", not ignoring combining marks, possibly ZJ.
int32_t prevIdx = fText->moveIndex32(pos, -1);
UChar32 prevC = fText->char32At(prevIdx);
if (fZWJ->contains(prevC)) {
+ setAppliedRule(pos, "LB 8a ZWJ x");
continue;
}
}
- // LB 9, 10 Already done, at top of loop.
+
+ // appliedRule: "LB 9, 10"; // Already done, at top of loop.
//
- // LB 11 Do not break before or after WORD JOINER and related characters.
// x WJ
// WJ x
//
if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
+ setAppliedRule(pos, "LB 11 Do not break before or after WORD JOINER and related characters.");
continue;
}
- // LB 12
- // GL x
+
if (fGL->contains(prevChar)) {
+ setAppliedRule(pos, "LB 12 GL x");
continue;
}
- // LB 12a
- // [^SP BA HY] x GL
- if (!(fSP->contains(prevChar) ||
+
+ if (!(fSP->contains(prevChar) ||
fBA->contains(prevChar) ||
fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
- continue;
+ setAppliedRule(pos, "LB 12a [^SP BA HY] x GL");
+ continue;
}
- // LB 13 Don't break before closings.
- //
+
if (fCL->contains(thisChar) ||
fCP->contains(thisChar) ||
fEX->contains(thisChar) ||
fSY->contains(thisChar)) {
+ setAppliedRule(pos, "LB 13 Don't break before closings.");
continue;
}
- // LB 14 Don't break after OP SP*
+
// Scan backwards, checking for this sequence.
// The OP char could include combining marks, so we actually check for
// OP CM* SP*
tPos=fText->moveIndex32(tPos, -1);
}
if (fOP->contains(fText->char32At(tPos))) {
+ setAppliedRule(pos, "LB 14 Don't break after OP SP*");
continue;
}
- // LB 14a Break before an IS that begins a number and follows a space
if (nextPos < fText->length()) {
// note: UnicodeString::char32At(length) returns ffff, not distinguishable
// from a legit ffff character. So test length separately.
UChar32 nextChar = fText->char32At(nextPos);
if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
+ setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
break;
}
}
- // LB14b Do not break before numeric separators, even after spaces.
- if (fIS->contains(thisChar)) {
- continue;
+
+ if (fIS->contains(thisChar)) {
+ setAppliedRule(pos, "LB 14b Do not break before numeric separators, even after spaces.");
+ continue;
}
- // LB 15 QU SP* x OP
+
if (fOP->contains(thisChar)) {
// Scan backwards from prevChar to see if it is preceded by QU CM* SP*
int tPos = prevPos;
tPos = fText->moveIndex32(tPos, -1);
}
if (fQU->contains(fText->char32At(tPos))) {
+ setAppliedRule(pos, "LB 15 QU SP* x OP");
continue;
}
}
-
- // LB 16 (CL | CP) SP* x NS
// Scan backwards for SP* CM* (CL | CP)
if (fNS->contains(thisChar)) {
int tPos = prevPos;
tPos = fText->moveIndex32(tPos, -1);
}
if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
+ setAppliedRule(pos, "LB 16 (CL | CP) SP* x NS");
continue;
}
}
- // LB 17 B2 SP* x B2
if (fB2->contains(thisChar)) {
// Scan backwards, checking for the B2 CM* SP* sequence.
tPos = prevPos;
tPos=fText->moveIndex32(tPos, -1);
}
if (fB2->contains(fText->char32At(tPos))) {
+ setAppliedRule(pos, "LB 17 B2 SP* x B2");
continue;
}
}
- // LB 18 break after space
if (fSP->contains(prevChar)) {
+ setAppliedRule(pos, "LB 18 break after space");
break;
}
- // LB 19
// x QU
// QU x
if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
+ setAppliedRule(pos, "LB 19");
continue;
}
- // LB 20 Break around a CB
if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
+ setAppliedRule(pos, "LB 20 Break around a CB");
break;
}
- // LB 20.09 Don't break between Hyphens and letters if a break precedes the hyphen.
+ // Don't break between Hyphens and letters if a break precedes the hyphen.
// Formerly this was a Finnish tailoring.
// Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
- // ^($HY | $HH) $AL;
+ // ^($HY | $HH) $AL;
if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
prevPosX2 == -1) {
+ setAppliedRule(pos, "LB 20.09");
continue;
}
- // LB 21
if (fBA->contains(thisChar) ||
fHY->contains(thisChar) ||
fNS->contains(thisChar) ||
fBB->contains(prevChar) ) {
+ setAppliedRule(pos, "LB 21");
continue;
}
- // LB 21a
- // HL (HY | BA) x
if (fHL->contains(prevCharX2) &&
(fHY->contains(prevChar) || fBA->contains(prevChar))) {
+ setAppliedRule(pos, "LB 21a HL (HY | BA) x");
continue;
}
- // LB 21b
- // SY x HL
if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
+ setAppliedRule(pos, "LB 21b SY x HL");
continue;
}
- // LB 22
- if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
- (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
- (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
- ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
- (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
- (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
+ if (fIN->contains(thisChar)) {
+ setAppliedRule(pos, "LB 22");
continue;
}
- // LB 23 (AL | HL) x NU
+ // (AL | HL) x NU
// NU x (AL | HL)
if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
+ setAppliedRule(pos, "LB 23");
continue;
}
if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
+ setAppliedRule(pos, "LB 23");
continue;
}
- // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
+ // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
// PR x (ID | EB | EM)
// (ID | EB | EM) x PO
if (fPR->contains(prevChar) &&
(fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
+ setAppliedRule(pos, "LB 23a");
continue;
}
if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
fPO->contains(thisChar)) {
+ setAppliedRule(pos, "LB 23a");
continue;
}
- // LB 24 Do not break between prefix and letters or ideographs.
+ // Do not break between prefix and letters or ideographs.
// (PR | PO) x (AL | HL)
// (AL | HL) x (PR | PO)
if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
(fAL->contains(thisChar) || fHL->contains(thisChar))) {
+ setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
continue;
}
if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
(fPR->contains(thisChar) || fPO->contains(thisChar))) {
+ setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
continue;
}
- // LB 25 numbers match, moved up, before LB 8a,
+ // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
- // LB 26 Do not break a Korean syllable.
if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
fJV->contains(thisChar) ||
fH2->contains(thisChar) ||
fH3->contains(thisChar))) {
- continue;
+ setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
+ continue;
}
if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
(fJV->contains(thisChar) || fJT->contains(thisChar))) {
- continue;
+ setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
+ continue;
}
if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
fJT->contains(thisChar)) {
- continue;
+ setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
+ continue;
}
- // LB 27 Treat a Korean Syllable Block the same as ID.
if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
fIN->contains(thisChar)) {
- continue;
+ setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
+ continue;
}
if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
fPO->contains(thisChar)) {
- continue;
+ setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
+ continue;
}
if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
- continue;
+ setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
+ continue;
}
- // LB 28 Do not break between alphabetics ("at").
if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
+ setAppliedRule(pos, "LB 28 Do not break between alphabetics (\"at\").");
continue;
}
- // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
- if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
- continue;
+ if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
+ setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").");
+ continue;
}
- // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
// (AL | NU) x OP
// CP x (AL | NU)
- if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
+ if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
+ setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
continue;
}
- if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
+ if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
+ setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
continue;
}
- // LB30a RI RI ÷ RI
// RI x RI
if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
+ setAppliedRule(pos, "LB30a RI RI ÷ RI");
break;
}
if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
// Over-write the trailing one (thisChar) to prevent it from forming another pair with a
// following RI. This is a hack.
thisChar = -1;
+ setAppliedRule(pos, "LB30a RI RI ÷ RI");
continue;
}
- // LB30b Emoji Base x Emoji Modifier
if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
+ setAppliedRule(pos, "LB30b Emoji Base x Emoji Modifier");
continue;
}
- // LB 31 Break everywhere else
+ setAppliedRule(pos, "LB 31 Break everywhere else");
break;
-
}
return pos;
delete fEB;
delete fEM;
delete fZWJ;
+ delete fOP30;
+ delete fCP30;
delete fCharBI;
delete fNumberMatcher;
paramLength = (int32_t)(sizeof(valString)-2);
}
params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
- val = strtol(valString, NULL, 10);
+ val = strtol(valString, NULL, 10);
// Delete this parameter from the params string.
m.reset();
int expectedcount = 0;
monkey.setText(ustr);
+
int i;
for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
if (expectedcount >= EXPECTEDSIZE) {
int expectedcount = 0;
monkey.setText(ustr);
+
int i;
for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
if (expectedcount >= EXPECTEDSIZE) {
loopCount = loopCount / 10; // Sentence runs slower than the other break types
}
if (U_SUCCESS(status)) {
- RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
+ RunMonkey(bi, m, "sent", seed, loopCount, useUText);
}
else {
errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
UnicodeString testText;
int32_t numCharClasses;
UVector *chClasses;
- int expected[TESTSTRINGLEN*2 + 1];
int expectedCount = 0;
char expectedBreaks[TESTSTRINGLEN*2 + 1];
char forwardBreaks[TESTSTRINGLEN*2 + 1];
int i;
int loopCount = 0;
+
m_seed = seed;
numCharClasses = mk.charClasses()->size();
}
}
+ // For minimizing width of class name output.
+ int classNameSize = mk.maxClassNameSize();
+
while (loopCount < numIterations || numIterations == -1) {
if (numIterations == -1 && loopCount % 10 == 0) {
// If test is running in an infinite loop, display a periodic tic so
testText.append(c);
}
- // Calculate the expected results for this test string.
+ // Calculate the expected results for this test string and reset applied rules.
mk.setText(testText);
+
memset(expectedBreaks, 0, sizeof(expectedBreaks));
expectedBreaks[0] = 1;
int32_t breakPos = 0;
}
expectedBreaks[breakPos] = 1;
U_ASSERT(expectedCount<testText.length());
- expected[expectedCount ++] = breakPos;
- (void)expected; // Set but not used warning.
- // TODO (andy): check it out.
}
// Find the break positions using forward iteration
// Compare the expected and actual results.
for (i=0; i<=testText.length(); i++) {
const char *errorType = NULL;
+ const char* currentBreakData = NULL;
if (forwardBreaks[i] != expectedBreaks[i]) {
errorType = "next()";
+ currentBreakData = forwardBreaks;
} else if (reverseBreaks[i] != forwardBreaks[i]) {
errorType = "previous()";
- } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
+ currentBreakData = reverseBreaks;
+ } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
errorType = "isBoundary()";
+ currentBreakData = isBoundaryBreaks;
} else if (followingBreaks[i] != expectedBreaks[i]) {
errorType = "following()";
+ currentBreakData = followingBreaks;
} else if (precedingBreaks[i] != expectedBreaks[i]) {
errorType = "preceding()";
+ currentBreakData = precedingBreaks;
}
-
if (errorType != NULL) {
// Format a range of the test text that includes the failure as
// a data item that can be included in the rbbi test data file.
// Start of the range is the last point where expected and actual results
- // both agreed that there was a break position.
+ // both agreed that there was a break position.
+
int startContext = i;
int32_t count = 0;
for (;;) {
}
}
- // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
- UnicodeString errorText = "<data>";
- /***if (strcmp(errorType, "next()") == 0) {
- startContext = 0;
- endContext = testText.length();
+ // Formatting of each line includes:
+ // character code
+ // reference break: '|' -> a break, '.' -> no break
+ // actual break: '|' -> a break, '.' -> no break
+ // (name of character class)
+ // Unicode name of character
+ // '-->' indicates location of the difference.
- printStringBreaks(testText, expected, expectedCount);
- }***/
+ MONKEY_ERROR(
+ (expectedBreaks[i] ? "Break expected but not found" :
+ "Break found but not expected"),
+ name, i, seed);
- for (ci=startContext; ci<endContext;) {
- UnicodeString hexChars("0123456789abcdef");
+ for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
UChar32 c;
- int bn;
c = testText.char32At(ci);
+
+ std::string currentLineFlag = " ";
if (ci == i) {
- // This is the location of the error.
- errorText.append("<?>");
- } else if (expectedBreaks[ci] != 0) {
- // This a non-error expected break position.
- errorText.append("\\");
+ currentLineFlag = "-->"; // Error position
}
- if (c < 0x10000) {
- errorText.append("\\u");
- for (bn=12; bn>=0; bn-=4) {
- errorText.append(hexChars.charAt((c>>bn)&0xf));
- }
+
+ // BMP or SMP character in hex
+ char hexCodePoint[12];
+ std::string format = " \\u%04x";
+ if (c >= 0x10000) {
+ format = "\\U%08x";
+ }
+ sprintf(hexCodePoint, format.c_str(), c);
+
+ // Get the class name and character name for the character.
+ char cName[200];
+ UErrorCode status = U_ZERO_ERROR;
+ u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
+
+ char buffer[200];
+ snprintf(buffer, 200,
+ "%4s %3i : %1s %1s %10s %-*s %-40s %-40s",
+ currentLineFlag.c_str(),
+ ci,
+ expectedBreaks[ci] == 0 ? "." : "|", // Reference break
+ currentBreakData[ci] == 0 ? "." : "|", // Actual break
+ hexCodePoint,
+ classNameSize,
+ mk.classNameFromCodepoint(c).c_str(),
+ mk.getAppliedRule(ci).c_str(), cName);
+
+ // Output the error
+ if (ci == i) {
+ errln(buffer);
} else {
- errorText.append("\\U");
- for (bn=28; bn>=0; bn-=4) {
- errorText.append(hexChars.charAt((c>>bn)&0xf));
- }
+ infoln(buffer);
}
- ci = testText.moveIndex32(ci, 1);
+
+ if (ci >= endContext) { break; }
}
- errorText.append("\\");
- errorText.append("</data>\n");
-
- // Output the error
- char charErrorTxt[500];
- UErrorCode status = U_ZERO_ERROR;
- errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
- charErrorTxt[sizeof(charErrorTxt)-1] = 0;
- const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
-
- errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
- name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
- errorType, seed, i, charErrorTxt);
break;
}
}
assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
- LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone());
+ LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
assertTrue(WHERE, *biEn == *cloneEn);
assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
- LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone());
+ LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
assertTrue(WHERE, *biFr == *cloneFr);
assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));