1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * Copyright (c) 2016, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 ********************************************************************/
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
13 #include "rbbimonkeytest.h"
14 #include "unicode/utypes.h"
15 #include "unicode/brkiter.h"
16 #include "unicode/utf16.h"
17 #include "unicode/uniset.h"
18 #include "unicode/unistr.h"
35 void RBBIMonkeyTest::runIndexedTest(int32_t index
, UBool exec
, const char* &name
, char* params
) {
36 fParams
= params
; // Work around TESTCASE_AUTO not being able to pass params to test function.
39 TESTCASE_AUTO(testMonkey
);
43 //---------------------------------------------------------------------------------------
45 // class BreakRule implementation.
47 //---------------------------------------------------------------------------------------
49 BreakRule::BreakRule() // : all field default initialized.
53 BreakRule::~BreakRule() {}
56 //---------------------------------------------------------------------------------------
58 // class BreakRules implementation.
60 //---------------------------------------------------------------------------------------
61 BreakRules::BreakRules(RBBIMonkeyImpl
*monkeyImpl
, UErrorCode
&status
) :
62 fMonkeyImpl(monkeyImpl
), fBreakRules(status
), fType(UBRK_COUNT
) {
63 fCharClasses
.adoptInstead(uhash_open(uhash_hashUnicodeString
,
64 uhash_compareUnicodeString
,
65 NULL
, // value comparator.
67 if (U_FAILURE(status
)) {
70 uhash_setKeyDeleter(fCharClasses
.getAlias(), uprv_deleteUObject
);
71 uhash_setValueDeleter(fCharClasses
.getAlias(), uprv_deleteUObject
);
72 fBreakRules
.setDeleter(uprv_deleteUObject
);
74 fCharClassList
.adoptInstead(new UVector(status
));
76 fSetRefsMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
77 "(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative look behind for '{' or '=' or '[:'
78 // (the identifier is a unicode property name or value)
79 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name
82 // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
83 fCommentsMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
84 "(^|(?<=;))" // Start either at start of line, or just after a ';' (look-behind for ';')
85 "[ \\t]*+" // Match white space.
86 "(#.*)?+" // Optional # plus whatever follows
87 "\\R$" // new-line at end of line.
90 // Match (initial parse) of a character class definition line.
91 fClassDefMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
92 "[ \\t]*" // leading white space
93 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name
94 "[ \\t]*=[ \\t]*" // =
95 "(?<ClassDef>.*?)" // The char class UnicodeSet expression
96 "[ \\t]*;$"), // ; <end of line>
99 // Match (initial parse) of a break rule line.
100 fRuleDefMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
101 "[ \\t]*" // leading white space
102 "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)" // The rule name
103 "[ \\t]*:[ \\t]*" // :
104 "(?<RuleDef>.*?)" // The rule definition
105 "[ \\t]*;$"), // ; <end of line>
111 BreakRules::~BreakRules() {}
114 CharClass
*BreakRules::addCharClass(const UnicodeString
&name
, const UnicodeString
&definition
, UErrorCode
&status
) {
116 // Create the expanded definition for this char class,
117 // replacing any set references with the corresponding definition.
119 UnicodeString expandedDef
;
120 UnicodeString emptyString
;
121 fSetRefsMatcher
->reset(definition
);
122 while (fSetRefsMatcher
->find() && U_SUCCESS(status
)) {
123 const UnicodeString name
=
124 fSetRefsMatcher
->group(fSetRefsMatcher
->pattern().groupNumberFromName("ClassName", status
), status
);
125 CharClass
*nameClass
= static_cast<CharClass
*>(uhash_get(fCharClasses
.getAlias(), &name
));
126 const UnicodeString
&expansionForName
= nameClass
? nameClass
->fExpandedDef
: name
;
128 fSetRefsMatcher
->appendReplacement(expandedDef
, emptyString
, status
);
129 expandedDef
.append(expansionForName
);
131 fSetRefsMatcher
->appendTail(expandedDef
);
133 // Verify that the expanded set definition is valid.
135 if (fMonkeyImpl
->fDumpExpansions
) {
136 printf("epandedDef: %s\n", CStr(expandedDef
)());
139 UnicodeSet
*s
= new UnicodeSet(expandedDef
, USET_IGNORE_SPACE
, NULL
, status
);
140 if (U_FAILURE(status
)) {
141 IntlTest::gTest
->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__
, __LINE__
,
142 u_errorName(status
), CStr(name
)());
145 CharClass
*cclass
= new CharClass(name
, definition
, expandedDef
, s
);
146 CharClass
*previousClass
= static_cast<CharClass
*>(uhash_put(fCharClasses
.getAlias(),
147 new UnicodeString(name
), // Key, owned by hash table.
148 cclass
, // Value, owned by hash table.
151 if (previousClass
!= NULL
) {
152 // Duplicate class def.
153 // These are legitimate, they are adjustments of an existing class.
154 // TODO: will need to keep the old around when we handle tailorings.
155 IntlTest::gTest
->logln("Redefinition of character class %s\n", CStr(cclass
->fName
)());
156 delete previousClass
;
162 void BreakRules::addRule(const UnicodeString
&name
, const UnicodeString
&definition
, UErrorCode
&status
) {
163 LocalPointer
<BreakRule
> thisRule(new BreakRule
);
164 thisRule
->fName
= name
;
165 thisRule
->fRule
= definition
;
167 // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes,
168 // This gives a numeric sort order that matches Unicode UAX rule numbering conventions.
169 UnicodeString emptyString
;
171 // Expand the char class definitions within the rule.
172 fSetRefsMatcher
->reset(definition
);
173 while (fSetRefsMatcher
->find() && U_SUCCESS(status
)) {
174 const UnicodeString name
=
175 fSetRefsMatcher
->group(fSetRefsMatcher
->pattern().groupNumberFromName("ClassName", status
), status
);
176 CharClass
*nameClass
= static_cast<CharClass
*>(uhash_get(fCharClasses
.getAlias(), &name
));
178 IntlTest::gTest
->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
179 __FILE__
, __LINE__
, CStr(name
)(), CStr(definition
)());
181 const UnicodeString
&expansionForName
= nameClass
? nameClass
->fExpandedDef
: name
;
183 fSetRefsMatcher
->appendReplacement(thisRule
->fExpandedRule
, emptyString
, status
);
184 thisRule
->fExpandedRule
.append(expansionForName
);
186 fSetRefsMatcher
->appendTail(thisRule
->fExpandedRule
);
188 // If rule begins with a '^' rule chaining is disallowed.
189 // Strip off the '^' from the rule expression, and set the flag.
190 if (thisRule
->fExpandedRule
.charAt(0) == u
'^') {
191 thisRule
->fInitialMatchOnly
= true;
192 thisRule
->fExpandedRule
.remove(0, 1);
193 thisRule
->fExpandedRule
.trim();
196 // Replace the divide sign (\u00f7) with a regular expression named capture.
197 // When running the rules, a match that includes this group means we found a break position.
199 int32_t dividePos
= thisRule
->fExpandedRule
.indexOf((UChar
)0x00f7);
200 if (dividePos
>= 0) {
201 thisRule
->fExpandedRule
.replace(dividePos
, 1, UnicodeString("(?<BreakPosition>)"));
203 if (thisRule
->fExpandedRule
.indexOf((UChar
)0x00f7) != -1) {
204 status
= U_ILLEGAL_ARGUMENT_ERROR
; // TODO: produce a good error message.
207 // UAX break rule set definitions can be empty, just [].
208 // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
209 // also matches nothing.
211 static const UChar emptySet
[] = {(UChar
)0x5b, (UChar
)0x5d, 0};
213 while ((where
= thisRule
->fExpandedRule
.indexOf(emptySet
, 2, 0)) >= 0) {
214 thisRule
->fExpandedRule
.replace(where
, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
216 if (fMonkeyImpl
->fDumpExpansions
) {
217 printf("fExpandedRule: %s\n", CStr(thisRule
->fExpandedRule
)());
220 // Compile a regular expression for this rule.
221 thisRule
->fRuleMatcher
.adoptInstead(new RegexMatcher(thisRule
->fExpandedRule
, UREGEX_COMMENTS
| UREGEX_DOTALL
, status
));
222 if (U_FAILURE(status
)) {
223 IntlTest::gTest
->errln("%s:%d Error creating regular expression for %s",
224 __FILE__
, __LINE__
, CStr(thisRule
->fExpandedRule
)());
228 // Put this new rule into the vector of all Rules.
229 fBreakRules
.addElement(thisRule
.orphan(), status
);
233 bool BreakRules::setKeywordParameter(const UnicodeString
&keyword
, const UnicodeString
&value
, UErrorCode
&status
) {
234 if (keyword
== UnicodeString("locale")) {
235 CharString localeName
;
236 localeName
.append(CStr(value
)(), -1, status
);
237 fLocale
= Locale::createFromName(localeName
.data());
240 if (keyword
== UnicodeString("type")) {
241 if (value
== UnicodeString("grapheme")) {
242 fType
= UBRK_CHARACTER
;
243 } else if (value
== UnicodeString("word")) {
245 } else if (value
== UnicodeString("line")) {
247 } else if (value
== UnicodeString("sentence")) {
248 fType
= UBRK_SENTENCE
;
250 IntlTest::gTest
->errln("%s:%d Unrecognized break type %s", __FILE__
, __LINE__
, CStr(value
)());
254 // TODO: add tailoring base setting here.
258 RuleBasedBreakIterator
*BreakRules::createICUBreakIterator(UErrorCode
&status
) {
259 if (U_FAILURE(status
)) {
262 RuleBasedBreakIterator
*bi
= NULL
;
265 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createCharacterInstance(fLocale
, status
));
268 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createWordInstance(fLocale
, status
));
271 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createLineInstance(fLocale
, status
));
274 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createSentenceInstance(fLocale
, status
));
277 IntlTest::gTest
->errln("%s:%d Bad break iterator type of %d", __FILE__
, __LINE__
, fType
);
278 status
= U_ILLEGAL_ARGUMENT_ERROR
;
284 void BreakRules::compileRules(UCHARBUF
*rules
, UErrorCode
&status
) {
285 if (U_FAILURE(status
)) {
289 UnicodeString emptyString
;
290 for (int32_t lineNumber
=0; ;lineNumber
++) { // Loop once per input line.
291 if (U_FAILURE(status
)) {
294 int32_t lineLength
= 0;
295 const UChar
*lineBuf
= ucbuf_readline(rules
, &lineLength
, &status
);
296 if (lineBuf
== NULL
) {
299 UnicodeString
line(lineBuf
, lineLength
);
301 // Strip comment lines.
302 fCommentsMatcher
->reset(line
);
303 line
= fCommentsMatcher
->replaceFirst(emptyString
, status
);
304 if (line
.isEmpty()) {
308 // Recognize character class definition and keyword lines
309 fClassDefMatcher
->reset(line
);
310 if (fClassDefMatcher
->matches(status
)) {
311 UnicodeString className
= fClassDefMatcher
->group(fClassDefMatcher
->pattern().groupNumberFromName("ClassName", status
), status
);
312 UnicodeString classDef
= fClassDefMatcher
->group(fClassDefMatcher
->pattern().groupNumberFromName("ClassDef", status
), status
);
313 if (fMonkeyImpl
->fDumpExpansions
) {
314 printf("scanned class: %s = %s\n", CStr(className
)(), CStr(classDef
)());
316 if (setKeywordParameter(className
, classDef
, status
)) {
317 // The scanned item was "type = ..." or "locale = ...", etc.
318 // which are not actual character classes.
321 addCharClass(className
, classDef
, status
);
325 // Recognize rule lines.
326 fRuleDefMatcher
->reset(line
);
327 if (fRuleDefMatcher
->matches(status
)) {
328 UnicodeString ruleName
= fRuleDefMatcher
->group(fRuleDefMatcher
->pattern().groupNumberFromName("RuleName", status
), status
);
329 UnicodeString ruleDef
= fRuleDefMatcher
->group(fRuleDefMatcher
->pattern().groupNumberFromName("RuleDef", status
), status
);
330 if (fMonkeyImpl
->fDumpExpansions
) {
331 printf("scanned rule: %s : %s\n", CStr(ruleName
)(), CStr(ruleDef
)());
333 addRule(ruleName
, ruleDef
, status
);
337 IntlTest::gTest
->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
338 __FILE__
, __LINE__
, fMonkeyImpl
->fRuleFileName
, CStr(line
)());
341 // Build the vector of char classes, omitting the dictionary class if there is one.
342 // This will be used when constructing the random text to be tested.
344 // Also compute the "other" set, consisting of any characters not included in
345 // one or more of the user defined sets.
347 UnicodeSet
otherSet((UChar32
)0, 0x10ffff);
348 int32_t pos
= UHASH_FIRST
;
349 const UHashElement
*el
= NULL
;
350 while ((el
= uhash_nextElement(fCharClasses
.getAlias(), &pos
)) != NULL
) {
351 const UnicodeString
*ccName
= static_cast<const UnicodeString
*>(el
->key
.pointer
);
352 CharClass
*cclass
= static_cast<CharClass
*>(el
->value
.pointer
);
353 // printf(" Adding %s\n", CStr(*ccName)());
354 if (*ccName
!= cclass
->fName
) {
355 IntlTest::gTest
->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
356 __FILE__
, __LINE__
, CStr(*ccName
)(), CStr(cclass
->fName
)());
358 const UnicodeSet
*set
= cclass
->fSet
.getAlias();
359 otherSet
.removeAll(*set
);
360 if (*ccName
== UnicodeString("dictionary")) {
361 fDictionarySet
= *set
;
363 fCharClassList
->addElement(cclass
, status
);
367 if (!otherSet
.isEmpty()) {
368 // fprintf(stderr, "have an other set.\n");
369 UnicodeString pattern
;
370 CharClass
*cclass
= addCharClass(UnicodeString("__Others"), otherSet
.toPattern(pattern
), status
);
371 fCharClassList
->addElement(cclass
, status
);
376 const CharClass
*BreakRules::getClassForChar(UChar32 c
, int32_t *iter
) const {
377 int32_t localIter
= 0;
378 int32_t &it
= iter
? *iter
: localIter
;
380 while (it
< fCharClassList
->size()) {
381 const CharClass
*cc
= static_cast<const CharClass
*>(fCharClassList
->elementAt(it
));
383 if (cc
->fSet
->contains(c
)) {
390 //---------------------------------------------------------------------------------------
392 // class MonkeyTestData implementation.
394 //---------------------------------------------------------------------------------------
396 void MonkeyTestData::set(BreakRules
*rules
, IntlTest::icu_rand
&rand
, UErrorCode
&status
) {
397 const int32_t dataLength
= 1000;
399 // Fill the test string with random characters.
400 // First randomly pick a char class, then randomly pick a character from that class.
401 // Exclude any characters from the dictionary set.
403 // std::cout << "Populating Test Data" << std::endl;
404 fRandomSeed
= rand
.getSeed(); // Save initial seed for use in error messages,
405 // allowing recreation of failing data.
408 for (int32_t n
=0; n
<dataLength
;) {
409 int charClassIndex
= rand() % rules
->fCharClassList
->size();
410 const CharClass
*cclass
= static_cast<CharClass
*>(rules
->fCharClassList
->elementAt(charClassIndex
));
411 if (cclass
->fSet
->size() == 0) {
412 // Some rules or tailorings do end up with empty char classes.
415 int32_t charIndex
= rand() % cclass
->fSet
->size();
416 UChar32 c
= cclass
->fSet
->charAt(charIndex
);
417 if (U16_IS_TRAIL(c
) && fString
.length() > 0 && U16_IS_LEAD(fString
.charAt(fString
.length()-1))) {
418 // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
419 // Don't let random unpaired surrogates combine in the test data because they might
420 // produce an unwanted dictionary character.
424 if (!rules
->fDictionarySet
.contains(c
)) {
430 // Reset each rule matcher regex with this new string.
431 // (Although we are always using the same string object, ICU regular expressions
432 // don't like the underlying string data changing without doing a reset).
434 for (int32_t ruleNum
=0; ruleNum
<rules
->fBreakRules
.size(); ruleNum
++) {
435 BreakRule
*rule
= static_cast<BreakRule
*>(rules
->fBreakRules
.elementAt(ruleNum
));
436 rule
->fRuleMatcher
->reset(fString
);
439 // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
440 // Expected and Actual breaks are one longer than the input string; a non-zero value
441 // will indicate a boundary preceding that position.
444 fExpectedBreaks
= fActualBreaks
;
445 fRuleForPosition
= fActualBreaks
;
446 f2ndRuleForPos
= fActualBreaks
;
448 // Apply reference rules to find the expected breaks.
450 fExpectedBreaks
.setCharAt(0, (UChar
)1); // Force an expected break before the start of the text.
451 // ICU always reports a break there.
452 // The reference rules do not have a means to do so.
454 bool initialMatch
= true; // True at start of text, and immediately after each boundary,
455 // for control over rule chaining.
456 while (strIdx
< fString
.length()) {
457 BreakRule
*matchingRule
= NULL
;
458 UBool hasBreak
= FALSE
;
460 int32_t matchStart
= 0;
461 int32_t matchEnd
= 0;
462 int32_t breakGroup
= 0;
463 for (ruleNum
=0; ruleNum
<rules
->fBreakRules
.size(); ruleNum
++) {
464 BreakRule
*rule
= static_cast<BreakRule
*>(rules
->fBreakRules
.elementAt(ruleNum
));
465 if (rule
->fInitialMatchOnly
&& !initialMatch
) {
466 // Skip checking this '^' rule. (No rule chaining)
469 rule
->fRuleMatcher
->reset();
470 if (rule
->fRuleMatcher
->lookingAt(strIdx
, status
)) {
471 // A candidate rule match, check further to see if we take it or continue to check other rules.
472 // Matches of zero or one codepoint count only if they also specify a break.
473 matchStart
= rule
->fRuleMatcher
->start(status
);
474 matchEnd
= rule
->fRuleMatcher
->end(status
);
475 breakGroup
= rule
->fRuleMatcher
->pattern().groupNumberFromName("BreakPosition", status
);
476 hasBreak
= U_SUCCESS(status
);
477 if (status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
) {
478 status
= U_ZERO_ERROR
;
480 if (hasBreak
|| fString
.moveIndex32(matchStart
, 1) < matchEnd
) {
486 if (matchingRule
== NULL
) {
487 // No reference rule matched. This is an error in the rules that should never happen.
488 IntlTest::gTest
->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
489 __FILE__
, __LINE__
, strIdx
);
491 status
= U_INVALID_FORMAT_ERROR
;
494 if (matchingRule
->fRuleMatcher
->group(status
).length() == 0) {
495 // Zero length rule match. This is also an error in the rule expressions.
496 IntlTest::gTest
->errln("%s:%d Zero length rule match.",
498 status
= U_INVALID_FORMAT_ERROR
;
502 // Record which rule matched over the length of the match.
503 for (int i
= matchStart
; i
< matchEnd
; i
++) {
504 if (fRuleForPosition
.charAt(i
) == 0) {
505 fRuleForPosition
.setCharAt(i
, (UChar
)ruleNum
);
507 f2ndRuleForPos
.setCharAt(i
, (UChar
)ruleNum
);
511 // Break positions appear in rules as a matching named capture of zero length at the break position,
512 // the adjusted pattern contains (?<BreakPosition>)
514 int32_t breakPos
= matchingRule
->fRuleMatcher
->start(breakGroup
, status
);
515 if (U_FAILURE(status
) || breakPos
< 0) {
516 // Rule specified a break, but that break wasn't part of the match, even
517 // though the rule as a whole matched.
518 // Can't happen with regular expressions derived from (equivalent to) ICU break rules.
519 // Shouldn't get here.
520 IntlTest::gTest
->errln("%s:%d Internal Rule Error.", __FILE__
, __LINE__
);
521 status
= U_INVALID_FORMAT_ERROR
;
524 fExpectedBreaks
.setCharAt(breakPos
, (UChar
)1);
525 // printf("recording break at %d\n", breakPos);
526 // For the next iteration, pick up applying rules immediately after the break,
527 // which may differ from end of the match. The matching rule may have included
528 // context following the boundary that needs to be looked at again.
529 strIdx
= matchingRule
->fRuleMatcher
->end(breakGroup
, status
);
532 // Original rule didn't specify a break.
533 // Continue applying rules starting on the last code point of this match.
534 strIdx
= fString
.moveIndex32(matchEnd
, -1);
535 initialMatch
= false;
536 if (strIdx
== matchStart
) {
537 // Match was only one code point, no progress if we continue.
538 // Shouldn't get here, case is filtered out at top of loop.
540 ruleName
.appendInvariantChars(matchingRule
->fName
, status
);
541 IntlTest::gTest
->errln("%s:%d Rule %s internal error",
542 __FILE__
, __LINE__
, ruleName
.data());
543 status
= U_INVALID_FORMAT_ERROR
;
547 if (U_FAILURE(status
)) {
548 IntlTest::gTest
->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
549 __FILE__
, __LINE__
, u_errorName(status
));
555 void MonkeyTestData::clearActualBreaks() {
556 fActualBreaks
.remove();
557 // Actual Breaks length is one longer than the data string length, allowing
558 // for breaks before the first and after the last character in the data.
559 for (int32_t i
=0; i
<=fString
.length(); i
++) {
560 fActualBreaks
.append((UChar
)0);
564 void MonkeyTestData::dump(int32_t around
) const {
566 " char break Rule Character\n"
567 " pos code class R I name name\n"
568 "---------------------------------------------------------------------------------------------\n");
575 end
= fString
.length();
577 // Display context around a failure.
578 start
= fString
.moveIndex32(around
, -30);
579 end
= fString
.moveIndex32(around
, +30);
582 for (int charIdx
= start
; charIdx
< end
; charIdx
=fString
.moveIndex32(charIdx
, 1)) {
583 UErrorCode status
= U_ZERO_ERROR
;
584 UChar32 c
= fString
.char32At(charIdx
);
585 const CharClass
*cc
= fBkRules
->getClassForChar(c
);
587 ccName
.appendInvariantChars(cc
->fName
, status
);
588 CharString ruleName
, secondRuleName
;
589 const BreakRule
*rule
= static_cast<BreakRule
*>(fBkRules
->fBreakRules
.elementAt(fRuleForPosition
.charAt(charIdx
)));
590 ruleName
.appendInvariantChars(rule
->fName
, status
);
591 if (f2ndRuleForPos
.charAt(charIdx
) > 0) {
592 const BreakRule
*secondRule
= static_cast<BreakRule
*>(fBkRules
->fBreakRules
.elementAt(f2ndRuleForPos
.charAt(charIdx
)));
593 secondRuleName
.appendInvariantChars(secondRule
->fName
, status
);
596 u_charName(c
, U_EXTENDED_CHAR_NAME
, cName
, sizeof(cName
), &status
);
598 printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n",
599 charIdx
, c
, ccName
.data(),
600 fExpectedBreaks
.charAt(charIdx
) ? '*' : '.',
601 fActualBreaks
.charAt(charIdx
) ? '*' : '.',
602 ruleName
.data(), secondRuleName
.data(), cName
608 //---------------------------------------------------------------------------------------
610 // class RBBIMonkeyImpl
612 //---------------------------------------------------------------------------------------
614 RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode
&status
) : fDumpExpansions(FALSE
), fThread(this) {
615 (void)status
; // suppress unused parameter compiler warning.
619 // RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the
620 // reference rules and creating the icu breakiterator to test,
621 // with its type and locale coming from the reference rules.
623 void RBBIMonkeyImpl::setup(const char *ruleFile
, UErrorCode
&status
) {
624 fRuleFileName
= ruleFile
;
625 openBreakRules(ruleFile
, status
);
626 if (U_FAILURE(status
)) {
627 IntlTest::gTest
->errln("%s:%d Error %s opening file %s.", __FILE__
, __LINE__
, u_errorName(status
), ruleFile
);
630 fRuleSet
.adoptInstead(new BreakRules(this, status
));
631 fRuleSet
->compileRules(fRuleCharBuffer
.getAlias(), status
);
632 if (U_FAILURE(status
)) {
633 IntlTest::gTest
->errln("%s:%d Error %s processing file %s.", __FILE__
, __LINE__
, u_errorName(status
), ruleFile
);
636 fBI
.adoptInstead(fRuleSet
->createICUBreakIterator(status
));
637 fTestData
.adoptInstead(new MonkeyTestData());
641 RBBIMonkeyImpl::~RBBIMonkeyImpl() {
645 void RBBIMonkeyImpl::openBreakRules(const char *fileName
, UErrorCode
&status
) {
647 path
.append(IntlTest::getSourceTestData(status
), status
);
648 path
.append("break_rules" U_FILE_SEP_STRING
, status
);
649 path
.appendPathPart(fileName
, status
);
650 const char *codePage
= "UTF-8";
651 fRuleCharBuffer
.adoptInstead(ucbuf_open(path
.data(), &codePage
, TRUE
, FALSE
, &status
));
655 void RBBIMonkeyImpl::startTest() {
656 fThread
.start(); // invokes runTest() in a separate thread.
659 void RBBIMonkeyImpl::join() {
664 #define MONKEY_ERROR(msg, index) { \
665 IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
666 __FILE__, __LINE__, msg, index, fRuleFileName, fTestData->fRandomSeed); \
667 if (fVerbose) { fTestData->dump(index); } \
668 status = U_INVALID_STATE_ERROR; \
671 void RBBIMonkeyImpl::runTest() {
672 UErrorCode status
= U_ZERO_ERROR
;
673 int32_t errorCount
= 0;
674 for (int64_t loopCount
= 0; fLoopCount
< 0 || loopCount
< fLoopCount
; loopCount
++) {
675 status
= U_ZERO_ERROR
;
676 fTestData
->set(fRuleSet
.getAlias(), fRandomGenerator
, status
);
678 IntlTest::gTest
->dataerrln("Unable to run test because fBI is null.");
681 if ( uprv_strcmp(fRuleFileName
,"line_loose_cj.txt") == 0 && fTestData
->fRandomSeed
==1712915859 ) {
682 continue; // known bug around index 103-104, break expected/actual 0/1, fwd 0020 200D | FDFC, rev 1325A 0020 | 200D
684 // fTestData->dump();
685 testForwards(status
);
686 testPrevious(status
);
687 testFollowing(status
);
688 testPreceding(status
);
689 testIsBoundary(status
);
690 testIsBoundaryRandom(status
);
692 if (fLoopCount
< 0 && loopCount
% 100 == 0) {
693 fprintf(stderr
, ".");
695 if (U_FAILURE(status
)) {
696 if (++errorCount
> 10) {
703 void RBBIMonkeyImpl::testForwards(UErrorCode
&status
) {
704 if (U_FAILURE(status
)) {
707 fTestData
->clearActualBreaks();
708 fBI
->setText(fTestData
->fString
);
709 int32_t previousBreak
= -2;
710 for (int32_t bk
=fBI
->first(); bk
!= BreakIterator::DONE
; bk
=fBI
->next()) {
711 if (bk
<= previousBreak
) {
712 MONKEY_ERROR("Break Iterator Stall", bk
);
715 if (bk
< 0 || bk
> fTestData
->fString
.length()) {
716 MONKEY_ERROR("Boundary out of bounds", bk
);
719 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
721 checkResults("testForwards", FORWARD
, status
);
724 void RBBIMonkeyImpl::testFollowing(UErrorCode
&status
) {
725 if (U_FAILURE(status
)) {
728 fTestData
->clearActualBreaks();
729 fBI
->setText(fTestData
->fString
);
730 int32_t nextBreak
= -1;
731 for (int32_t i
=-1 ; i
<fTestData
->fString
.length(); ++i
) {
732 int32_t bk
= fBI
->following(i
);
733 if (bk
== BreakIterator::DONE
&& i
== fTestData
->fString
.length()) {
736 if (bk
== nextBreak
&& bk
> i
) {
737 // i is in the gap between two breaks.
740 if (i
== nextBreak
&& bk
> nextBreak
) {
741 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
745 MONKEY_ERROR("following(i)", i
);
748 checkResults("testFollowing", FORWARD
, status
);
753 void RBBIMonkeyImpl::testPrevious(UErrorCode
&status
) {
754 if (U_FAILURE(status
)) {return;}
756 fTestData
->clearActualBreaks();
757 fBI
->setText(fTestData
->fString
);
758 int32_t previousBreak
= INT32_MAX
;
759 for (int32_t bk
=fBI
->last(); bk
!= BreakIterator::DONE
; bk
=fBI
->previous()) {
760 if (bk
>= previousBreak
) {
761 MONKEY_ERROR("Break Iterator Stall", bk
);
764 if (bk
< 0 || bk
> fTestData
->fString
.length()) {
765 MONKEY_ERROR("Boundary out of bounds", bk
);
768 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
770 checkResults("testPrevious", REVERSE
, status
);
774 void RBBIMonkeyImpl::testPreceding(UErrorCode
&status
) {
775 if (U_FAILURE(status
)) {
778 fTestData
->clearActualBreaks();
779 fBI
->setText(fTestData
->fString
);
780 int32_t nextBreak
= fTestData
->fString
.length()+1;
781 for (int32_t i
=fTestData
->fString
.length()+1 ; i
>=0; --i
) {
782 int32_t bk
= fBI
->preceding(i
);
783 // printf("i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak);
784 if (bk
== BreakIterator::DONE
&& i
== 0) {
787 if (bk
== nextBreak
&& bk
< i
) {
788 // i is in the gap between two breaks.
791 if (i
<fTestData
->fString
.length() && fTestData
->fString
.getChar32Start(i
) < i
) {
792 // i indexes to a trailing surrogate.
793 // Break Iterators treat an index to either half as referring to the supplemental code point,
794 // with preceding going to some preceding code point.
795 if (fBI
->preceding(i
) != fBI
->preceding(fTestData
->fString
.getChar32Start(i
))) {
796 MONKEY_ERROR("preceding of trailing surrogate error", i
);
800 if (i
== nextBreak
&& bk
< nextBreak
) {
801 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
805 MONKEY_ERROR("preceding(i)", i
);
808 checkResults("testPreceding", REVERSE
, status
);
812 void RBBIMonkeyImpl::testIsBoundary(UErrorCode
&status
) {
813 if (U_FAILURE(status
)) {
816 fTestData
->clearActualBreaks();
817 fBI
->setText(fTestData
->fString
);
818 for (int i
=fTestData
->fString
.length(); i
>=0; --i
) {
819 if (fBI
->isBoundary(i
)) {
820 fTestData
->fActualBreaks
.setCharAt(i
, 1);
823 checkResults("testForwards", FORWARD
, status
);
826 void RBBIMonkeyImpl::testIsBoundaryRandom(UErrorCode
&status
) {
827 if (U_FAILURE(status
)) {
830 fBI
->setText(fTestData
->fString
);
832 int stringLen
= fTestData
->fString
.length();
833 for (int i
=stringLen
; i
>=0; --i
) {
834 int strIdx
= fRandomGenerator() % stringLen
;
835 if (fTestData
->fExpectedBreaks
.charAt(strIdx
) != fBI
->isBoundary(strIdx
)) {
836 IntlTest::gTest
->errln("%s:%d testIsBoundaryRandom failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
837 __FILE__
, __LINE__
, strIdx
, fRuleFileName
, fTestData
->fRandomSeed
);
841 status
= U_INVALID_STATE_ERROR
;
849 void RBBIMonkeyImpl::checkResults(const char *msg
, CheckDirection direction
, UErrorCode
&status
) {
850 if (U_FAILURE(status
)) {
853 if (direction
== FORWARD
) {
854 for (int i
=0; i
<=fTestData
->fString
.length(); ++i
) {
855 if (fTestData
->fExpectedBreaks
.charAt(i
) != fTestData
->fActualBreaks
.charAt(i
)) {
857 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
858 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-2), fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
859 fRuleFileName
, fTestData
->fRandomSeed
);
861 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
862 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
863 fRuleFileName
, fTestData
->fRandomSeed
);
868 status
= U_INVALID_STATE_ERROR
; // Prevent the test from continuing, which would likely
869 break; // produce many redundant errors.
873 for (int i
=fTestData
->fString
.length(); i
>=0; i
--) {
874 if (fTestData
->fExpectedBreaks
.charAt(i
) != fTestData
->fActualBreaks
.charAt(i
)) {
876 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
877 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-2), fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
878 fRuleFileName
, fTestData
->fRandomSeed
);
880 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
881 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
882 fRuleFileName
, fTestData
->fRandomSeed
);
887 status
= U_INVALID_STATE_ERROR
;
896 //---------------------------------------------------------------------------------------
898 // class RBBIMonkeyTest implementation.
900 //---------------------------------------------------------------------------------------
901 RBBIMonkeyTest::RBBIMonkeyTest() {
904 RBBIMonkeyTest::~RBBIMonkeyTest() {
908 // params, taken from this->fParams.
909 // rules=file_name Name of file containing the reference rules.
910 // seed=nnnnn Random number starting seed.
911 // Setting the seed allows errors to be reproduced.
912 // loop=nnn Looping count. Controls running time.
914 // 0 or greater: run length.
915 // expansions debug option, show expansions of rules and sets.
916 // verbose Display details of the failure.
918 // Parameters on the intltest command line follow the test name, and are preceded by '@'.
920 // intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
// Test entry point. Parses the options held in fParams ("rules", "loop", "expansions",
// "verbose", "seed"), then creates and sets up one RBBIMonkeyImpl per rule file named in
// tests[] and collects the successfully prepared ones in startedTests.
// NOTE(review): several statements of this function are not visible in this excerpt
// (for example the end of the tests[] initializer, the declarations of `seed` and `i`,
// the bodies of the error branches, and the closing braces).
922 void RBBIMonkeyTest::testMonkey() {
923 // printf("Test parameters: %s\n", fParams);
// Working copy of the raw parameter string; the get*Param() calls below operate on it.
924 UnicodeString
params(fParams
);
925 UErrorCode status
= U_ZERO_ERROR
;
// Default list of rule files to test.
927 const char *tests
[] = {"grapheme.txt", "word.txt", "line.txt", "line_cj.txt", "sentence.txt", "line_normal.txt",
928 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
// A "rules" option replaces the first entry of tests[] with the user-supplied file name.
// testNameFromParams owns the characters that tests[0] then points at.
930 CharString testNameFromParams
;
931 if (getStringParam("rules", params
, testNameFromParams
, status
)) {
932 tests
[0] = testNameFromParams
.data();
// Default iteration count: 100 when `quick` is set, otherwise 5000.
// A "loop" option overrides the default.
936 int64_t loopCount
= quick
? 100 : 5000;
937 getIntParam("loop", params
, loopCount
, status
);
// Boolean options; both default to FALSE when not given.
939 UBool dumpExpansions
= FALSE
;
940 getBoolParam("expansions", params
, dumpExpansions
, status
);
942 UBool verbose
= FALSE
;
943 getBoolParam("verbose", params
, verbose
, status
);
// NOTE(review): the declaration and default value of `seed` are not visible in this excerpt.
946 getIntParam("seed", params
, seed
, status
);
// Anything still left in params at this point is reported as unrecognized.
948 if (params
.length() != 0) {
949 // Options processing did not consume all of the parameters. Something unrecognized was present.
950 CharString unrecognizedParameters
;
951 unrecognizedParameters
.append(CStr(params
)(), -1, status
);
952 errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__
, __LINE__
, unrecognizedParameters
.data());
// Container for the RBBIMonkeyImpl objects that were set up successfully.
956 UVector
startedTests(status
);
957 if (U_FAILURE(status
)) {
958 errln("%s:%d: error %s while setting up test.", __FILE__
, __LINE__
, u_errorName(status
));
962 // Monkey testing is multi-threaded.
963 // Each set of break rules to be tested is run in a separate thread.
964 // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
// Walk tests[] until a NULL entry is reached.
// NOTE(review): the NULL terminator of the initializer is not visible in this excerpt.
966 for (i
=0; tests
[i
] != NULL
; ++i
) {
967 logln("beginning testing of %s", tests
[i
]);
968 LocalPointer
<RBBIMonkeyImpl
> test(new RBBIMonkeyImpl(status
));
969 if (U_FAILURE(status
)) {
970 dataerrln("%s:%d: error %s while starting test %s.", __FILE__
, __LINE__
, u_errorName(status
), tests
[i
]);
// Copy the parsed options into this test instance. The 64-bit seed and loop count
// are narrowed to uint32_t and int32_t respectively by the casts below.
973 test
->fDumpExpansions
= dumpExpansions
;
974 test
->fVerbose
= verbose
;
975 test
->fRandomGenerator
.seed(static_cast<uint32_t>(seed
));
976 test
->fLoopCount
= static_cast<int32_t>(loopCount
);
// Prepare the test for the rule file named by tests[i].
977 test
->setup(tests
[i
], status
);
978 if (U_FAILURE(status
)) {
979 dataerrln("%s:%d: error %s while starting test %s.", __FILE__
, __LINE__
, u_errorName(status
), tests
[i
]);
// test.orphan() hands the raw RBBIMonkeyImpl pointer over to startedTests.
983 startedTests
.addElement(test
.orphan(), status
);
984 if (U_FAILURE(status
)) {
985 errln("%s:%d: error %s while starting test %s.", __FILE__
, __LINE__
, u_errorName(status
), tests
[i
]);
// Revisit every test object that was added to startedTests.
// NOTE(review): what is done with each `test` pointer is not visible in this excerpt.
990 for (i
=0; i
<startedTests
.size(); ++i
) {
991 RBBIMonkeyImpl
*test
= static_cast<RBBIMonkeyImpl
*>(startedTests
.elementAt(i
));
// Extracts an integer-valued option of the form "<name> = <integer>" from params.
//   name    option name; taken by value and extended in place into the match pattern.
//   params  option string that is searched; the matched option text is removed from it.
//   val     receives the parsed integer.
//   status  ICU error code passed to the regex and string operations.
// NOTE(review): "¶ms" in the signature looks like a mis-encoded "&params" — confirm
// against the original file.
// NOTE(review): the test for a successful match, the declaration of `str`, and the
// return statements are not visible in this excerpt.
998 UBool
RBBIMonkeyTest::getIntParam(UnicodeString name
, UnicodeString
¶ms
, int64_t &val
, UErrorCode
&status
) {
// Pattern: name, optional spaces, '=', optional spaces, capture group 1 holding an
// optionally negative run of decimal digits, then optional spaces / one comma / spaces.
999 name
.append(" *= *(-?\\d+) *,? *");
1000 RegexMatcher
m(name
, params
, 0, status
);
1002 // The param exists. Convert the string to an int.
// Capture group 1 (the digits) is converted to a char string for strtol().
1004 str
.append(CStr(m
.group(1, status
))(), -1, status
);
// NOTE(review): strtol() returns long, which may be narrower than the int64_t
// destination on some platforms — confirm whether large values matter here.
1005 val
= strtol(str
.data(), NULL
, 10);
1007 // Delete this parameter from the params string.
// The first match is replaced by an empty string and the result is stored back in params.
1009 params
= m
.replaceFirst(UnicodeString(), status
);
// Extracts a string-valued option of the form "<name> = <value>" from params.
//   name    option name; taken by value and extended in place into the match pattern.
//   params  option string that is searched; the matched option text is removed from it.
//   dest    the option value is appended to this CharString.
//   status  ICU error code passed to the regex and string operations.
// NOTE(review): "¶ms" in the signature looks like a mis-encoded "&params" — confirm
// against the original file.
// NOTE(review): the test for a successful match and the return statements are not
// visible in this excerpt.
1015 UBool
RBBIMonkeyTest::getStringParam(UnicodeString name
, UnicodeString
¶ms
, CharString
&dest
, UErrorCode
&status
) {
// Pattern: name, optional spaces, '=', optional spaces, capture group 1 holding any run
// of characters other than space and comma, then optional spaces / one comma / spaces.
1016 name
.append(" *= *([^ ,]*) *,? *");
1017 RegexMatcher
m(name
, params
, 0, status
);
1019 // The param exists.
// Capture group 1 (the value) is appended to dest; existing content of dest is kept.
1020 dest
.append(CStr(m
.group(1, status
))(), -1, status
);
1022 // Delete this parameter from the params string.
// The first match is replaced by an empty string and the result is stored back in params.
1024 params
= m
.replaceFirst(UnicodeString(), status
);
// Extracts a boolean option from params. The option may appear as a bare "<name>" or as
// "<name> = true" / "<name> = false"; matching is case-insensitive.
//   name    option name; taken by value and extended in place into the match pattern.
//   params  option string that is searched; the matched option text is removed from it.
//   dest    receives the option value.
//   status  ICU error code passed to the regex and string operations.
// NOTE(review): "¶ms" in the signature looks like a mis-encoded "&params" — confirm
// against the original file.
// NOTE(review): the test for a successful match, the branch that handles a bare option,
// and the return statements are not visible in this excerpt.
1030 UBool
RBBIMonkeyTest::getBoolParam(UnicodeString name
, UnicodeString
¶ms
, UBool
&dest
, UErrorCode
&status
) {
// Pattern: name, then an optional non-capturing "= value" part whose capture group 1 is
// "true" or "false", then optional spaces / one comma / spaces.
1031 name
.append("(?: *= *(true|false))? *,? *");
1032 RegexMatcher
m(name
, params
, UREGEX_CASE_INSENSITIVE
, status
);
// A positive start index for group 1 is treated as "an explicit value was supplied".
1034 if (m
.start(1, status
) > 0) {
1035 // user option included a value.
// dest becomes true only when the captured text equals "true" under case folding.
1036 dest
= m
.group(1, status
).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT
) == 0;
1038 // No explicit user value, implies true.
1042 // Delete this parameter from the params string.
// The first match is replaced by an empty string and the result is stored back in params.
1044 params
= m
.replaceFirst(UnicodeString(), status
);
1050 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */