1 /********************************************************************
2 * Copyright (c) 2016, International Business Machines Corporation and
3 * others. All Rights Reserved.
4 ********************************************************************/
7 #include "unicode/utypes.h"
9 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
11 #include "rbbimonkeytest.h"
12 #include "unicode/utypes.h"
13 #include "unicode/brkiter.h"
14 #include "unicode/utf16.h"
15 #include "unicode/uniset.h"
16 #include "unicode/unistr.h"
// Test dispatch entry point. Saves the raw parameter string in fParams and
// hands testMonkey to the TESTCASE_AUTO macro.
31 void RBBIMonkeyTest::runIndexedTest(int32_t index
, UBool exec
, const char* &name
, char* params
) {
32 fParams
= params
; // Work around TESTCASE_AUTO not being able to pass params to test function.
35 TESTCASE_AUTO(testMonkey
);
39 //---------------------------------------------------------------------------------------
41 // class BreakRule implementation.
43 //---------------------------------------------------------------------------------------
// Default constructor for a single break rule; no explicit member initializers are used.
45 BreakRule::BreakRule() // : all field default initialized.
// Destructor; the body is empty.
49 BreakRule::~BreakRule() {}
52 //---------------------------------------------------------------------------------------
54 // class BreakRules implementation.
56 //---------------------------------------------------------------------------------------
// Constructs an empty rule set tied to monkeyImpl. Opens the hash table of
// character classes (keyed by UnicodeString), installs deleters for the
// containers, and creates the regular expression matchers used to parse
// the reference rule file. fType starts out as UBRK_COUNT.
57 BreakRules::BreakRules(RBBIMonkeyImpl
*monkeyImpl
, UErrorCode
&status
) :
58 fMonkeyImpl(monkeyImpl
), fBreakRules(status
), fType(UBRK_COUNT
) {
59 fCharClasses
.adoptInstead(uhash_open(uhash_hashUnicodeString
,
60 uhash_compareUnicodeString
,
61 NULL
, // value comparator.
63 if (U_FAILURE(status
)) {
// Hash keys, hash values and the elements of fBreakRules are all deleted via uprv_deleteUObject.
66 uhash_setKeyDeleter(fCharClasses
.getAlias(), uprv_deleteUObject
);
67 uhash_setValueDeleter(fCharClasses
.getAlias(), uprv_deleteUObject
);
68 fBreakRules
.setDeleter(uprv_deleteUObject
);
70 fCharClassList
.adoptInstead(new UVector(status
));
// Matcher for references to character class names inside set or rule expressions.
// NOTE(review): the pattern below starts with "(?!", which is a negative lookahead,
// while the original comment describes a lookbehind - confirm which is intended.
72 fSetRefsMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
73 "(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative lookbehind for '{' or '=' or '[:'
74 // (the identifier is a unicode property name or value)
75 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name
78 // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
79 fCommentsMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
80 "(^|(?<=;))" // Start either at start of line, or just after a ';' (look-behind for ';')
81 "[ \\t]*+" // Match white space.
82 "(#.*)?+" // Optional # plus whatever follows
83 "\\R$" // new-line at end of line.
86 // Match (initial parse) of a character class definition line.
87 fClassDefMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
88 "[ \\t]*" // leading white space
89 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name
90 "[ \\t]*=[ \\t]*" // =
91 "(?<ClassDef>.*?)" // The char class UnicodeSet expression
92 "[ \\t]*;$"), // ; <end of line>
95 // Match (initial parse) of a break rule line.
96 fRuleDefMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
97 "[ \\t]*" // leading white space
98 "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)" // The rule name
99 "[ \\t]*:[ \\t]*" // :
100 "(?<RuleDef>.*?)" // The rule definition
101 "[ \\t]*;$"), // ; <end of line>
// Destructor; the body is empty.
107 BreakRules::~BreakRules() {}
// Adds (or redefines) a named character class.
//   name        the class name, used as the hash table key.
//   definition  the set expression as written in the rule file; it may refer to
//               previously defined classes by name.
//   status      ICU error code; set if building the UnicodeSet fails.
// The definition is expanded by substituting each known class name with that
// class's own expanded definition; unknown names are copied through unchanged.
// The expanded text is then used to build a UnicodeSet, and a new CharClass is
// stored in fCharClasses under a heap-allocated copy of the name.
110 CharClass
*BreakRules::addCharClass(const UnicodeString
&name
, const UnicodeString
&definition
, UErrorCode
&status
) {
112 // Create the expanded definition for this char class,
113 // replacing any set references with the corresponding definition.
115 UnicodeString expandedDef
;
116 UnicodeString emptyString
;
117 fSetRefsMatcher
->reset(definition
);
118 while (fSetRefsMatcher
->find() && U_SUCCESS(status
)) {
// Note: this local `name` (the referenced class) shadows the function parameter `name`.
119 const UnicodeString name
=
120 fSetRefsMatcher
->group(fSetRefsMatcher
->pattern().groupNumberFromName("ClassName", status
), status
);
121 CharClass
*nameClass
= static_cast<CharClass
*>(uhash_get(fCharClasses
.getAlias(), &name
));
122 const UnicodeString
&expansionForName
= nameClass
? nameClass
->fExpandedDef
: name
;
// Copy the text preceding the match (with an empty replacement), then append the expansion literally.
124 fSetRefsMatcher
->appendReplacement(expandedDef
, emptyString
, status
);
125 expandedDef
.append(expansionForName
);
127 fSetRefsMatcher
->appendTail(expandedDef
);
129 // Verify that the expanded set definition is valid.
131 if (fMonkeyImpl
->fDumpExpansions
) {
132 printf("expandedDef: %s\n", CStr(expandedDef
)());
// NOTE(review): `s` is a raw pointer at this point; confirm it is released on the failure path below.
135 UnicodeSet
*s
= new UnicodeSet(expandedDef
, USET_IGNORE_SPACE
, NULL
, status
);
136 if (U_FAILURE(status
)) {
137 IntlTest::gTest
->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__
, __LINE__
,
138 u_errorName(status
), CStr(name
)());
141 CharClass
*cclass
= new CharClass(name
, definition
, expandedDef
, s
);
142 CharClass
*previousClass
= static_cast<CharClass
*>(uhash_put(fCharClasses
.getAlias(),
143 new UnicodeString(name
), // Key, owned by hash table.
144 cclass
, // Value, owned by hash table.
147 if (previousClass
!= NULL
) {
148 // Duplicate class def.
149 // These are legitimate, they are adjustments of an existing class.
150 // TODO: will need to keep the old around when we handle tailorings.
151 IntlTest::gTest
->logln("Redefinition of character class %s\n", CStr(cclass
->fName
)());
152 delete previousClass
;
// Adds one break rule to fBreakRules.
//   name        the rule name from the rule file.
//   definition  the rule body as written; it may refer to character classes by name.
//   status      ICU error code; set to U_ILLEGAL_ARGUMENT_ERROR if the rule has more
//               than one divide sign, or by the regex compilation if that fails.
// Class references are expanded to their set definitions, the divide sign (U+00F7)
// becomes an empty named capture group "BreakPosition", empty sets "[]" are
// rewritten to a form the regex engine accepts, and the result is compiled into
// the rule's RegexMatcher.
158 void BreakRules::addRule(const UnicodeString
&name
, const UnicodeString
&definition
, UErrorCode
&status
) {
159 LocalPointer
<BreakRule
> thisRule(new BreakRule
);
160 thisRule
->fName
= name
;
161 thisRule
->fRule
= definition
;
163 // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes,
164 // This gives a numeric sort order that matches Unicode UAX rule numbering conventions.
165 UnicodeString emptyString
;
167 // Expand the char class definitions within the rule.
168 fSetRefsMatcher
->reset(definition
);
169 while (fSetRefsMatcher
->find() && U_SUCCESS(status
)) {
// Note: this local `name` (the referenced class) shadows the function parameter `name`.
170 const UnicodeString name
=
171 fSetRefsMatcher
->group(fSetRefsMatcher
->pattern().groupNumberFromName("ClassName", status
), status
);
172 CharClass
*nameClass
= static_cast<CharClass
*>(uhash_get(fCharClasses
.getAlias(), &name
));
174 IntlTest::gTest
->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
175 __FILE__
, __LINE__
, CStr(name
)(), CStr(definition
)());
177 const UnicodeString
&expansionForName
= nameClass
? nameClass
->fExpandedDef
: name
;
// Copy the text preceding the match (with an empty replacement), then append the expansion literally.
179 fSetRefsMatcher
->appendReplacement(thisRule
->fExpandedRule
, emptyString
, status
);
180 thisRule
->fExpandedRule
.append(expansionForName
);
182 fSetRefsMatcher
->appendTail(thisRule
->fExpandedRule
);
184 // Replace the divide sign (\u00f7) with a regular expression named capture.
185 // When running the rules, a match that includes this group means we found a break position.
187 int32_t dividePos
= thisRule
->fExpandedRule
.indexOf((UChar
)0x00f7);
188 if (dividePos
>= 0) {
189 thisRule
->fExpandedRule
.replace(dividePos
, 1, UnicodeString("(?<BreakPosition>)"));
// Any divide sign still present after the replacement above is treated as an error.
191 if (thisRule
->fExpandedRule
.indexOf((UChar
)0x00f7) != -1) {
192 status
= U_ILLEGAL_ARGUMENT_ERROR
; // TODO: produce a good error message.
195 // UAX break rule set definitions can be empty, just [].
196 // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
197 // also matches nothing.
199 static const UChar emptySet
[] = {(UChar
)0x5b, (UChar
)0x5d, 0};
201 while ((where
= thisRule
->fExpandedRule
.indexOf(emptySet
, 2, 0)) >= 0) {
202 thisRule
->fExpandedRule
.replace(where
, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
204 if (fMonkeyImpl
->fDumpExpansions
) {
205 printf("fExpandedRule: %s\n", CStr(thisRule
->fExpandedRule
)());
208 // Compile a regular expression for this rule.
209 thisRule
->fRuleMatcher
.adoptInstead(new RegexMatcher(thisRule
->fExpandedRule
, UREGEX_COMMENTS
| UREGEX_DOTALL
, status
));
210 if (U_FAILURE(status
)) {
211 IntlTest::gTest
->errln("%s:%d Error creating regular expression for %s",
212 __FILE__
, __LINE__
, CStr(thisRule
->fExpandedRule
)());
216 // Put this new rule into the vector of all Rules.
// orphan() releases the rule from the LocalPointer as it is handed to the vector.
217 fBreakRules
.addElement(thisRule
.orphan(), status
);
// Handles "keyword = value" lines that are settings rather than character classes.
//   keyword  the left-hand side of the line; "locale" and "type" are recognized here.
//   value    the right-hand side. For "type" the values compared against are
//            "grapheme", "word", "line" and "sentence"; anything else is reported
//            through errln as an unrecognized break type.
//   status   ICU error code, passed to CharString::append for the locale name.
221 bool BreakRules::setKeywordParameter(const UnicodeString
&keyword
, const UnicodeString
&value
, UErrorCode
&status
) {
222 if (keyword
== UnicodeString("locale")) {
// Convert the value to a char string and create fLocale from it.
223 CharString localeName
;
224 localeName
.append(CStr(value
)(), -1, status
);
225 fLocale
= Locale::createFromName(localeName
.data());
228 if (keyword
== UnicodeString("type")) {
229 if (value
== UnicodeString("grapheme")) {
230 fType
= UBRK_CHARACTER
;
231 } else if (value
== UnicodeString("word")) {
233 } else if (value
== UnicodeString("line")) {
235 } else if (value
== UnicodeString("sentence")) {
236 fType
= UBRK_SENTENCE
;
238 IntlTest::gTest
->errln("%s:%d Unrecognized break type %s", __FILE__
, __LINE__
, CStr(value
)());
242 // TODO: add tailoring base setting here.
// Creates the ICU break iterator to be tested, using fLocale and one of the
// BreakIterator factory functions (character, word, line or sentence).
// Each factory result is dynamic_cast to RuleBasedBreakIterator.
// A bad fType is reported through errln and sets status to U_ILLEGAL_ARGUMENT_ERROR.
246 RuleBasedBreakIterator
*BreakRules::createICUBreakIterator(UErrorCode
&status
) {
247 if (U_FAILURE(status
)) {
250 RuleBasedBreakIterator
*bi
= NULL
;
253 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createCharacterInstance(fLocale
, status
));
256 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createWordInstance(fLocale
, status
));
259 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createLineInstance(fLocale
, status
));
262 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createSentenceInstance(fLocale
, status
));
265 IntlTest::gTest
->errln("%s:%d Bad break iterator type of %d", __FILE__
, __LINE__
, fType
);
266 status
= U_ILLEGAL_ARGUMENT_ERROR
;
// Reads the reference rule file line by line and builds the rule set.
//   rules   open UCHARBUF positioned on the rule file.
//   status  ICU error code, checked on entry and at the top of every line iteration.
// Each line has comments stripped, then is tried as a "name = value" line
// (keyword setting or character class definition) and as a "name : rule" line.
// Lines matching neither are reported with errln. After all lines are read, the
// list of character classes is built and an "__Others" class is added for any
// code points not covered by a defined class.
272 void BreakRules::compileRules(UCHARBUF
*rules
, UErrorCode
&status
) {
273 if (U_FAILURE(status
)) {
277 UnicodeString emptyString
;
278 for (int32_t lineNumber
=0; ;lineNumber
++) { // Loop once per input line.
279 if (U_FAILURE(status
)) {
282 int32_t lineLength
= 0;
283 const UChar
*lineBuf
= ucbuf_readline(rules
, &lineLength
, &status
);
284 if (lineBuf
== NULL
) {
287 UnicodeString
line(lineBuf
, lineLength
);
289 // Strip comment lines.
290 fCommentsMatcher
->reset(line
);
291 line
= fCommentsMatcher
->replaceFirst(emptyString
, status
);
292 if (line
.isEmpty()) {
296 // Recognize character class definition and keyword lines
297 fClassDefMatcher
->reset(line
);
298 if (fClassDefMatcher
->matches(status
)) {
299 UnicodeString className
= fClassDefMatcher
->group(fClassDefMatcher
->pattern().groupNumberFromName("ClassName", status
), status
);
300 UnicodeString classDef
= fClassDefMatcher
->group(fClassDefMatcher
->pattern().groupNumberFromName("ClassDef", status
), status
);
301 if (fMonkeyImpl
->fDumpExpansions
) {
302 printf("scanned class: %s = %s\n", CStr(className
)(), CStr(classDef
)());
304 if (setKeywordParameter(className
, classDef
, status
)) {
305 // The scanned item was "type = ..." or "locale = ...", etc.
306 // which are not actual character classes.
309 addCharClass(className
, classDef
, status
);
313 // Recognize rule lines.
314 fRuleDefMatcher
->reset(line
);
315 if (fRuleDefMatcher
->matches(status
)) {
316 UnicodeString ruleName
= fRuleDefMatcher
->group(fRuleDefMatcher
->pattern().groupNumberFromName("RuleName", status
), status
);
317 UnicodeString ruleDef
= fRuleDefMatcher
->group(fRuleDefMatcher
->pattern().groupNumberFromName("RuleDef", status
), status
);
318 if (fMonkeyImpl
->fDumpExpansions
) {
319 printf("scanned rule: %s : %s\n", CStr(ruleName
)(), CStr(ruleDef
)());
321 addRule(ruleName
, ruleDef
, status
);
325 IntlTest::gTest
->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
326 __FILE__
, __LINE__
, fMonkeyImpl
->fRuleFileName
, CStr(line
)());
329 // Build the vector of char classes, omitting the dictionary class if there is one.
330 // This will be used when constructing the random text to be tested.
332 // Also compute the "other" set, consisting of any characters not included in
333 // one or more of the user defined sets.
// otherSet starts as the full code point range; each defined class is removed from it below.
335 UnicodeSet
otherSet((UChar32
)0, 0x10ffff);
336 int32_t pos
= UHASH_FIRST
;
337 const UHashElement
*el
= NULL
;
338 while ((el
= uhash_nextElement(fCharClasses
.getAlias(), &pos
)) != NULL
) {
339 const UnicodeString
*ccName
= static_cast<const UnicodeString
*>(el
->key
.pointer
);
340 CharClass
*cclass
= static_cast<CharClass
*>(el
->value
.pointer
);
341 // printf(" Adding %s\n", CStr(*ccName)());
// Sanity check: the hash key should equal the name stored in the class itself.
342 if (*ccName
!= cclass
->fName
) {
343 IntlTest::gTest
->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
344 __FILE__
, __LINE__
, CStr(*ccName
)(), CStr(cclass
->fName
)());
346 const UnicodeSet
*set
= cclass
->fSet
.getAlias();
347 otherSet
.removeAll(*set
);
// The class named "dictionary" is copied into fDictionarySet.
348 if (*ccName
== UnicodeString("dictionary")) {
349 fDictionarySet
= *set
;
351 fCharClassList
->addElement(cclass
, status
);
355 if (!otherSet
.isEmpty()) {
356 // fprintf(stderr, "have an other set.\n");
357 UnicodeString pattern
;
358 CharClass
*cclass
= addCharClass(UnicodeString("__Others"), otherSet
.toPattern(pattern
), status
);
359 fCharClassList
->addElement(cclass
, status
);
// Searches fCharClassList for a character class whose set contains c.
//   c     the code point to look up.
//   iter  optional in/out index into fCharClassList; when NULL a local index
//         starting at 0 is used instead.
364 const CharClass
*BreakRules::getClassForChar(UChar32 c
, int32_t *iter
) const {
365 int32_t localIter
= 0;
// `it` aliases the caller's index when one is supplied, otherwise the local one.
366 int32_t &it
= iter
? *iter
: localIter
;
368 while (it
< fCharClassList
->size()) {
369 const CharClass
*cc
= static_cast<const CharClass
*>(fCharClassList
->elementAt(it
));
371 if (cc
->fSet
->contains(c
)) {
378 //---------------------------------------------------------------------------------------
380 // class MonkeyTestData implementation.
382 //---------------------------------------------------------------------------------------
// Generates a fresh random test string and computes the expected break positions
// for it by applying the reference rules.
//   rules   the compiled reference rules (character classes, break rules, dictionary set).
//   rand    random number generator; its seed at entry is saved in fRandomSeed.
//   status  ICU error code; set to U_INVALID_FORMAT_ERROR when the reference rules
//           misbehave (no rule matches, a zero length match, or an inconsistent
//           break position).
384 void MonkeyTestData::set(BreakRules
*rules
, IntlTest::icu_rand
&rand
, UErrorCode
&status
) {
385 const int32_t dataLength
= 1000;
387 // Fill the test string with random characters.
388 // First randomly pick a char class, then randomly pick a character from that class.
389 // Exclude any characters from the dictionary set.
391 // std::cout << "Populating Test Data" << std::endl;
392 fRandomSeed
= rand
.getSeed(); // Save initial seed for use in error messages,
393 // allowing recreation of failing data.
396 for (int32_t n
=0; n
<dataLength
;) {
397 int charClassIndex
= rand() % rules
->fCharClassList
->size();
398 const CharClass
*cclass
= static_cast<CharClass
*>(rules
->fCharClassList
->elementAt(charClassIndex
));
399 if (cclass
->fSet
->size() == 0) {
400 // Some rules or tailorings do end up with empty char classes.
403 int32_t charIndex
= rand() % cclass
->fSet
->size();
404 UChar32 c
= cclass
->fSet
->charAt(charIndex
);
405 if (U16_IS_TRAIL(c
) && fString
.length() > 0 && U16_IS_LEAD(fString
.charAt(fString
.length()-1))) {
406 // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
407 // Don't let random unpaired surrogates combine in the test data because they might
408 // produce an unwanted dictionary character.
412 if (!rules
->fDictionarySet
.contains(c
)) {
418 // Reset each rule matcher regex with this new string.
419 // (Although we are always using the same string object, ICU regular expressions
420 // don't like the underlying string data changing without doing a reset).
422 for (int32_t ruleNum
=0; ruleNum
<rules
->fBreakRules
.size(); ruleNum
++) {
423 BreakRule
*rule
= static_cast<BreakRule
*>(rules
->fBreakRules
.elementAt(ruleNum
));
424 rule
->fRuleMatcher
->reset(fString
);
427 // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
428 // Expected and Actual breaks are one longer than the input string; a non-zero value
429 // will indicate a boundary preceding that position.
432 fExpectedBreaks
= fActualBreaks
;
433 fRuleForPosition
= fActualBreaks
;
434 f2ndRuleForPos
= fActualBreaks
;
436 // Apply reference rules to find the expected breaks.
438 fExpectedBreaks
.setCharAt(0, (UChar
)1); // Force an expected break before the start of the text.
439 // ICU always reports a break there.
440 // The reference rules do not have a means to do so.
442 while (strIdx
< fString
.length()) {
443 BreakRule
*matchingRule
= NULL
;
444 UBool hasBreak
= FALSE
;
446 int32_t matchStart
= 0;
447 int32_t matchEnd
= 0;
448 int32_t breakGroup
= 0;
// Try the rules in order at the current position.
449 for (ruleNum
=0; ruleNum
<rules
->fBreakRules
.size(); ruleNum
++) {
450 BreakRule
*rule
= static_cast<BreakRule
*>(rules
->fBreakRules
.elementAt(ruleNum
));
451 rule
->fRuleMatcher
->reset();
452 if (rule
->fRuleMatcher
->lookingAt(strIdx
, status
)) {
453 // A candidate rule match, check further to see if we take it or continue to check other rules.
454 // Matches of zero or one codepoint count only if they also specify a break.
455 matchStart
= rule
->fRuleMatcher
->start(status
);
456 matchEnd
= rule
->fRuleMatcher
->end(status
);
// A rule specifies a break exactly when its pattern has a "BreakPosition" group;
// the invalid-group-name error from the lookup is expected for rules without one and is cleared.
457 breakGroup
= rule
->fRuleMatcher
->pattern().groupNumberFromName("BreakPosition", status
);
458 hasBreak
= U_SUCCESS(status
);
459 if (status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
) {
460 status
= U_ZERO_ERROR
;
462 if (hasBreak
|| fString
.moveIndex32(matchStart
, 1) < matchEnd
) {
468 if (matchingRule
== NULL
) {
469 // No reference rule matched. This is an error in the rules that should never happen.
470 IntlTest::gTest
->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
471 __FILE__
, __LINE__
, strIdx
);
473 status
= U_INVALID_FORMAT_ERROR
;
476 if (matchingRule
->fRuleMatcher
->group(status
).length() == 0) {
477 // Zero length rule match. This is also an error in the rule expressions.
478 IntlTest::gTest
->errln("%s:%d Zero length rule match.",
480 status
= U_INVALID_FORMAT_ERROR
;
484 // Record which rule matched over the length of the match.
485 for (int i
= matchStart
; i
< matchEnd
; i
++) {
486 if (fRuleForPosition
.charAt(i
) == 0) {
487 fRuleForPosition
.setCharAt(i
, (UChar
)ruleNum
);
489 f2ndRuleForPos
.setCharAt(i
, (UChar
)ruleNum
);
493 // Break positions appear in rules as a matching named capture of zero length at the break position,
494 // the adjusted pattern contains (?<BreakPosition>)
496 int32_t breakPos
= matchingRule
->fRuleMatcher
->start(breakGroup
, status
);
497 if (U_FAILURE(status
) || breakPos
< 0) {
498 // Rule specified a break, but that break wasn't part of the match, even
499 // though the rule as a whole matched.
500 // Can't happen with regular expressions derived from (equivalent to) ICU break rules.
501 // Shouldn't get here.
502 IntlTest::gTest
->errln("%s:%d Internal Rule Error.", __FILE__
, __LINE__
);
503 status
= U_INVALID_FORMAT_ERROR
;
506 fExpectedBreaks
.setCharAt(breakPos
, (UChar
)1);
507 // printf("recording break at %d\n", breakPos);
508 // For the next iteration, pick up applying rules immediately after the break,
509 // which may differ from end of the match. The matching rule may have included
510 // context following the boundary that needs to be looked at again.
511 strIdx
= matchingRule
->fRuleMatcher
->end(breakGroup
, status
);
513 // Original rule didn't specify a break.
514 // Continue applying rules starting on the last code point of this match.
515 strIdx
= fString
.moveIndex32(matchEnd
, -1);
516 if (strIdx
== matchStart
) {
517 // Match was only one code point, no progress if we continue.
518 // Shouldn't get here, case is filtered out at top of loop.
520 ruleName
.appendInvariantChars(matchingRule
->fName
, status
);
521 IntlTest::gTest
->errln("%s:%d Rule %s internal error",
522 __FILE__
, __LINE__
, ruleName
.data());
523 status
= U_INVALID_FORMAT_ERROR
;
527 if (U_FAILURE(status
)) {
528 IntlTest::gTest
->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
529 __FILE__
, __LINE__
, u_errorName(status
));
// Resets fActualBreaks to fString.length() + 1 zero entries, i.e. no boundaries recorded.
535 void MonkeyTestData::clearActualBreaks() {
536 fActualBreaks
.remove();
537 // Actual Breaks length is one longer than the data string length, allowing
538 // for breaks before the first and after the last character in the data.
539 for (int32_t i
=0; i
<=fString
.length(); i
++) {
540 fActualBreaks
.append((UChar
)0);
// Prints a table of the test data, one row per code point: position, code point,
// character class name, expected ('R' column) and actual ('I' column) break marks,
// the names of the rules recorded for the position, and the character name.
//   around  index of a failure; when displaying context, rows cover 30 code
//           points on either side of it.
544 void MonkeyTestData::dump(int32_t around
) const {
546 " char break Rule Character\n"
547 " pos code class R I name name\n"
548 "---------------------------------------------------------------------------------------------\n");
555 end
= fString
.length();
557 // Display context around a failure.
558 start
= fString
.moveIndex32(around
, -30);
559 end
= fString
.moveIndex32(around
, +30);
// Step through the string one code point at a time.
562 for (int charIdx
= start
; charIdx
< end
; charIdx
=fString
.moveIndex32(charIdx
, 1)) {
563 UErrorCode status
= U_ZERO_ERROR
;
564 UChar32 c
= fString
.char32At(charIdx
);
565 const CharClass
*cc
= fBkRules
->getClassForChar(c
);
567 ccName
.appendInvariantChars(cc
->fName
, status
);
568 CharString ruleName
, secondRuleName
;
// fRuleForPosition and f2ndRuleForPos hold rule indexes into fBreakRules, stored as UChars.
569 const BreakRule
*rule
= static_cast<BreakRule
*>(fBkRules
->fBreakRules
.elementAt(fRuleForPosition
.charAt(charIdx
)));
570 ruleName
.appendInvariantChars(rule
->fName
, status
);
571 if (f2ndRuleForPos
.charAt(charIdx
) > 0) {
572 const BreakRule
*secondRule
= static_cast<BreakRule
*>(fBkRules
->fBreakRules
.elementAt(f2ndRuleForPos
.charAt(charIdx
)));
573 secondRuleName
.appendInvariantChars(secondRule
->fName
, status
);
576 u_charName(c
, U_EXTENDED_CHAR_NAME
, cName
, sizeof(cName
), &status
);
578 printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n",
579 charIdx
, c
, ccName
.data(),
580 fExpectedBreaks
.charAt(charIdx
) ? '*' : '.',
581 fActualBreaks
.charAt(charIdx
) ? '*' : '.',
582 ruleName
.data(), secondRuleName
.data(), cName
588 //---------------------------------------------------------------------------------------
590 // class RBBIMonkeyImpl
592 //---------------------------------------------------------------------------------------
// Constructor. Turns expansion dumping off and passes `this` to the fThread member.
// The status parameter is not used.
594 RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode
&status
) : fDumpExpansions(FALSE
), fThread(this) {
595 (void)status
; // suppress unused parameter compiler warning.
599 // RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the
600 // reference rules and creating the icu breakiterator to test,
601 // with its type and locale coming from the reference rules.
// Prepares everything needed to test one rule file.
//   ruleFile  name of the reference rule file; the pointer itself is kept in fRuleFileName.
//   status    ICU error code; failures while opening or processing the file are
//             reported through errln.
// Steps: open the file, compile it into fRuleSet, create the ICU break iterator
// from the rule set, and allocate the test data object.
603 void RBBIMonkeyImpl::setup(const char *ruleFile
, UErrorCode
&status
) {
604 fRuleFileName
= ruleFile
;
605 openBreakRules(ruleFile
, status
);
606 if (U_FAILURE(status
)) {
607 IntlTest::gTest
->errln("%s:%d Error %s opening file %s.", __FILE__
, __LINE__
, u_errorName(status
), ruleFile
);
610 fRuleSet
.adoptInstead(new BreakRules(this, status
));
611 fRuleSet
->compileRules(fRuleCharBuffer
.getAlias(), status
);
612 if (U_FAILURE(status
)) {
613 IntlTest::gTest
->errln("%s:%d Error %s processing file %s.", __FILE__
, __LINE__
, u_errorName(status
), ruleFile
);
616 fBI
.adoptInstead(fRuleSet
->createICUBreakIterator(status
));
617 fTestData
.adoptInstead(new MonkeyTestData());
// Destructor. No statements of the body are visible in this listing.
621 RBBIMonkeyImpl::~RBBIMonkeyImpl() {
// Opens a reference rule file as a UCHARBUF stored in fRuleCharBuffer.
//   fileName  file name, appended to <source test data dir>/break_rules/.
//   status    ICU error code, set by the path building or by ucbuf_open on failure.
// The file is opened with the code page name "UTF-8".
625 void RBBIMonkeyImpl::openBreakRules(const char *fileName
, UErrorCode
&status
) {
627 path
.append(IntlTest::getSourceTestData(status
), status
);
628 path
.append("break_rules" U_FILE_SEP_STRING
, status
);
629 path
.appendPathPart(fileName
, status
);
630 const char *codePage
= "UTF-8";
631 fRuleCharBuffer
.adoptInstead(ucbuf_open(path
.data(), &codePage
, TRUE
, FALSE
, &status
));
// Starts the worker thread for this rule set.
635 void RBBIMonkeyImpl::startTest() {
636 fThread
.start(); // invokes runTest() in a separate thread.
// NOTE(review): presumably waits for the thread started by startTest(); the body
// is not visible in this listing - confirm.
639 void RBBIMonkeyImpl::join() {
// Reports a test failure at `index` together with the rule file name and random seed
// needed to reproduce it, dumps the test data around the index when fVerbose is set,
// and sets the local `status` to U_INVALID_STATE_ERROR. Relies on fRuleFileName,
// fTestData, fVerbose and `status` being in scope where the macro is used.
644 #define MONKEY_ERROR(msg, index) { \
645 IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
646 __FILE__, __LINE__, msg, index, fRuleFileName, fTestData->fRandomSeed); \
647 if (fVerbose) { fTestData->dump(index); } \
648 status = U_INVALID_STATE_ERROR; \
// Main test loop for one rule set. Each iteration generates new random test data
// and runs the five break iterator checks against it. The loop runs fLoopCount
// times, or without a count limit when fLoopCount is negative. Failing iterations
// are counted in errorCount and compared against a limit of 10.
651 void RBBIMonkeyImpl::runTest() {
652 UErrorCode status
= U_ZERO_ERROR
;
653 int32_t errorCount
= 0;
654 for (int64_t loopCount
= 0; fLoopCount
< 0 || loopCount
< fLoopCount
; loopCount
++) {
655 status
= U_ZERO_ERROR
;
656 fTestData
->set(fRuleSet
.getAlias(), fRandomGenerator
, status
);
658 IntlTest::gTest
->dataerrln("Unable to run test because fBI is null.");
// Skip one specific rule file / seed combination that is a known failure.
661 if ( uprv_strcmp(fRuleFileName
,"line_loose_cj.txt") == 0 && fTestData
->fRandomSeed
==1712915859 ) {
662 continue; // known bug around index 103-104, break expected/actual 0/1, fwd 0020 200D | FDFC, rev 1325A 0020 | 200D
664 // fTestData->dump();
665 testForwards(status
);
666 testPrevious(status
);
667 testFollowing(status
);
668 testPreceding(status
);
669 testIsBoundary(status
);
// When running without a loop limit, print a progress dot every 100 iterations.
671 if (fLoopCount
< 0 && loopCount
% 100 == 0) {
672 fprintf(stderr
, ".");
674 if (U_FAILURE(status
)) {
675 if (++errorCount
> 10) {
// Walks the break iterator forwards with first()/next(), records each boundary in
// fActualBreaks, and compares against the expected breaks.
// Reports an error if a boundary does not advance past the previous one or lies
// outside [0, string length].
682 void RBBIMonkeyImpl::testForwards(UErrorCode
&status
) {
683 if (U_FAILURE(status
)) {
686 fTestData
->clearActualBreaks();
687 fBI
->setText(fTestData
->fString
);
688 int32_t previousBreak
= -2;
689 for (int32_t bk
=fBI
->first(); bk
!= BreakIterator::DONE
; bk
=fBI
->next()) {
690 if (bk
<= previousBreak
) {
691 MONKEY_ERROR("Break Iterator Stall", bk
);
694 if (bk
< 0 || bk
> fTestData
->fString
.length()) {
695 MONKEY_ERROR("Boundary out of bounds", bk
);
698 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
700 checkResults("testForwards", FORWARD
, status
);
// Calls following(i) for every index i starting at -1, records the boundaries it
// reports in fActualBreaks, and compares against the expected breaks.
// nextBreak tracks the boundary most recently reported; a result that fits none
// of the accepted cases below is reported as a "following(i)" error.
703 void RBBIMonkeyImpl::testFollowing(UErrorCode
&status
) {
704 if (U_FAILURE(status
)) {
707 fTestData
->clearActualBreaks();
708 fBI
->setText(fTestData
->fString
);
709 int32_t nextBreak
= -1;
710 for (int32_t i
=-1 ; i
<fTestData
->fString
.length(); ++i
) {
711 int32_t bk
= fBI
->following(i
);
712 if (bk
== BreakIterator::DONE
&& i
== fTestData
->fString
.length()) {
715 if (bk
== nextBreak
&& bk
> i
) {
716 // i is in the gap between two breaks.
// i has reached the previously reported boundary and a new, later one was returned.
719 if (i
== nextBreak
&& bk
> nextBreak
) {
720 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
724 MONKEY_ERROR("following(i)", i
);
727 checkResults("testFollowing", FORWARD
, status
);
// Walks the break iterator backwards with last()/previous(), records each boundary
// in fActualBreaks, and compares against the expected breaks in reverse order.
// Reports an error if a boundary does not move before the previous one or lies
// outside [0, string length].
732 void RBBIMonkeyImpl::testPrevious(UErrorCode
&status
) {
733 if (U_FAILURE(status
)) {return;}
735 fTestData
->clearActualBreaks();
736 fBI
->setText(fTestData
->fString
);
737 int32_t previousBreak
= INT32_MAX
;
738 for (int32_t bk
=fBI
->last(); bk
!= BreakIterator::DONE
; bk
=fBI
->previous()) {
739 if (bk
>= previousBreak
) {
740 MONKEY_ERROR("Break Iterator Stall", bk
);
743 if (bk
< 0 || bk
> fTestData
->fString
.length()) {
744 MONKEY_ERROR("Boundary out of bounds", bk
);
747 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
749 checkResults("testPrevious", REVERSE
, status
);
// Calls preceding(i) for every index i from string length + 1 down to 0, records
// the boundaries it reports in fActualBreaks, and compares against the expected
// breaks in reverse order. nextBreak tracks the boundary most recently reported;
// a result that fits none of the accepted cases is reported as a "preceding(i)" error.
753 void RBBIMonkeyImpl::testPreceding(UErrorCode
&status
) {
754 if (U_FAILURE(status
)) {
757 fTestData
->clearActualBreaks();
758 fBI
->setText(fTestData
->fString
);
759 int32_t nextBreak
= fTestData
->fString
.length()+1;
760 for (int32_t i
=fTestData
->fString
.length()+1 ; i
>=0; --i
) {
761 int32_t bk
= fBI
->preceding(i
);
762 // printf("i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak);
763 if (bk
== BreakIterator::DONE
&& i
== 0) {
766 if (bk
== nextBreak
&& bk
< i
) {
767 // i is in the gap between two breaks.
770 if (i
<fTestData
->fString
.length() && fTestData
->fString
.getChar32Start(i
) < i
) {
771 // i indexes to a trailing surrogate.
772 // Break Iterators treat an index to either half as referring to the supplemental code point,
773 // with preceding going to some preceding code point.
774 if (fBI
->preceding(i
) != fBI
->preceding(fTestData
->fString
.getChar32Start(i
))) {
775 MONKEY_ERROR("preceding of trailing surrogate error", i
);
// i has reached the previously reported boundary and a new, earlier one was returned.
779 if (i
== nextBreak
&& bk
< nextBreak
) {
780 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
784 MONKEY_ERROR("preceding(i)", i
);
787 checkResults("testPreceding", REVERSE
, status
);
// Calls isBoundary(i) for every index from the string length down to 0, records
// each index reported as a boundary in fActualBreaks, and compares against the
// expected breaks. Failures are reported under this test's own name so they can
// be told apart from testForwards failures.
791 void RBBIMonkeyImpl::testIsBoundary(UErrorCode
&status
) {
792 if (U_FAILURE(status
)) {
795 fTestData
->clearActualBreaks();
796 fBI
->setText(fTestData
->fString
);
797 for (int i
=fTestData
->fString
.length(); i
>=0; --i
) {
798 if (fBI
->isBoundary(i
)) {
799 fTestData
->fActualBreaks
.setCharAt(i
, 1);
802 checkResults("testIsBoundary", FORWARD
, status
);
// Compares fExpectedBreaks with fActualBreaks position by position and reports
// the first mismatch found.
//   msg        name of the calling test, included in the error message.
//   direction  FORWARD scans from index 0 upwards; otherwise the scan runs from
//              the end of the string down to 0.
//   status     ICU error code; set to U_INVALID_STATE_ERROR on a mismatch.
// The error message includes neighbouring code points, the expected/actual flags,
// and the rule file name and seed needed to reproduce the failure.
805 void RBBIMonkeyImpl::checkResults(const char *msg
, CheckDirection direction
, UErrorCode
&status
) {
806 if (U_FAILURE(status
)) {
809 if (direction
== FORWARD
) {
810 for (int i
=0; i
<=fTestData
->fString
.length(); ++i
) {
811 if (fTestData
->fExpectedBreaks
.charAt(i
) != fTestData
->fActualBreaks
.charAt(i
)) {
// Two message variants: the first shows two preceding code points, the second only one.
813 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
814 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-2), fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
815 fRuleFileName
, fTestData
->fRandomSeed
);
817 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
818 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
819 fRuleFileName
, fTestData
->fRandomSeed
);
824 status
= U_INVALID_STATE_ERROR
; // Prevent the test from continuing, which would likely
825 break; // produce many redundant errors.
// Reverse direction: same comparison, scanning from the end of the string.
829 for (int i
=fTestData
->fString
.length(); i
>=0; i
--) {
830 if (fTestData
->fExpectedBreaks
.charAt(i
) != fTestData
->fActualBreaks
.charAt(i
)) {
832 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
833 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-2), fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
834 fRuleFileName
, fTestData
->fRandomSeed
);
836 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
837 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
838 fRuleFileName
, fTestData
->fRandomSeed
);
843 status
= U_INVALID_STATE_ERROR
;
852 //---------------------------------------------------------------------------------------
854 // class RBBIMonkeyTest implementation.
856 //---------------------------------------------------------------------------------------
// Constructor. No statements of the body are visible in this listing.
857 RBBIMonkeyTest::RBBIMonkeyTest() {
// Destructor. No statements of the body are visible in this listing.
860 RBBIMonkeyTest::~RBBIMonkeyTest() {
864 // params, taken from this->fParams.
865 // rules=file_name Name of file containing the reference rules.
866 // seed=nnnnn Random number starting seed.
867 // Setting the seed allows errors to be reproduced.
868 // loop=nnn Looping count. Controls running time.
870 // 0 or greater: run length.
871 // expansions debug option, show expansions of rules and sets.
872 // verbose Display details of the failure.
874 // Parameters on the intltest command line follow the test name, and are preceded by '@'.
876 // intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
// Top-level monkey test. Parses the options in fParams (rules, loop, expansions,
// verbose, seed), then creates and sets up one RBBIMonkeyImpl per rule file and
// keeps them in startedTests. Any text left in the parameter string after option
// parsing is reported as unrecognized.
878 void RBBIMonkeyTest::testMonkey() {
879 // printf("Test parameters: %s\n", fParams);
880 UnicodeString
params(fParams
);
881 UErrorCode status
= U_ZERO_ERROR
;
// Default list of rule files to test.
883 const char *tests
[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
884 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
886 CharString testNameFromParams
;
// A "rules=" option replaces the first entry of the list.
887 if (getStringParam("rules", params
, testNameFromParams
, status
)) {
888 tests
[0] = testNameFromParams
.data();
// Default loop count depends on `quick`; a "loop=" option overrides it.
892 int64_t loopCount
= quick
? 100 : 5000;
893 getIntParam("loop", params
, loopCount
, status
);
895 UBool dumpExpansions
= FALSE
;
896 getBoolParam("expansions", params
, dumpExpansions
, status
);
898 UBool verbose
= FALSE
;
899 getBoolParam("verbose", params
, verbose
, status
);
902 getIntParam("seed", params
, seed
, status
);
904 if (params
.length() != 0) {
905 // Options processing did not consume all of the parameters. Something unrecognized was present.
906 CharString unrecognizedParameters
;
907 unrecognizedParameters
.append(CStr(params
)(), -1, status
);
908 errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__
, __LINE__
, unrecognizedParameters
.data());
912 UVector
startedTests(status
);
913 if (U_FAILURE(status
)) {
914 errln("%s:%d: error %s while setting up test.", __FILE__
, __LINE__
, u_errorName(status
));
918 // Monkey testing is multi-threaded.
919 // Each set of break rules to be tested is run in a separate thread.
920 // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
// The tests array is walked until a NULL entry is reached.
922 for (i
=0; tests
[i
] != NULL
; ++i
) {
923 logln("beginning testing of %s", tests
[i
]);
924 RBBIMonkeyImpl
*test
= new RBBIMonkeyImpl(status
);
925 test
->fDumpExpansions
= dumpExpansions
;
926 test
->fVerbose
= verbose
;
927 test
->fRandomGenerator
.seed((uint32_t)seed
);
928 test
->fLoopCount
= loopCount
;
929 test
->setup(tests
[i
], status
);
931 startedTests
.addElement(test
, status
);
932 if (U_FAILURE(status
)) {
937 if (U_FAILURE(status
)) {
938 dataerrln("%s:%d: error %s while starting test %s.", __FILE__
, __LINE__
, u_errorName(status
), tests
[i
]);
// Second pass over every test object that was added to startedTests.
941 for (i
=0; i
<startedTests
.size(); ++i
) {
942 RBBIMonkeyImpl
*test
= static_cast<RBBIMonkeyImpl
*>(startedTests
.elementAt(i
));
// Extracts an integer option of the form "name = <digits>" from the parameter string.
//   name    option name; taken by value because the match pattern is appended to it.
//   params  in/out parameter string; the matched option text (including an optional
//           trailing comma) is removed from it.
//   val     receives the parsed value (base 10, optional leading '-').
//   status  ICU error code for the regex and string operations.
949 UBool
RBBIMonkeyTest::getIntParam(UnicodeString name
, UnicodeString
&params
, int64_t &val
, UErrorCode
&status
) {
950 name
.append(" *= *(-?\\d+) *,? *");
951 RegexMatcher
m(name
, params
, 0, status
);
953 // The param exists. Convert the string to an int.
955 str
.append(CStr(m
.group(1, status
))(), -1, status
);
956 val
= strtol(str
.data(), NULL
, 10);
958 // Delete this parameter from the params string.
960 params
= m
.replaceFirst(UnicodeString(), status
);
// Extracts a string option of the form "name = value" from the parameter string.
// The value is any run of characters other than space and comma.
//   name    option name; taken by value because the match pattern is appended to it.
//   params  in/out parameter string; the matched option text (including an optional
//           trailing comma) is removed from it.
//   dest    the value is appended to this CharString.
//   status  ICU error code for the regex and string operations.
966 UBool
RBBIMonkeyTest::getStringParam(UnicodeString name
, UnicodeString
&params
, CharString
&dest
, UErrorCode
&status
) {
967 name
.append(" *= *([^ ,]*) *,? *");
968 RegexMatcher
m(name
, params
, 0, status
);
971 dest
.append(CStr(m
.group(1, status
))(), -1, status
);
973 // Delete this parameter from the params string.
975 params
= m
.replaceFirst(UnicodeString(), status
);
// Extracts a boolean option from the parameter string. The option may appear as a
// bare name or as "name = true" / "name = false" (matched case-insensitively).
//   name    option name; taken by value because the match pattern is appended to it.
//   params  in/out parameter string; the matched option text (including an optional
//           trailing comma) is removed from it.
//   dest    receives the option value.
//   status  ICU error code for the regex and string operations.
981 UBool
RBBIMonkeyTest::getBoolParam(UnicodeString name
, UnicodeString
&params
, UBool
&dest
, UErrorCode
&status
) {
982 name
.append("(?: *= *(true|false))? *,? *");
983 RegexMatcher
m(name
, params
, UREGEX_CASE_INSENSITIVE
, status
);
// Group 1 can only start after the option name, so a positive start means a value was given.
985 if (m
.start(1, status
) > 0) {
986 // user option included a value.
987 dest
= m
.group(1, status
).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT
) == 0;
989 // No explicit user value, implies true.
993 // Delete this parameter from the params string.
995 params
= m
.replaceFirst(UnicodeString(), status
);
1001 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */