1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * Copyright (c) 2016, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 ********************************************************************/
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
13 #include "rbbimonkeytest.h"
14 #include "unicode/utypes.h"
15 #include "unicode/brkiter.h"
16 #include "unicode/utf16.h"
17 #include "unicode/uniset.h"
18 #include "unicode/unistr.h"
35 void RBBIMonkeyTest::runIndexedTest(int32_t index
, UBool exec
, const char* &name
, char* params
) {
36 fParams
= params
; // Work around TESTCASE_AUTO not being able to pass params to test function.
39 TESTCASE_AUTO(testMonkey
);
43 //---------------------------------------------------------------------------------------
45 // class BreakRule implementation.
47 //---------------------------------------------------------------------------------------
49 BreakRule::BreakRule() // : all field default initialized.
53 BreakRule::~BreakRule() {}
56 //---------------------------------------------------------------------------------------
58 // class BreakRules implementation.
60 //---------------------------------------------------------------------------------------
61 BreakRules::BreakRules(RBBIMonkeyImpl
*monkeyImpl
, UErrorCode
&status
) :
62 fMonkeyImpl(monkeyImpl
), fBreakRules(status
), fType(UBRK_COUNT
) {
63 fCharClasses
.adoptInstead(uhash_open(uhash_hashUnicodeString
,
64 uhash_compareUnicodeString
,
65 NULL
, // value comparator.
67 if (U_FAILURE(status
)) {
70 uhash_setKeyDeleter(fCharClasses
.getAlias(), uprv_deleteUObject
);
71 uhash_setValueDeleter(fCharClasses
.getAlias(), uprv_deleteUObject
);
72 fBreakRules
.setDeleter(uprv_deleteUObject
);
74 fCharClassList
.adoptInstead(new UVector(status
));
76 fSetRefsMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
77 "(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative look behind for '{' or '=' or '[:'
78 // (the identifier is a unicode property name or value)
79 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name
82 // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
83 fCommentsMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
84 "(^|(?<=;))" // Start either at start of line, or just after a ';' (look-behind for ';')
85 "[ \\t]*+" // Match white space.
86 "(#.*)?+" // Optional # plus whatever follows
87 "\\R$" // new-line at end of line.
90 // Match (initial parse) of a character class definition line.
91 fClassDefMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
92 "[ \\t]*" // leading white space
93 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name
94 "[ \\t]*=[ \\t]*" // =
95 "(?<ClassDef>.*?)" // The char class UnicodeSet expression
96 "[ \\t]*;$"), // ; <end of line>
99 // Match (initial parse) of a break rule line.
100 fRuleDefMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
101 "[ \\t]*" // leading white space
102 "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)" // The rule name
103 "[ \\t]*:[ \\t]*" // :
104 "(?<RuleDef>.*?)" // The rule definition
105 "[ \\t]*;$"), // ; <end of line>
111 BreakRules::~BreakRules() {}
114 CharClass
*BreakRules::addCharClass(const UnicodeString
&name
, const UnicodeString
&definition
, UErrorCode
&status
) {
116 // Create the expanded definition for this char class,
117 // replacing any set references with the corresponding definition.
119 UnicodeString expandedDef
;
120 UnicodeString emptyString
;
121 fSetRefsMatcher
->reset(definition
);
122 while (fSetRefsMatcher
->find() && U_SUCCESS(status
)) {
123 const UnicodeString name
=
124 fSetRefsMatcher
->group(fSetRefsMatcher
->pattern().groupNumberFromName("ClassName", status
), status
);
125 CharClass
*nameClass
= static_cast<CharClass
*>(uhash_get(fCharClasses
.getAlias(), &name
));
126 const UnicodeString
&expansionForName
= nameClass
? nameClass
->fExpandedDef
: name
;
128 fSetRefsMatcher
->appendReplacement(expandedDef
, emptyString
, status
);
129 expandedDef
.append(expansionForName
);
131 fSetRefsMatcher
->appendTail(expandedDef
);
133 // Verify that the expanded set definition is valid.
135 if (fMonkeyImpl
->fDumpExpansions
) {
136 printf("epandedDef: %s\n", CStr(expandedDef
)());
139 UnicodeSet
*s
= new UnicodeSet(expandedDef
, USET_IGNORE_SPACE
, NULL
, status
);
140 if (U_FAILURE(status
)) {
141 IntlTest::gTest
->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__
, __LINE__
,
142 u_errorName(status
), CStr(name
)());
145 CharClass
*cclass
= new CharClass(name
, definition
, expandedDef
, s
);
146 CharClass
*previousClass
= static_cast<CharClass
*>(uhash_put(fCharClasses
.getAlias(),
147 new UnicodeString(name
), // Key, owned by hash table.
148 cclass
, // Value, owned by hash table.
151 if (previousClass
!= NULL
) {
152 // Duplicate class def.
153 // These are legitimate, they are adjustments of an existing class.
154 // TODO: will need to keep the old around when we handle tailorings.
155 IntlTest::gTest
->logln("Redefinition of character class %s\n", CStr(cclass
->fName
)());
156 delete previousClass
;
162 void BreakRules::addRule(const UnicodeString
&name
, const UnicodeString
&definition
, UErrorCode
&status
) {
163 LocalPointer
<BreakRule
> thisRule(new BreakRule
);
164 thisRule
->fName
= name
;
165 thisRule
->fRule
= definition
;
167 // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes,
168 // This gives a numeric sort order that matches Unicode UAX rule numbering conventions.
169 UnicodeString emptyString
;
171 // Expand the char class definitions within the rule.
172 fSetRefsMatcher
->reset(definition
);
173 while (fSetRefsMatcher
->find() && U_SUCCESS(status
)) {
174 const UnicodeString name
=
175 fSetRefsMatcher
->group(fSetRefsMatcher
->pattern().groupNumberFromName("ClassName", status
), status
);
176 CharClass
*nameClass
= static_cast<CharClass
*>(uhash_get(fCharClasses
.getAlias(), &name
));
178 IntlTest::gTest
->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
179 __FILE__
, __LINE__
, CStr(name
)(), CStr(definition
)());
181 const UnicodeString
&expansionForName
= nameClass
? nameClass
->fExpandedDef
: name
;
183 fSetRefsMatcher
->appendReplacement(thisRule
->fExpandedRule
, emptyString
, status
);
184 thisRule
->fExpandedRule
.append(expansionForName
);
186 fSetRefsMatcher
->appendTail(thisRule
->fExpandedRule
);
188 // Replace the divide sign (\u00f7) with a regular expression named capture.
189 // When running the rules, a match that includes this group means we found a break position.
191 int32_t dividePos
= thisRule
->fExpandedRule
.indexOf((UChar
)0x00f7);
192 if (dividePos
>= 0) {
193 thisRule
->fExpandedRule
.replace(dividePos
, 1, UnicodeString("(?<BreakPosition>)"));
195 if (thisRule
->fExpandedRule
.indexOf((UChar
)0x00f7) != -1) {
196 status
= U_ILLEGAL_ARGUMENT_ERROR
; // TODO: produce a good error message.
199 // UAX break rule set definitions can be empty, just [].
200 // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
201 // also matches nothing.
203 static const UChar emptySet
[] = {(UChar
)0x5b, (UChar
)0x5d, 0};
205 while ((where
= thisRule
->fExpandedRule
.indexOf(emptySet
, 2, 0)) >= 0) {
206 thisRule
->fExpandedRule
.replace(where
, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
208 if (fMonkeyImpl
->fDumpExpansions
) {
209 printf("fExpandedRule: %s\n", CStr(thisRule
->fExpandedRule
)());
212 // Compile a regular expression for this rule.
213 thisRule
->fRuleMatcher
.adoptInstead(new RegexMatcher(thisRule
->fExpandedRule
, UREGEX_COMMENTS
| UREGEX_DOTALL
, status
));
214 if (U_FAILURE(status
)) {
215 IntlTest::gTest
->errln("%s:%d Error creating regular expression for %s",
216 __FILE__
, __LINE__
, CStr(thisRule
->fExpandedRule
)());
220 // Put this new rule into the vector of all Rules.
221 fBreakRules
.addElement(thisRule
.orphan(), status
);
225 bool BreakRules::setKeywordParameter(const UnicodeString
&keyword
, const UnicodeString
&value
, UErrorCode
&status
) {
226 if (keyword
== UnicodeString("locale")) {
227 CharString localeName
;
228 localeName
.append(CStr(value
)(), -1, status
);
229 fLocale
= Locale::createFromName(localeName
.data());
232 if (keyword
== UnicodeString("type")) {
233 if (value
== UnicodeString("grapheme")) {
234 fType
= UBRK_CHARACTER
;
235 } else if (value
== UnicodeString("word")) {
237 } else if (value
== UnicodeString("line")) {
239 } else if (value
== UnicodeString("sentence")) {
240 fType
= UBRK_SENTENCE
;
242 IntlTest::gTest
->errln("%s:%d Unrecognized break type %s", __FILE__
, __LINE__
, CStr(value
)());
246 // TODO: add tailoring base setting here.
250 RuleBasedBreakIterator
*BreakRules::createICUBreakIterator(UErrorCode
&status
) {
251 if (U_FAILURE(status
)) {
254 RuleBasedBreakIterator
*bi
= NULL
;
257 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createCharacterInstance(fLocale
, status
));
260 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createWordInstance(fLocale
, status
));
263 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createLineInstance(fLocale
, status
));
266 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createSentenceInstance(fLocale
, status
));
269 IntlTest::gTest
->errln("%s:%d Bad break iterator type of %d", __FILE__
, __LINE__
, fType
);
270 status
= U_ILLEGAL_ARGUMENT_ERROR
;
276 void BreakRules::compileRules(UCHARBUF
*rules
, UErrorCode
&status
) {
277 if (U_FAILURE(status
)) {
281 UnicodeString emptyString
;
282 for (int32_t lineNumber
=0; ;lineNumber
++) { // Loop once per input line.
283 if (U_FAILURE(status
)) {
286 int32_t lineLength
= 0;
287 const UChar
*lineBuf
= ucbuf_readline(rules
, &lineLength
, &status
);
288 if (lineBuf
== NULL
) {
291 UnicodeString
line(lineBuf
, lineLength
);
293 // Strip comment lines.
294 fCommentsMatcher
->reset(line
);
295 line
= fCommentsMatcher
->replaceFirst(emptyString
, status
);
296 if (line
.isEmpty()) {
300 // Recognize character class definition and keyword lines
301 fClassDefMatcher
->reset(line
);
302 if (fClassDefMatcher
->matches(status
)) {
303 UnicodeString className
= fClassDefMatcher
->group(fClassDefMatcher
->pattern().groupNumberFromName("ClassName", status
), status
);
304 UnicodeString classDef
= fClassDefMatcher
->group(fClassDefMatcher
->pattern().groupNumberFromName("ClassDef", status
), status
);
305 if (fMonkeyImpl
->fDumpExpansions
) {
306 printf("scanned class: %s = %s\n", CStr(className
)(), CStr(classDef
)());
308 if (setKeywordParameter(className
, classDef
, status
)) {
309 // The scanned item was "type = ..." or "locale = ...", etc.
310 // which are not actual character classes.
313 addCharClass(className
, classDef
, status
);
317 // Recognize rule lines.
318 fRuleDefMatcher
->reset(line
);
319 if (fRuleDefMatcher
->matches(status
)) {
320 UnicodeString ruleName
= fRuleDefMatcher
->group(fRuleDefMatcher
->pattern().groupNumberFromName("RuleName", status
), status
);
321 UnicodeString ruleDef
= fRuleDefMatcher
->group(fRuleDefMatcher
->pattern().groupNumberFromName("RuleDef", status
), status
);
322 if (fMonkeyImpl
->fDumpExpansions
) {
323 printf("scanned rule: %s : %s\n", CStr(ruleName
)(), CStr(ruleDef
)());
325 addRule(ruleName
, ruleDef
, status
);
329 IntlTest::gTest
->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
330 __FILE__
, __LINE__
, fMonkeyImpl
->fRuleFileName
, CStr(line
)());
333 // Build the vector of char classes, omitting the dictionary class if there is one.
334 // This will be used when constructing the random text to be tested.
336 // Also compute the "other" set, consisting of any characters not included in
337 // one or more of the user defined sets.
339 UnicodeSet
otherSet((UChar32
)0, 0x10ffff);
340 int32_t pos
= UHASH_FIRST
;
341 const UHashElement
*el
= NULL
;
342 while ((el
= uhash_nextElement(fCharClasses
.getAlias(), &pos
)) != NULL
) {
343 const UnicodeString
*ccName
= static_cast<const UnicodeString
*>(el
->key
.pointer
);
344 CharClass
*cclass
= static_cast<CharClass
*>(el
->value
.pointer
);
345 // printf(" Adding %s\n", CStr(*ccName)());
346 if (*ccName
!= cclass
->fName
) {
347 IntlTest::gTest
->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
348 __FILE__
, __LINE__
, CStr(*ccName
)(), CStr(cclass
->fName
)());
350 const UnicodeSet
*set
= cclass
->fSet
.getAlias();
351 otherSet
.removeAll(*set
);
352 if (*ccName
== UnicodeString("dictionary")) {
353 fDictionarySet
= *set
;
355 fCharClassList
->addElement(cclass
, status
);
359 if (!otherSet
.isEmpty()) {
360 // fprintf(stderr, "have an other set.\n");
361 UnicodeString pattern
;
362 CharClass
*cclass
= addCharClass(UnicodeString("__Others"), otherSet
.toPattern(pattern
), status
);
363 fCharClassList
->addElement(cclass
, status
);
368 const CharClass
*BreakRules::getClassForChar(UChar32 c
, int32_t *iter
) const {
369 int32_t localIter
= 0;
370 int32_t &it
= iter
? *iter
: localIter
;
372 while (it
< fCharClassList
->size()) {
373 const CharClass
*cc
= static_cast<const CharClass
*>(fCharClassList
->elementAt(it
));
375 if (cc
->fSet
->contains(c
)) {
382 //---------------------------------------------------------------------------------------
384 // class MonkeyTestData implementation.
386 //---------------------------------------------------------------------------------------
388 void MonkeyTestData::set(BreakRules
*rules
, IntlTest::icu_rand
&rand
, UErrorCode
&status
) {
389 const int32_t dataLength
= 1000;
391 // Fill the test string with random characters.
392 // First randomly pick a char class, then randomly pick a character from that class.
393 // Exclude any characters from the dictionary set.
395 // std::cout << "Populating Test Data" << std::endl;
396 fRandomSeed
= rand
.getSeed(); // Save initial seed for use in error messages,
397 // allowing recreation of failing data.
400 for (int32_t n
=0; n
<dataLength
;) {
401 int charClassIndex
= rand() % rules
->fCharClassList
->size();
402 const CharClass
*cclass
= static_cast<CharClass
*>(rules
->fCharClassList
->elementAt(charClassIndex
));
403 if (cclass
->fSet
->size() == 0) {
404 // Some rules or tailorings do end up with empty char classes.
407 int32_t charIndex
= rand() % cclass
->fSet
->size();
408 UChar32 c
= cclass
->fSet
->charAt(charIndex
);
409 if (U16_IS_TRAIL(c
) && fString
.length() > 0 && U16_IS_LEAD(fString
.charAt(fString
.length()-1))) {
410 // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
411 // Don't let random unpaired surrogates combine in the test data because they might
412 // produce an unwanted dictionary character.
416 if (!rules
->fDictionarySet
.contains(c
)) {
422 // Reset each rule matcher regex with this new string.
423 // (Although we are always using the same string object, ICU regular expressions
424 // don't like the underlying string data changing without doing a reset).
426 for (int32_t ruleNum
=0; ruleNum
<rules
->fBreakRules
.size(); ruleNum
++) {
427 BreakRule
*rule
= static_cast<BreakRule
*>(rules
->fBreakRules
.elementAt(ruleNum
));
428 rule
->fRuleMatcher
->reset(fString
);
431 // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
432 // Expected and Actual breaks are one longer than the input string; a non-zero value
433 // will indicate a boundary preceding that position.
436 fExpectedBreaks
= fActualBreaks
;
437 fRuleForPosition
= fActualBreaks
;
438 f2ndRuleForPos
= fActualBreaks
;
440 // Apply reference rules to find the expected breaks.
442 fExpectedBreaks
.setCharAt(0, (UChar
)1); // Force an expected break before the start of the text.
443 // ICU always reports a break there.
444 // The reference rules do not have a means to do so.
446 while (strIdx
< fString
.length()) {
447 BreakRule
*matchingRule
= NULL
;
448 UBool hasBreak
= FALSE
;
450 int32_t matchStart
= 0;
451 int32_t matchEnd
= 0;
452 int32_t breakGroup
= 0;
453 for (ruleNum
=0; ruleNum
<rules
->fBreakRules
.size(); ruleNum
++) {
454 BreakRule
*rule
= static_cast<BreakRule
*>(rules
->fBreakRules
.elementAt(ruleNum
));
455 rule
->fRuleMatcher
->reset();
456 if (rule
->fRuleMatcher
->lookingAt(strIdx
, status
)) {
457 // A candidate rule match, check further to see if we take it or continue to check other rules.
458 // Matches of zero or one codepoint count only if they also specify a break.
459 matchStart
= rule
->fRuleMatcher
->start(status
);
460 matchEnd
= rule
->fRuleMatcher
->end(status
);
461 breakGroup
= rule
->fRuleMatcher
->pattern().groupNumberFromName("BreakPosition", status
);
462 hasBreak
= U_SUCCESS(status
);
463 if (status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
) {
464 status
= U_ZERO_ERROR
;
466 if (hasBreak
|| fString
.moveIndex32(matchStart
, 1) < matchEnd
) {
472 if (matchingRule
== NULL
) {
473 // No reference rule matched. This is an error in the rules that should never happen.
474 IntlTest::gTest
->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
475 __FILE__
, __LINE__
, strIdx
);
477 status
= U_INVALID_FORMAT_ERROR
;
480 if (matchingRule
->fRuleMatcher
->group(status
).length() == 0) {
481 // Zero length rule match. This is also an error in the rule expressions.
482 IntlTest::gTest
->errln("%s:%d Zero length rule match.",
484 status
= U_INVALID_FORMAT_ERROR
;
488 // Record which rule matched over the length of the match.
489 for (int i
= matchStart
; i
< matchEnd
; i
++) {
490 if (fRuleForPosition
.charAt(i
) == 0) {
491 fRuleForPosition
.setCharAt(i
, (UChar
)ruleNum
);
493 f2ndRuleForPos
.setCharAt(i
, (UChar
)ruleNum
);
497 // Break positions appear in rules as a matching named capture of zero length at the break position,
498 // the adjusted pattern contains (?<BreakPosition>)
500 int32_t breakPos
= matchingRule
->fRuleMatcher
->start(breakGroup
, status
);
501 if (U_FAILURE(status
) || breakPos
< 0) {
502 // Rule specified a break, but that break wasn't part of the match, even
503 // though the rule as a whole matched.
504 // Can't happen with regular expressions derived from (equivalent to) ICU break rules.
505 // Shouldn't get here.
506 IntlTest::gTest
->errln("%s:%d Internal Rule Error.", __FILE__
, __LINE__
);
507 status
= U_INVALID_FORMAT_ERROR
;
510 fExpectedBreaks
.setCharAt(breakPos
, (UChar
)1);
511 // printf("recording break at %d\n", breakPos);
512 // For the next iteration, pick up applying rules immediately after the break,
513 // which may differ from end of the match. The matching rule may have included
514 // context following the boundary that needs to be looked at again.
515 strIdx
= matchingRule
->fRuleMatcher
->end(breakGroup
, status
);
517 // Original rule didn't specify a break.
518 // Continue applying rules starting on the last code point of this match.
519 strIdx
= fString
.moveIndex32(matchEnd
, -1);
520 if (strIdx
== matchStart
) {
521 // Match was only one code point, no progress if we continue.
522 // Shouldn't get here, case is filtered out at top of loop.
524 ruleName
.appendInvariantChars(matchingRule
->fName
, status
);
525 IntlTest::gTest
->errln("%s:%d Rule %s internal error",
526 __FILE__
, __LINE__
, ruleName
.data());
527 status
= U_INVALID_FORMAT_ERROR
;
531 if (U_FAILURE(status
)) {
532 IntlTest::gTest
->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
533 __FILE__
, __LINE__
, u_errorName(status
));
539 void MonkeyTestData::clearActualBreaks() {
540 fActualBreaks
.remove();
541 // Actual Breaks length is one longer than the data string length, allowing
542 // for breaks before the first and after the last character in the data.
543 for (int32_t i
=0; i
<=fString
.length(); i
++) {
544 fActualBreaks
.append((UChar
)0);
548 void MonkeyTestData::dump(int32_t around
) const {
550 " char break Rule Character\n"
551 " pos code class R I name name\n"
552 "---------------------------------------------------------------------------------------------\n");
559 end
= fString
.length();
561 // Display context around a failure.
562 start
= fString
.moveIndex32(around
, -30);
563 end
= fString
.moveIndex32(around
, +30);
566 for (int charIdx
= start
; charIdx
< end
; charIdx
=fString
.moveIndex32(charIdx
, 1)) {
567 UErrorCode status
= U_ZERO_ERROR
;
568 UChar32 c
= fString
.char32At(charIdx
);
569 const CharClass
*cc
= fBkRules
->getClassForChar(c
);
571 ccName
.appendInvariantChars(cc
->fName
, status
);
572 CharString ruleName
, secondRuleName
;
573 const BreakRule
*rule
= static_cast<BreakRule
*>(fBkRules
->fBreakRules
.elementAt(fRuleForPosition
.charAt(charIdx
)));
574 ruleName
.appendInvariantChars(rule
->fName
, status
);
575 if (f2ndRuleForPos
.charAt(charIdx
) > 0) {
576 const BreakRule
*secondRule
= static_cast<BreakRule
*>(fBkRules
->fBreakRules
.elementAt(f2ndRuleForPos
.charAt(charIdx
)));
577 secondRuleName
.appendInvariantChars(secondRule
->fName
, status
);
580 u_charName(c
, U_EXTENDED_CHAR_NAME
, cName
, sizeof(cName
), &status
);
582 printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n",
583 charIdx
, c
, ccName
.data(),
584 fExpectedBreaks
.charAt(charIdx
) ? '*' : '.',
585 fActualBreaks
.charAt(charIdx
) ? '*' : '.',
586 ruleName
.data(), secondRuleName
.data(), cName
592 //---------------------------------------------------------------------------------------
594 // class RBBIMonkeyImpl
596 //---------------------------------------------------------------------------------------
598 RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode
&status
) : fDumpExpansions(FALSE
), fThread(this) {
599 (void)status
; // suppress unused parameter compiler warning.
603 // RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the
604 // reference rules and creating the icu breakiterator to test,
605 // with its type and locale coming from the reference rules.
607 void RBBIMonkeyImpl::setup(const char *ruleFile
, UErrorCode
&status
) {
608 fRuleFileName
= ruleFile
;
609 openBreakRules(ruleFile
, status
);
610 if (U_FAILURE(status
)) {
611 IntlTest::gTest
->errln("%s:%d Error %s opening file %s.", __FILE__
, __LINE__
, u_errorName(status
), ruleFile
);
614 fRuleSet
.adoptInstead(new BreakRules(this, status
));
615 fRuleSet
->compileRules(fRuleCharBuffer
.getAlias(), status
);
616 if (U_FAILURE(status
)) {
617 IntlTest::gTest
->errln("%s:%d Error %s processing file %s.", __FILE__
, __LINE__
, u_errorName(status
), ruleFile
);
620 fBI
.adoptInstead(fRuleSet
->createICUBreakIterator(status
));
621 fTestData
.adoptInstead(new MonkeyTestData());
625 RBBIMonkeyImpl::~RBBIMonkeyImpl() {
629 void RBBIMonkeyImpl::openBreakRules(const char *fileName
, UErrorCode
&status
) {
631 path
.append(IntlTest::getSourceTestData(status
), status
);
632 path
.append("break_rules" U_FILE_SEP_STRING
, status
);
633 path
.appendPathPart(fileName
, status
);
634 const char *codePage
= "UTF-8";
635 fRuleCharBuffer
.adoptInstead(ucbuf_open(path
.data(), &codePage
, TRUE
, FALSE
, &status
));
639 void RBBIMonkeyImpl::startTest() {
640 fThread
.start(); // invokes runTest() in a separate thread.
643 void RBBIMonkeyImpl::join() {
648 #define MONKEY_ERROR(msg, index) { \
649 IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
650 __FILE__, __LINE__, msg, index, fRuleFileName, fTestData->fRandomSeed); \
651 if (fVerbose) { fTestData->dump(index); } \
652 status = U_INVALID_STATE_ERROR; \
655 void RBBIMonkeyImpl::runTest() {
656 UErrorCode status
= U_ZERO_ERROR
;
657 int32_t errorCount
= 0;
658 for (int64_t loopCount
= 0; fLoopCount
< 0 || loopCount
< fLoopCount
; loopCount
++) {
659 status
= U_ZERO_ERROR
;
660 fTestData
->set(fRuleSet
.getAlias(), fRandomGenerator
, status
);
662 IntlTest::gTest
->dataerrln("Unable to run test because fBI is null.");
665 if ( uprv_strcmp(fRuleFileName
,"line_loose_cj.txt") == 0 && fTestData
->fRandomSeed
==1712915859 ) {
666 continue; // known bug around index 103-104, break expected/actual 0/1, fwd 0020 200D | FDFC, rev 1325A 0020 | 200D
668 // fTestData->dump();
669 testForwards(status
);
670 testPrevious(status
);
671 testFollowing(status
);
672 testPreceding(status
);
673 testIsBoundary(status
);
674 testIsBoundaryRandom(status
);
676 if (fLoopCount
< 0 && loopCount
% 100 == 0) {
677 fprintf(stderr
, ".");
679 if (U_FAILURE(status
)) {
680 if (++errorCount
> 10) {
687 void RBBIMonkeyImpl::testForwards(UErrorCode
&status
) {
688 if (U_FAILURE(status
)) {
691 fTestData
->clearActualBreaks();
692 fBI
->setText(fTestData
->fString
);
693 int32_t previousBreak
= -2;
694 for (int32_t bk
=fBI
->first(); bk
!= BreakIterator::DONE
; bk
=fBI
->next()) {
695 if (bk
<= previousBreak
) {
696 MONKEY_ERROR("Break Iterator Stall", bk
);
699 if (bk
< 0 || bk
> fTestData
->fString
.length()) {
700 MONKEY_ERROR("Boundary out of bounds", bk
);
703 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
705 checkResults("testForwards", FORWARD
, status
);
708 void RBBIMonkeyImpl::testFollowing(UErrorCode
&status
) {
709 if (U_FAILURE(status
)) {
712 fTestData
->clearActualBreaks();
713 fBI
->setText(fTestData
->fString
);
714 int32_t nextBreak
= -1;
715 for (int32_t i
=-1 ; i
<fTestData
->fString
.length(); ++i
) {
716 int32_t bk
= fBI
->following(i
);
717 if (bk
== BreakIterator::DONE
&& i
== fTestData
->fString
.length()) {
720 if (bk
== nextBreak
&& bk
> i
) {
721 // i is in the gap between two breaks.
724 if (i
== nextBreak
&& bk
> nextBreak
) {
725 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
729 MONKEY_ERROR("following(i)", i
);
732 checkResults("testFollowing", FORWARD
, status
);
737 void RBBIMonkeyImpl::testPrevious(UErrorCode
&status
) {
738 if (U_FAILURE(status
)) {return;}
740 fTestData
->clearActualBreaks();
741 fBI
->setText(fTestData
->fString
);
742 int32_t previousBreak
= INT32_MAX
;
743 for (int32_t bk
=fBI
->last(); bk
!= BreakIterator::DONE
; bk
=fBI
->previous()) {
744 if (bk
>= previousBreak
) {
745 MONKEY_ERROR("Break Iterator Stall", bk
);
748 if (bk
< 0 || bk
> fTestData
->fString
.length()) {
749 MONKEY_ERROR("Boundary out of bounds", bk
);
752 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
754 checkResults("testPrevious", REVERSE
, status
);
758 void RBBIMonkeyImpl::testPreceding(UErrorCode
&status
) {
759 if (U_FAILURE(status
)) {
762 fTestData
->clearActualBreaks();
763 fBI
->setText(fTestData
->fString
);
764 int32_t nextBreak
= fTestData
->fString
.length()+1;
765 for (int32_t i
=fTestData
->fString
.length()+1 ; i
>=0; --i
) {
766 int32_t bk
= fBI
->preceding(i
);
767 // printf("i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak);
768 if (bk
== BreakIterator::DONE
&& i
== 0) {
771 if (bk
== nextBreak
&& bk
< i
) {
772 // i is in the gap between two breaks.
775 if (i
<fTestData
->fString
.length() && fTestData
->fString
.getChar32Start(i
) < i
) {
776 // i indexes to a trailing surrogate.
777 // Break Iterators treat an index to either half as referring to the supplemental code point,
778 // with preceding going to some preceding code point.
779 if (fBI
->preceding(i
) != fBI
->preceding(fTestData
->fString
.getChar32Start(i
))) {
780 MONKEY_ERROR("preceding of trailing surrogate error", i
);
784 if (i
== nextBreak
&& bk
< nextBreak
) {
785 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
789 MONKEY_ERROR("preceding(i)", i
);
792 checkResults("testPreceding", REVERSE
, status
);
796 void RBBIMonkeyImpl::testIsBoundary(UErrorCode
&status
) {
797 if (U_FAILURE(status
)) {
800 fTestData
->clearActualBreaks();
801 fBI
->setText(fTestData
->fString
);
802 for (int i
=fTestData
->fString
.length(); i
>=0; --i
) {
803 if (fBI
->isBoundary(i
)) {
804 fTestData
->fActualBreaks
.setCharAt(i
, 1);
807 checkResults("testForwards", FORWARD
, status
);
810 void RBBIMonkeyImpl::testIsBoundaryRandom(UErrorCode
&status
) {
811 if (U_FAILURE(status
)) {
814 fBI
->setText(fTestData
->fString
);
816 int stringLen
= fTestData
->fString
.length();
817 for (int i
=stringLen
; i
>=0; --i
) {
818 int strIdx
= fRandomGenerator() % stringLen
;
819 if (fTestData
->fExpectedBreaks
.charAt(strIdx
) != fBI
->isBoundary(strIdx
)) {
820 IntlTest::gTest
->errln("%s:%d testIsBoundaryRandom failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
821 __FILE__
, __LINE__
, strIdx
, fRuleFileName
, fTestData
->fRandomSeed
);
825 status
= U_INVALID_STATE_ERROR
;
833 void RBBIMonkeyImpl::checkResults(const char *msg
, CheckDirection direction
, UErrorCode
&status
) {
834 if (U_FAILURE(status
)) {
837 if (direction
== FORWARD
) {
838 for (int i
=0; i
<=fTestData
->fString
.length(); ++i
) {
839 if (fTestData
->fExpectedBreaks
.charAt(i
) != fTestData
->fActualBreaks
.charAt(i
)) {
841 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
842 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-2), fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
843 fRuleFileName
, fTestData
->fRandomSeed
);
845 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
846 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
847 fRuleFileName
, fTestData
->fRandomSeed
);
852 status
= U_INVALID_STATE_ERROR
; // Prevent the test from continuing, which would likely
853 break; // produce many redundant errors.
857 for (int i
=fTestData
->fString
.length(); i
>=0; i
--) {
858 if (fTestData
->fExpectedBreaks
.charAt(i
) != fTestData
->fActualBreaks
.charAt(i
)) {
860 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
861 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-2), fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
862 fRuleFileName
, fTestData
->fRandomSeed
);
864 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
865 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
866 fRuleFileName
, fTestData
->fRandomSeed
);
871 status
= U_INVALID_STATE_ERROR
;
880 //---------------------------------------------------------------------------------------
882 // class RBBIMonkeyTest implementation.
884 //---------------------------------------------------------------------------------------
885 RBBIMonkeyTest::RBBIMonkeyTest() {
888 RBBIMonkeyTest::~RBBIMonkeyTest() {
// params, taken from this->fParams.
//   rules=file_name   Name of file containing the reference rules.
//   seed=nnnnn        Random number starting seed.
//                     Setting the seed allows errors to be reproduced.
//   loop=nnn          Looping count. Controls running time.
//                       negative:     no loop limit, run until stopped.
//                       0 or greater: run length.
//   expansions        debug option, show expansions of rules and sets.
//   verbose           Display details of the failure.
//
// Parameters on the intltest command line follow the test name, and are preceded by '@'.
// For example:
//     intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
906 void RBBIMonkeyTest::testMonkey() {
907 // printf("Test parameters: %s\n", fParams);
908 UnicodeString
params(fParams
);
909 UErrorCode status
= U_ZERO_ERROR
;
911 const char *tests
[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
912 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
914 CharString testNameFromParams
;
915 if (getStringParam("rules", params
, testNameFromParams
, status
)) {
916 tests
[0] = testNameFromParams
.data();
920 int64_t loopCount
= quick
? 100 : 5000;
921 getIntParam("loop", params
, loopCount
, status
);
923 UBool dumpExpansions
= FALSE
;
924 getBoolParam("expansions", params
, dumpExpansions
, status
);
926 UBool verbose
= FALSE
;
927 getBoolParam("verbose", params
, verbose
, status
);
930 getIntParam("seed", params
, seed
, status
);
932 if (params
.length() != 0) {
933 // Options processing did not consume all of the parameters. Something unrecognized was present.
934 CharString unrecognizedParameters
;
935 unrecognizedParameters
.append(CStr(params
)(), -1, status
);
936 errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__
, __LINE__
, unrecognizedParameters
.data());
940 UVector
startedTests(status
);
941 if (U_FAILURE(status
)) {
942 errln("%s:%d: error %s while setting up test.", __FILE__
, __LINE__
, u_errorName(status
));
946 // Monkey testing is multi-threaded.
947 // Each set of break rules to be tested is run in a separate thread.
948 // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
950 for (i
=0; tests
[i
] != NULL
; ++i
) {
951 logln("beginning testing of %s", tests
[i
]);
952 LocalPointer
<RBBIMonkeyImpl
> test(new RBBIMonkeyImpl(status
));
953 if (U_FAILURE(status
)) {
954 dataerrln("%s:%d: error %s while starting test %s.", __FILE__
, __LINE__
, u_errorName(status
), tests
[i
]);
957 test
->fDumpExpansions
= dumpExpansions
;
958 test
->fVerbose
= verbose
;
959 test
->fRandomGenerator
.seed((uint32_t)seed
);
960 test
->fLoopCount
= loopCount
;
961 test
->setup(tests
[i
], status
);
962 if (U_FAILURE(status
)) {
963 dataerrln("%s:%d: error %s while starting test %s.", __FILE__
, __LINE__
, u_errorName(status
), tests
[i
]);
967 startedTests
.addElement(test
.orphan(), status
);
968 if (U_FAILURE(status
)) {
969 errln("%s:%d: error %s while starting test %s.", __FILE__
, __LINE__
, u_errorName(status
), tests
[i
]);
974 for (i
=0; i
<startedTests
.size(); ++i
) {
975 RBBIMonkeyImpl
*test
= static_cast<RBBIMonkeyImpl
*>(startedTests
.elementAt(i
));
982 UBool
RBBIMonkeyTest::getIntParam(UnicodeString name
, UnicodeString
¶ms
, int64_t &val
, UErrorCode
&status
) {
983 name
.append(" *= *(-?\\d+) *,? *");
984 RegexMatcher
m(name
, params
, 0, status
);
986 // The param exists. Convert the string to an int.
988 str
.append(CStr(m
.group(1, status
))(), -1, status
);
989 val
= strtol(str
.data(), NULL
, 10);
991 // Delete this parameter from the params string.
993 params
= m
.replaceFirst(UnicodeString(), status
);
999 UBool
RBBIMonkeyTest::getStringParam(UnicodeString name
, UnicodeString
¶ms
, CharString
&dest
, UErrorCode
&status
) {
1000 name
.append(" *= *([^ ,]*) *,? *");
1001 RegexMatcher
m(name
, params
, 0, status
);
1003 // The param exists.
1004 dest
.append(CStr(m
.group(1, status
))(), -1, status
);
1006 // Delete this parameter from the params string.
1008 params
= m
.replaceFirst(UnicodeString(), status
);
1014 UBool
RBBIMonkeyTest::getBoolParam(UnicodeString name
, UnicodeString
¶ms
, UBool
&dest
, UErrorCode
&status
) {
1015 name
.append("(?: *= *(true|false))? *,? *");
1016 RegexMatcher
m(name
, params
, UREGEX_CASE_INSENSITIVE
, status
);
1018 if (m
.start(1, status
) > 0) {
1019 // user option included a value.
1020 dest
= m
.group(1, status
).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT
) == 0;
1022 // No explicit user value, implies true.
1026 // Delete this parameter from the params string.
1028 params
= m
.replaceFirst(UnicodeString(), status
);
1034 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */