1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * Copyright (c) 2016, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 ********************************************************************/
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
13 #include "rbbimonkeytest.h"
14 #include "unicode/utypes.h"
15 #include "unicode/brkiter.h"
16 #include "unicode/utf16.h"
17 #include "unicode/uniset.h"
18 #include "unicode/unistr.h"
35 void RBBIMonkeyTest::runIndexedTest(int32_t index
, UBool exec
, const char* &name
, char* params
) {
36 fParams
= params
; // Work around TESTCASE_AUTO not being able to pass params to test function.
39 TESTCASE_AUTO(testMonkey
);
43 //---------------------------------------------------------------------------------------
45 // class BreakRule implementation.
47 //---------------------------------------------------------------------------------------
49 BreakRule::BreakRule() // : all field default initialized.
53 BreakRule::~BreakRule() {}
56 //---------------------------------------------------------------------------------------
58 // class BreakRules implementation.
60 //---------------------------------------------------------------------------------------
61 BreakRules::BreakRules(RBBIMonkeyImpl
*monkeyImpl
, UErrorCode
&status
) :
62 fMonkeyImpl(monkeyImpl
), fBreakRules(status
), fType(UBRK_COUNT
) {
63 fCharClasses
.adoptInstead(uhash_open(uhash_hashUnicodeString
,
64 uhash_compareUnicodeString
,
65 NULL
, // value comparator.
67 if (U_FAILURE(status
)) {
70 uhash_setKeyDeleter(fCharClasses
.getAlias(), uprv_deleteUObject
);
71 uhash_setValueDeleter(fCharClasses
.getAlias(), uprv_deleteUObject
);
72 fBreakRules
.setDeleter(uprv_deleteUObject
);
74 fCharClassList
.adoptInstead(new UVector(status
));
76 fSetRefsMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
77 "(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative lookbehind for '{' or '=' or '[:'
78 // (the identifier is a unicode property name or value)
79 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name
82 // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
83 fCommentsMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
84 "(^|(?<=;))" // Start either at start of line, or just after a ';' (look-behind for ';')
85 "[ \\t]*+" // Match white space.
86 "(#.*)?+" // Optional # plus whatever follows
87 "\\R$" // new-line at end of line.
90 // Match (initial parse) of a character class defintion line.
91 fClassDefMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
92 "[ \\t]*" // leading white space
93 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name
94 "[ \\t]*=[ \\t]*" // =
95 "(?<ClassDef>.*?)" // The char class UnicodeSet expression
96 "[ \\t]*;$"), // ; <end of line>
99 // Match (initial parse) of a break rule line.
100 fRuleDefMatcher
.adoptInstead(new RegexMatcher(UnicodeString(
101 "[ \\t]*" // leading white space
102 "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)" // The rule name
103 "[ \\t]*:[ \\t]*" // :
104 "(?<RuleDef>.*?)" // The rule definition
105 "[ \\t]*;$"), // ; <end of line>
111 BreakRules::~BreakRules() {}
114 CharClass
*BreakRules::addCharClass(const UnicodeString
&name
, const UnicodeString
&definition
, UErrorCode
&status
) {
116 // Create the expanded definition for this char class,
117 // replacing any set references with the corresponding definition.
119 UnicodeString expandedDef
;
120 UnicodeString emptyString
;
121 fSetRefsMatcher
->reset(definition
);
122 while (fSetRefsMatcher
->find() && U_SUCCESS(status
)) {
123 const UnicodeString name
=
124 fSetRefsMatcher
->group(fSetRefsMatcher
->pattern().groupNumberFromName("ClassName", status
), status
);
125 CharClass
*nameClass
= static_cast<CharClass
*>(uhash_get(fCharClasses
.getAlias(), &name
));
126 const UnicodeString
&expansionForName
= nameClass
? nameClass
->fExpandedDef
: name
;
128 fSetRefsMatcher
->appendReplacement(expandedDef
, emptyString
, status
);
129 expandedDef
.append(expansionForName
);
131 fSetRefsMatcher
->appendTail(expandedDef
);
133 // Verify that the expanded set defintion is valid.
135 if (fMonkeyImpl
->fDumpExpansions
) {
136 printf("epandedDef: %s\n", CStr(expandedDef
)());
139 UnicodeSet
*s
= new UnicodeSet(expandedDef
, USET_IGNORE_SPACE
, NULL
, status
);
140 if (U_FAILURE(status
)) {
141 IntlTest::gTest
->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__
, __LINE__
,
142 u_errorName(status
), CStr(name
)());
145 CharClass
*cclass
= new CharClass(name
, definition
, expandedDef
, s
);
146 CharClass
*previousClass
= static_cast<CharClass
*>(uhash_put(fCharClasses
.getAlias(),
147 new UnicodeString(name
), // Key, owned by hash table.
148 cclass
, // Value, owned by hash table.
151 if (previousClass
!= NULL
) {
152 // Duplicate class def.
153 // These are legitimate, they are adustments of an existing class.
154 // TODO: will need to keep the old around when we handle tailorings.
155 IntlTest::gTest
->logln("Redefinition of character class %s\n", CStr(cclass
->fName
)());
156 delete previousClass
;
162 void BreakRules::addRule(const UnicodeString
&name
, const UnicodeString
&definition
, UErrorCode
&status
) {
163 LocalPointer
<BreakRule
> thisRule(new BreakRule
);
164 thisRule
->fName
= name
;
165 thisRule
->fRule
= definition
;
167 // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes,
168 // This gives a numeric sort order that matches Unicode UAX rule numbering conventions.
169 UnicodeString emptyString
;
171 // Expand the char class definitions within the rule.
172 fSetRefsMatcher
->reset(definition
);
173 while (fSetRefsMatcher
->find() && U_SUCCESS(status
)) {
174 const UnicodeString name
=
175 fSetRefsMatcher
->group(fSetRefsMatcher
->pattern().groupNumberFromName("ClassName", status
), status
);
176 CharClass
*nameClass
= static_cast<CharClass
*>(uhash_get(fCharClasses
.getAlias(), &name
));
178 IntlTest::gTest
->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
179 __FILE__
, __LINE__
, CStr(name
)(), CStr(definition
)());
181 const UnicodeString
&expansionForName
= nameClass
? nameClass
->fExpandedDef
: name
;
183 fSetRefsMatcher
->appendReplacement(thisRule
->fExpandedRule
, emptyString
, status
);
184 thisRule
->fExpandedRule
.append(expansionForName
);
186 fSetRefsMatcher
->appendTail(thisRule
->fExpandedRule
);
188 // Replace the divide sign (\u00f7) with a regular expression named capture.
189 // When running the rules, a match that includes this group means we found a break position.
191 int32_t dividePos
= thisRule
->fExpandedRule
.indexOf((UChar
)0x00f7);
192 if (dividePos
>= 0) {
193 thisRule
->fExpandedRule
.replace(dividePos
, 1, UnicodeString("(?<BreakPosition>)"));
195 if (thisRule
->fExpandedRule
.indexOf((UChar
)0x00f7) != -1) {
196 status
= U_ILLEGAL_ARGUMENT_ERROR
; // TODO: produce a good error message.
199 // UAX break rule set definitions can be empty, just [].
200 // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
201 // also matches nothing.
203 static const UChar emptySet
[] = {(UChar
)0x5b, (UChar
)0x5d, 0};
205 while ((where
= thisRule
->fExpandedRule
.indexOf(emptySet
, 2, 0)) >= 0) {
206 thisRule
->fExpandedRule
.replace(where
, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
208 if (fMonkeyImpl
->fDumpExpansions
) {
209 printf("fExpandedRule: %s\n", CStr(thisRule
->fExpandedRule
)());
212 // Compile a regular expression for this rule.
213 thisRule
->fRuleMatcher
.adoptInstead(new RegexMatcher(thisRule
->fExpandedRule
, UREGEX_COMMENTS
| UREGEX_DOTALL
, status
));
214 if (U_FAILURE(status
)) {
215 IntlTest::gTest
->errln("%s:%d Error creating regular expression for %s",
216 __FILE__
, __LINE__
, CStr(thisRule
->fExpandedRule
)());
220 // Put this new rule into the vector of all Rules.
221 fBreakRules
.addElement(thisRule
.orphan(), status
);
225 bool BreakRules::setKeywordParameter(const UnicodeString
&keyword
, const UnicodeString
&value
, UErrorCode
&status
) {
226 if (keyword
== UnicodeString("locale")) {
227 CharString localeName
;
228 localeName
.append(CStr(value
)(), -1, status
);
229 fLocale
= Locale::createFromName(localeName
.data());
232 if (keyword
== UnicodeString("type")) {
233 if (value
== UnicodeString("grapheme")) {
234 fType
= UBRK_CHARACTER
;
235 } else if (value
== UnicodeString("word")) {
237 } else if (value
== UnicodeString("line")) {
239 } else if (value
== UnicodeString("sentence")) {
240 fType
= UBRK_SENTENCE
;
242 IntlTest::gTest
->errln("%s:%d Unrecognized break type %s", __FILE__
, __LINE__
, CStr(value
)());
246 // TODO: add tailoring base setting here.
250 RuleBasedBreakIterator
*BreakRules::createICUBreakIterator(UErrorCode
&status
) {
251 if (U_FAILURE(status
)) {
254 RuleBasedBreakIterator
*bi
= NULL
;
257 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createCharacterInstance(fLocale
, status
));
260 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createWordInstance(fLocale
, status
));
263 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createLineInstance(fLocale
, status
));
266 bi
= dynamic_cast<RuleBasedBreakIterator
*>(BreakIterator::createSentenceInstance(fLocale
, status
));
269 IntlTest::gTest
->errln("%s:%d Bad break iterator type of %d", __FILE__
, __LINE__
, fType
);
270 status
= U_ILLEGAL_ARGUMENT_ERROR
;
276 void BreakRules::compileRules(UCHARBUF
*rules
, UErrorCode
&status
) {
277 if (U_FAILURE(status
)) {
281 UnicodeString emptyString
;
282 for (int32_t lineNumber
=0; ;lineNumber
++) { // Loop once per input line.
283 if (U_FAILURE(status
)) {
286 int32_t lineLength
= 0;
287 const UChar
*lineBuf
= ucbuf_readline(rules
, &lineLength
, &status
);
288 if (lineBuf
== NULL
) {
291 UnicodeString
line(lineBuf
, lineLength
);
293 // Strip comment lines.
294 fCommentsMatcher
->reset(line
);
295 line
= fCommentsMatcher
->replaceFirst(emptyString
, status
);
296 if (line
.isEmpty()) {
300 // Recognize character class definition and keyword lines
301 fClassDefMatcher
->reset(line
);
302 if (fClassDefMatcher
->matches(status
)) {
303 UnicodeString className
= fClassDefMatcher
->group(fClassDefMatcher
->pattern().groupNumberFromName("ClassName", status
), status
);
304 UnicodeString classDef
= fClassDefMatcher
->group(fClassDefMatcher
->pattern().groupNumberFromName("ClassDef", status
), status
);
305 if (fMonkeyImpl
->fDumpExpansions
) {
306 printf("scanned class: %s = %s\n", CStr(className
)(), CStr(classDef
)());
308 if (setKeywordParameter(className
, classDef
, status
)) {
309 // The scanned item was "type = ..." or "locale = ...", etc.
310 // which are not actual character classes.
313 addCharClass(className
, classDef
, status
);
317 // Recognize rule lines.
318 fRuleDefMatcher
->reset(line
);
319 if (fRuleDefMatcher
->matches(status
)) {
320 UnicodeString ruleName
= fRuleDefMatcher
->group(fRuleDefMatcher
->pattern().groupNumberFromName("RuleName", status
), status
);
321 UnicodeString ruleDef
= fRuleDefMatcher
->group(fRuleDefMatcher
->pattern().groupNumberFromName("RuleDef", status
), status
);
322 if (fMonkeyImpl
->fDumpExpansions
) {
323 printf("scanned rule: %s : %s\n", CStr(ruleName
)(), CStr(ruleDef
)());
325 addRule(ruleName
, ruleDef
, status
);
329 IntlTest::gTest
->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
330 __FILE__
, __LINE__
, fMonkeyImpl
->fRuleFileName
, CStr(line
)());
333 // Build the vector of char classes, omitting the dictionary class if there is one.
334 // This will be used when constructing the random text to be tested.
336 // Also compute the "other" set, consisting of any characters not included in
337 // one or more of the user defined sets.
339 UnicodeSet
otherSet((UChar32
)0, 0x10ffff);
340 int32_t pos
= UHASH_FIRST
;
341 const UHashElement
*el
= NULL
;
342 while ((el
= uhash_nextElement(fCharClasses
.getAlias(), &pos
)) != NULL
) {
343 const UnicodeString
*ccName
= static_cast<const UnicodeString
*>(el
->key
.pointer
);
344 CharClass
*cclass
= static_cast<CharClass
*>(el
->value
.pointer
);
345 // printf(" Adding %s\n", CStr(*ccName)());
346 if (*ccName
!= cclass
->fName
) {
347 IntlTest::gTest
->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
348 __FILE__
, __LINE__
, CStr(*ccName
)(), CStr(cclass
->fName
)());
350 const UnicodeSet
*set
= cclass
->fSet
.getAlias();
351 otherSet
.removeAll(*set
);
352 if (*ccName
== UnicodeString("dictionary")) {
353 fDictionarySet
= *set
;
355 fCharClassList
->addElement(cclass
, status
);
359 if (!otherSet
.isEmpty()) {
360 // fprintf(stderr, "have an other set.\n");
361 UnicodeString pattern
;
362 CharClass
*cclass
= addCharClass(UnicodeString("__Others"), otherSet
.toPattern(pattern
), status
);
363 fCharClassList
->addElement(cclass
, status
);
368 const CharClass
*BreakRules::getClassForChar(UChar32 c
, int32_t *iter
) const {
369 int32_t localIter
= 0;
370 int32_t &it
= iter
? *iter
: localIter
;
372 while (it
< fCharClassList
->size()) {
373 const CharClass
*cc
= static_cast<const CharClass
*>(fCharClassList
->elementAt(it
));
375 if (cc
->fSet
->contains(c
)) {
382 //---------------------------------------------------------------------------------------
384 // class MonkeyTestData implementation.
386 //---------------------------------------------------------------------------------------
388 void MonkeyTestData::set(BreakRules
*rules
, IntlTest::icu_rand
&rand
, UErrorCode
&status
) {
389 const int32_t dataLength
= 1000;
391 // Fill the test string with random characters.
392 // First randomly pick a char class, then randomly pick a character from that class.
393 // Exclude any characters from the dictionary set.
395 // std::cout << "Populating Test Data" << std::endl;
396 fRandomSeed
= rand
.getSeed(); // Save initial seed for use in error messages,
397 // allowing recreation of failing data.
400 for (int32_t n
=0; n
<dataLength
;) {
401 int charClassIndex
= rand() % rules
->fCharClassList
->size();
402 const CharClass
*cclass
= static_cast<CharClass
*>(rules
->fCharClassList
->elementAt(charClassIndex
));
403 if (cclass
->fSet
->size() == 0) {
404 // Some rules or tailorings do end up with empty char classes.
407 int32_t charIndex
= rand() % cclass
->fSet
->size();
408 UChar32 c
= cclass
->fSet
->charAt(charIndex
);
409 if (U16_IS_TRAIL(c
) && fString
.length() > 0 && U16_IS_LEAD(fString
.charAt(fString
.length()-1))) {
410 // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
411 // Don't let random unpaired surrogates combine in the test data because they might
412 // produce an unwanted dictionary character.
416 if (!rules
->fDictionarySet
.contains(c
)) {
422 // Reset each rule matcher regex with this new string.
423 // (Although we are always using the same string object, ICU regular expressions
424 // don't like the underlying string data changing without doing a reset).
426 for (int32_t ruleNum
=0; ruleNum
<rules
->fBreakRules
.size(); ruleNum
++) {
427 BreakRule
*rule
= static_cast<BreakRule
*>(rules
->fBreakRules
.elementAt(ruleNum
));
428 rule
->fRuleMatcher
->reset(fString
);
431 // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
432 // Expected and Actual breaks are one longer than the input string; a non-zero value
433 // will indicate a boundary preceding that position.
436 fExpectedBreaks
= fActualBreaks
;
437 fRuleForPosition
= fActualBreaks
;
438 f2ndRuleForPos
= fActualBreaks
;
440 // Apply reference rules to find the expected breaks.
442 fExpectedBreaks
.setCharAt(0, (UChar
)1); // Force an expected break before the start of the text.
443 // ICU always reports a break there.
444 // The reference rules do not have a means to do so.
446 while (strIdx
< fString
.length()) {
447 BreakRule
*matchingRule
= NULL
;
448 UBool hasBreak
= FALSE
;
450 int32_t matchStart
= 0;
451 int32_t matchEnd
= 0;
452 int32_t breakGroup
= 0;
453 for (ruleNum
=0; ruleNum
<rules
->fBreakRules
.size(); ruleNum
++) {
454 BreakRule
*rule
= static_cast<BreakRule
*>(rules
->fBreakRules
.elementAt(ruleNum
));
455 rule
->fRuleMatcher
->reset();
456 if (rule
->fRuleMatcher
->lookingAt(strIdx
, status
)) {
457 // A candidate rule match, check further to see if we take it or continue to check other rules.
458 // Matches of zero or one codepoint count only if they also specify a break.
459 matchStart
= rule
->fRuleMatcher
->start(status
);
460 matchEnd
= rule
->fRuleMatcher
->end(status
);
461 breakGroup
= rule
->fRuleMatcher
->pattern().groupNumberFromName("BreakPosition", status
);
462 hasBreak
= U_SUCCESS(status
);
463 if (status
== U_REGEX_INVALID_CAPTURE_GROUP_NAME
) {
464 status
= U_ZERO_ERROR
;
466 if (hasBreak
|| fString
.moveIndex32(matchStart
, 1) < matchEnd
) {
472 if (matchingRule
== NULL
) {
473 // No reference rule matched. This is an error in the rules that should never happen.
474 IntlTest::gTest
->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
475 __FILE__
, __LINE__
, strIdx
);
477 status
= U_INVALID_FORMAT_ERROR
;
480 if (matchingRule
->fRuleMatcher
->group(status
).length() == 0) {
481 // Zero length rule match. This is also an error in the rule expressions.
482 IntlTest::gTest
->errln("%s:%d Zero length rule match.",
484 status
= U_INVALID_FORMAT_ERROR
;
488 // Record which rule matched over the length of the match.
489 for (int i
= matchStart
; i
< matchEnd
; i
++) {
490 if (fRuleForPosition
.charAt(i
) == 0) {
491 fRuleForPosition
.setCharAt(i
, (UChar
)ruleNum
);
493 f2ndRuleForPos
.setCharAt(i
, (UChar
)ruleNum
);
497 // Break positions appear in rules as a matching named capture of zero length at the break position,
498 // the adjusted pattern contains (?<BreakPosition>)
500 int32_t breakPos
= matchingRule
->fRuleMatcher
->start(breakGroup
, status
);
501 if (U_FAILURE(status
) || breakPos
< 0) {
502 // Rule specified a break, but that break wasn't part of the match, even
503 // though the rule as a whole matched.
504 // Can't happen with regular expressions derived from (equivalent to) ICU break rules.
505 // Shouldn't get here.
506 IntlTest::gTest
->errln("%s:%d Internal Rule Error.", __FILE__
, __LINE__
);
507 status
= U_INVALID_FORMAT_ERROR
;
510 fExpectedBreaks
.setCharAt(breakPos
, (UChar
)1);
511 // printf("recording break at %d\n", breakPos);
512 // For the next iteration, pick up applying rules immediately after the break,
513 // which may differ from end of the match. The matching rule may have included
514 // context following the boundary that needs to be looked at again.
515 strIdx
= matchingRule
->fRuleMatcher
->end(breakGroup
, status
);
517 // Original rule didn't specify a break.
518 // Continue applying rules starting on the last code point of this match.
519 strIdx
= fString
.moveIndex32(matchEnd
, -1);
520 if (strIdx
== matchStart
) {
521 // Match was only one code point, no progress if we continue.
522 // Shouldn't get here, case is filtered out at top of loop.
524 ruleName
.appendInvariantChars(matchingRule
->fName
, status
);
525 IntlTest::gTest
->errln("%s:%d Rule %s internal error",
526 __FILE__
, __LINE__
, ruleName
.data());
527 status
= U_INVALID_FORMAT_ERROR
;
531 if (U_FAILURE(status
)) {
532 IntlTest::gTest
->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
533 __FILE__
, __LINE__
, u_errorName(status
));
539 void MonkeyTestData::clearActualBreaks() {
540 fActualBreaks
.remove();
541 // Actual Breaks length is one longer than the data string length, allowing
542 // for breaks before the first and after the last character in the data.
543 for (int32_t i
=0; i
<=fString
.length(); i
++) {
544 fActualBreaks
.append((UChar
)0);
548 void MonkeyTestData::dump(int32_t around
) const {
550 " char break Rule Character\n"
551 " pos code class R I name name\n"
552 "---------------------------------------------------------------------------------------------\n");
559 end
= fString
.length();
561 // Display context around a failure.
562 start
= fString
.moveIndex32(around
, -30);
563 end
= fString
.moveIndex32(around
, +30);
566 for (int charIdx
= start
; charIdx
< end
; charIdx
=fString
.moveIndex32(charIdx
, 1)) {
567 UErrorCode status
= U_ZERO_ERROR
;
568 UChar32 c
= fString
.char32At(charIdx
);
569 const CharClass
*cc
= fBkRules
->getClassForChar(c
);
571 ccName
.appendInvariantChars(cc
->fName
, status
);
572 CharString ruleName
, secondRuleName
;
573 const BreakRule
*rule
= static_cast<BreakRule
*>(fBkRules
->fBreakRules
.elementAt(fRuleForPosition
.charAt(charIdx
)));
574 ruleName
.appendInvariantChars(rule
->fName
, status
);
575 if (f2ndRuleForPos
.charAt(charIdx
) > 0) {
576 const BreakRule
*secondRule
= static_cast<BreakRule
*>(fBkRules
->fBreakRules
.elementAt(f2ndRuleForPos
.charAt(charIdx
)));
577 secondRuleName
.appendInvariantChars(secondRule
->fName
, status
);
580 u_charName(c
, U_EXTENDED_CHAR_NAME
, cName
, sizeof(cName
), &status
);
582 printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n",
583 charIdx
, c
, ccName
.data(),
584 fExpectedBreaks
.charAt(charIdx
) ? '*' : '.',
585 fActualBreaks
.charAt(charIdx
) ? '*' : '.',
586 ruleName
.data(), secondRuleName
.data(), cName
592 //---------------------------------------------------------------------------------------
594 // class RBBIMonkeyImpl
596 //---------------------------------------------------------------------------------------
598 RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode
&status
) : fDumpExpansions(FALSE
), fThread(this) {
599 (void)status
; // suppress unused parameter compiler warning.
603 // RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the
604 // reference rules and creating the icu breakiterator to test,
605 // with its type and locale coming from the reference rules.
607 void RBBIMonkeyImpl::setup(const char *ruleFile
, UErrorCode
&status
) {
608 fRuleFileName
= ruleFile
;
609 openBreakRules(ruleFile
, status
);
610 if (U_FAILURE(status
)) {
611 IntlTest::gTest
->errln("%s:%d Error %s opening file %s.", __FILE__
, __LINE__
, u_errorName(status
), ruleFile
);
614 fRuleSet
.adoptInstead(new BreakRules(this, status
));
615 fRuleSet
->compileRules(fRuleCharBuffer
.getAlias(), status
);
616 if (U_FAILURE(status
)) {
617 IntlTest::gTest
->errln("%s:%d Error %s processing file %s.", __FILE__
, __LINE__
, u_errorName(status
), ruleFile
);
620 fBI
.adoptInstead(fRuleSet
->createICUBreakIterator(status
));
621 fTestData
.adoptInstead(new MonkeyTestData());
625 RBBIMonkeyImpl::~RBBIMonkeyImpl() {
629 void RBBIMonkeyImpl::openBreakRules(const char *fileName
, UErrorCode
&status
) {
631 path
.append(IntlTest::getSourceTestData(status
), status
);
632 path
.append("break_rules" U_FILE_SEP_STRING
, status
);
633 path
.appendPathPart(fileName
, status
);
634 const char *codePage
= "UTF-8";
635 fRuleCharBuffer
.adoptInstead(ucbuf_open(path
.data(), &codePage
, TRUE
, FALSE
, &status
));
639 void RBBIMonkeyImpl::startTest() {
640 fThread
.start(); // invokes runTest() in a separate thread.
643 void RBBIMonkeyImpl::join() {
// Report a break iterator failure at 'index', with the parameters needed to
// reproduce it, and set status to U_INVALID_STATE_ERROR.
// The test data is dumped as well when fVerbose is set.
// Uses status, fRuleFileName, fTestData and fVerbose from the scope in which it is expanded.
// The do { } while (0) wrapper makes the expansion a single statement, so that
// "MONKEY_ERROR(...);" is also safe as the body of an unbraced if / else.
#define MONKEY_ERROR(msg, index) do { \
    IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
                    __FILE__, __LINE__, msg, index, fRuleFileName, fTestData->fRandomSeed); \
    if (fVerbose) { fTestData->dump(index); } \
    status = U_INVALID_STATE_ERROR; \
} while (0)
655 void RBBIMonkeyImpl::runTest() {
656 UErrorCode status
= U_ZERO_ERROR
;
657 int32_t errorCount
= 0;
658 for (int64_t loopCount
= 0; fLoopCount
< 0 || loopCount
< fLoopCount
; loopCount
++) {
659 status
= U_ZERO_ERROR
;
660 fTestData
->set(fRuleSet
.getAlias(), fRandomGenerator
, status
);
662 IntlTest::gTest
->dataerrln("Unable to run test because fBI is null.");
665 if ( uprv_strcmp(fRuleFileName
,"line_loose_cj.txt") == 0 && fTestData
->fRandomSeed
==1712915859 ) {
666 continue; // known bug around index 103-104, break expected/actual 0/1, fwd 0020 200D | FDFC, rev 1325A 0020 | 200D
668 // fTestData->dump();
669 testForwards(status
);
670 testPrevious(status
);
671 testFollowing(status
);
672 testPreceding(status
);
673 testIsBoundary(status
);
675 if (fLoopCount
< 0 && loopCount
% 100 == 0) {
676 fprintf(stderr
, ".");
678 if (U_FAILURE(status
)) {
679 if (++errorCount
> 10) {
686 void RBBIMonkeyImpl::testForwards(UErrorCode
&status
) {
687 if (U_FAILURE(status
)) {
690 fTestData
->clearActualBreaks();
691 fBI
->setText(fTestData
->fString
);
692 int32_t previousBreak
= -2;
693 for (int32_t bk
=fBI
->first(); bk
!= BreakIterator::DONE
; bk
=fBI
->next()) {
694 if (bk
<= previousBreak
) {
695 MONKEY_ERROR("Break Iterator Stall", bk
);
698 if (bk
< 0 || bk
> fTestData
->fString
.length()) {
699 MONKEY_ERROR("Boundary out of bounds", bk
);
702 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
704 checkResults("testForwards", FORWARD
, status
);
707 void RBBIMonkeyImpl::testFollowing(UErrorCode
&status
) {
708 if (U_FAILURE(status
)) {
711 fTestData
->clearActualBreaks();
712 fBI
->setText(fTestData
->fString
);
713 int32_t nextBreak
= -1;
714 for (int32_t i
=-1 ; i
<fTestData
->fString
.length(); ++i
) {
715 int32_t bk
= fBI
->following(i
);
716 if (bk
== BreakIterator::DONE
&& i
== fTestData
->fString
.length()) {
719 if (bk
== nextBreak
&& bk
> i
) {
720 // i is in the gap between two breaks.
723 if (i
== nextBreak
&& bk
> nextBreak
) {
724 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
728 MONKEY_ERROR("following(i)", i
);
731 checkResults("testFollowing", FORWARD
, status
);
736 void RBBIMonkeyImpl::testPrevious(UErrorCode
&status
) {
737 if (U_FAILURE(status
)) {return;}
739 fTestData
->clearActualBreaks();
740 fBI
->setText(fTestData
->fString
);
741 int32_t previousBreak
= INT32_MAX
;
742 for (int32_t bk
=fBI
->last(); bk
!= BreakIterator::DONE
; bk
=fBI
->previous()) {
743 if (bk
>= previousBreak
) {
744 MONKEY_ERROR("Break Iterator Stall", bk
);
747 if (bk
< 0 || bk
> fTestData
->fString
.length()) {
748 MONKEY_ERROR("Boundary out of bounds", bk
);
751 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
753 checkResults("testPrevious", REVERSE
, status
);
757 void RBBIMonkeyImpl::testPreceding(UErrorCode
&status
) {
758 if (U_FAILURE(status
)) {
761 fTestData
->clearActualBreaks();
762 fBI
->setText(fTestData
->fString
);
763 int32_t nextBreak
= fTestData
->fString
.length()+1;
764 for (int32_t i
=fTestData
->fString
.length()+1 ; i
>=0; --i
) {
765 int32_t bk
= fBI
->preceding(i
);
766 // printf("i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak);
767 if (bk
== BreakIterator::DONE
&& i
== 0) {
770 if (bk
== nextBreak
&& bk
< i
) {
771 // i is in the gap between two breaks.
774 if (i
<fTestData
->fString
.length() && fTestData
->fString
.getChar32Start(i
) < i
) {
775 // i indexes to a trailing surrogate.
776 // Break Iterators treat an index to either half as referring to the supplemental code point,
777 // with preceding going to some preceding code point.
778 if (fBI
->preceding(i
) != fBI
->preceding(fTestData
->fString
.getChar32Start(i
))) {
779 MONKEY_ERROR("preceding of trailing surrogate error", i
);
783 if (i
== nextBreak
&& bk
< nextBreak
) {
784 fTestData
->fActualBreaks
.setCharAt(bk
, 1);
788 MONKEY_ERROR("preceding(i)", i
);
791 checkResults("testPreceding", REVERSE
, status
);
795 void RBBIMonkeyImpl::testIsBoundary(UErrorCode
&status
) {
796 if (U_FAILURE(status
)) {
799 fTestData
->clearActualBreaks();
800 fBI
->setText(fTestData
->fString
);
801 for (int i
=fTestData
->fString
.length(); i
>=0; --i
) {
802 if (fBI
->isBoundary(i
)) {
803 fTestData
->fActualBreaks
.setCharAt(i
, 1);
806 checkResults("testForwards", FORWARD
, status
);
809 void RBBIMonkeyImpl::checkResults(const char *msg
, CheckDirection direction
, UErrorCode
&status
) {
810 if (U_FAILURE(status
)) {
813 if (direction
== FORWARD
) {
814 for (int i
=0; i
<=fTestData
->fString
.length(); ++i
) {
815 if (fTestData
->fExpectedBreaks
.charAt(i
) != fTestData
->fActualBreaks
.charAt(i
)) {
817 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
818 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-2), fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
819 fRuleFileName
, fTestData
->fRandomSeed
);
821 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
822 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
823 fRuleFileName
, fTestData
->fRandomSeed
);
828 status
= U_INVALID_STATE_ERROR
; // Prevent the test from continuing, which would likely
829 break; // produce many redundant errors.
833 for (int i
=fTestData
->fString
.length(); i
>=0; i
--) {
834 if (fTestData
->fExpectedBreaks
.charAt(i
) != fTestData
->fActualBreaks
.charAt(i
)) {
836 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
837 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-2), fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
838 fRuleFileName
, fTestData
->fRandomSeed
);
840 IntlTest::gTest
->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
841 __FILE__
, __LINE__
, msg
, i
, fTestData
->fString
.char32At(i
-1), fTestData
->fString
.char32At(i
), fTestData
->fExpectedBreaks
.charAt(i
), fTestData
->fActualBreaks
.charAt(i
),
842 fRuleFileName
, fTestData
->fRandomSeed
);
847 status
= U_INVALID_STATE_ERROR
;
856 //---------------------------------------------------------------------------------------
858 // class RBBIMonkeyTest implementation.
860 //---------------------------------------------------------------------------------------
861 RBBIMonkeyTest::RBBIMonkeyTest() {
864 RBBIMonkeyTest::~RBBIMonkeyTest() {
868 // params, taken from this->fParams.
869 // rules=file_name Name of file containing the reference rules.
870 // seed=nnnnn Random number starting seed.
871 // Setting the seed allows errors to be reproduced.
872 // loop=nnn Looping count. Controls running time.
874 // 0 or greater: run length.
875 // expansions debug option, show expansions of rules and sets.
876 // verbose Display details of the failure.
878 // Parameters on the intltest command line follow the test name, and are preceded by '@'.
880 // intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
882 void RBBIMonkeyTest::testMonkey() {
883 // printf("Test parameters: %s\n", fParams);
884 UnicodeString
params(fParams
);
885 UErrorCode status
= U_ZERO_ERROR
;
887 const char *tests
[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
888 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
890 CharString testNameFromParams
;
891 if (getStringParam("rules", params
, testNameFromParams
, status
)) {
892 tests
[0] = testNameFromParams
.data();
896 int64_t loopCount
= quick
? 100 : 5000;
897 getIntParam("loop", params
, loopCount
, status
);
899 UBool dumpExpansions
= FALSE
;
900 getBoolParam("expansions", params
, dumpExpansions
, status
);
902 UBool verbose
= FALSE
;
903 getBoolParam("verbose", params
, verbose
, status
);
906 getIntParam("seed", params
, seed
, status
);
908 if (params
.length() != 0) {
909 // Options processing did not consume all of the parameters. Something unrecognized was present.
910 CharString unrecognizedParameters
;
911 unrecognizedParameters
.append(CStr(params
)(), -1, status
);
912 errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__
, __LINE__
, unrecognizedParameters
.data());
916 UVector
startedTests(status
);
917 if (U_FAILURE(status
)) {
918 errln("%s:%d: error %s while setting up test.", __FILE__
, __LINE__
, u_errorName(status
));
922 // Monkey testing is multi-threaded.
923 // Each set of break rules to be tested is run in a separate thread.
924 // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
926 for (i
=0; tests
[i
] != NULL
; ++i
) {
927 logln("beginning testing of %s", tests
[i
]);
928 RBBIMonkeyImpl
*test
= new RBBIMonkeyImpl(status
);
929 if (U_FAILURE(status
)) {
930 errln("%s:%d: error %s while starting test %s.", __FILE__
, __LINE__
, u_errorName(status
), tests
[i
]);
933 test
->fDumpExpansions
= dumpExpansions
;
934 test
->fVerbose
= verbose
;
935 test
->fRandomGenerator
.seed((uint32_t)seed
);
936 test
->fLoopCount
= loopCount
;
937 test
->setup(tests
[i
], status
);
938 if (U_FAILURE(status
)) {
939 errln("%s:%d: error %s while starting test %s.", __FILE__
, __LINE__
, u_errorName(status
), tests
[i
]);
943 startedTests
.addElement(test
, status
);
944 if (U_FAILURE(status
)) {
945 errln("%s:%d: error %s while starting test %s.", __FILE__
, __LINE__
, u_errorName(status
), tests
[i
]);
950 for (i
=0; i
<startedTests
.size(); ++i
) {
951 RBBIMonkeyImpl
*test
= static_cast<RBBIMonkeyImpl
*>(startedTests
.elementAt(i
));
958 UBool
RBBIMonkeyTest::getIntParam(UnicodeString name
, UnicodeString
¶ms
, int64_t &val
, UErrorCode
&status
) {
959 name
.append(" *= *(-?\\d+) *,? *");
960 RegexMatcher
m(name
, params
, 0, status
);
962 // The param exists. Convert the string to an int.
964 str
.append(CStr(m
.group(1, status
))(), -1, status
);
965 val
= strtol(str
.data(), NULL
, 10);
967 // Delete this parameter from the params string.
969 params
= m
.replaceFirst(UnicodeString(), status
);
975 UBool
RBBIMonkeyTest::getStringParam(UnicodeString name
, UnicodeString
¶ms
, CharString
&dest
, UErrorCode
&status
) {
976 name
.append(" *= *([^ ,]*) *,? *");
977 RegexMatcher
m(name
, params
, 0, status
);
980 dest
.append(CStr(m
.group(1, status
))(), -1, status
);
982 // Delete this parameter from the params string.
984 params
= m
.replaceFirst(UnicodeString(), status
);
990 UBool
RBBIMonkeyTest::getBoolParam(UnicodeString name
, UnicodeString
¶ms
, UBool
&dest
, UErrorCode
&status
) {
991 name
.append("(?: *= *(true|false))? *,? *");
992 RegexMatcher
m(name
, params
, UREGEX_CASE_INSENSITIVE
, status
);
994 if (m
.start(1, status
) > 0) {
995 // user option included a value.
996 dest
= m
.group(1, status
).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT
) == 0;
998 // No explicit user value, implies true.
1002 // Delete this parameter from the params string.
1004 params
= m
.replaceFirst(UnicodeString(), status
);
1010 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */