icuSources/test/intltest/rbbimonkeytest.cpp

   1 /********************************************************************
   2  * Copyright (c) 2016, International Business Machines Corporation and
   3  * others. All Rights Reserved.
   4  ********************************************************************/
   5
   6
   7 #include "unicode/utypes.h"
   8
   9 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
  10
  11 #include "rbbimonkeytest.h"
  12 #include "unicode/utypes.h"
  13 #include "unicode/brkiter.h"
  14 #include "unicode/utf16.h"
  15 #include "unicode/uniset.h"
  16 #include "unicode/unistr.h"
  17
  18 #include "charstr.h"
  19 #include "cmemory.h"
  20 #include "cstr.h"
  21 #include "uelement.h"
  22 #include "uhash.h"
  23 #include "cstring.h"
  24
  25 #include "iostream"
  26 #include "string"
  27
  28 using namespace icu;
  29
  30
  31 void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) {
  32     fParams = params;            // Work around TESTCASE_AUTO not being able to pass params to test function.
  33
  34     TESTCASE_AUTO_BEGIN;
  35     TESTCASE_AUTO(testMonkey);
  36     TESTCASE_AUTO_END;
  37 }
  38
  39 //---------------------------------------------------------------------------------------
  40 //
  41 //   class BreakRule implementation.
  42 //
  43 //---------------------------------------------------------------------------------------
  44
  45 BreakRule::BreakRule()      // :  all field default initialized.
  46 {
  47 }
  48
  49 BreakRule::~BreakRule() {}
  50
  51
  52 //---------------------------------------------------------------------------------------
  53 //
  54 //   class BreakRules implementation.
  55 //
  56 //---------------------------------------------------------------------------------------
  57 BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status)  :
  58         fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) {
  59     fCharClasses.adoptInstead(uhash_open(uhash_hashUnicodeString,
  60                                          uhash_compareUnicodeString,
  61                                          NULL,      // value comparator.
  62                                          &status));
  63     if (U_FAILURE(status)) {
  64         return;
  65     }
  66     uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
  67     uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
  68     fBreakRules.setDeleter(uprv_deleteUObject);
  69
  70     fCharClassList.adoptInstead(new UVector(status));
  71
  72     fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
  73              "(?!(?:\\{|=|\\[:)[ \\t]{0,4})"              // Negative lookbehind for '{' or '=' or '[:'
  74                                                           //   (the identifier is a unicode property name or value)
  75              "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"),     // The char class name
  76         0, status));
  77
  78     // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
  79     fCommentsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
  80                 "(^|(?<=;))"                    // Start either at start of line, or just after a ';' (look-behind for ';')
  81                 "[ \\t]*+"                      //   Match white space.
  82                 "(#.*)?+"                       //   Optional # plus whatever follows
  83                 "\\R$"                          //   new-line at end of line.
  84             ), 0, status));
  85
  86     // Match (initial parse) of a character class defintion line.
  87     fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
  88                 "[ \\t]*"                                // leading white space
  89                 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"   // The char class name
  90                 "[ \\t]*=[ \\t]*"                        //   =
  91                 "(?<ClassDef>.*?)"                       // The char class UnicodeSet expression
  92                 "[ \\t]*;$"),                     // ; <end of line>
  93             0, status));
  94
  95     // Match (initial parse) of a break rule line.
  96     fRuleDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
  97                 "[ \\t]*"                                // leading white space
  98                 "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)"    // The rule name
  99                 "[ \\t]*:[ \\t]*"                        //   :
 100                 "(?<RuleDef>.*?)"                        // The rule definition
 101                 "[ \\t]*;$"),                            // ; <end of line>
 102             0, status));
 103
 104 }
 105
 106
 107 BreakRules::~BreakRules() {}
 108
 109
 110 CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
 111
 112     // Create the expanded definition for this char class,
 113     // replacing any set references with the corresponding definition.
 114
 115     UnicodeString expandedDef;
 116     UnicodeString emptyString;
 117     fSetRefsMatcher->reset(definition);
 118     while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
 119         const UnicodeString name =
 120                 fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
 121         CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
 122         const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
 123
 124         fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status);
 125         expandedDef.append(expansionForName);
 126     }
 127     fSetRefsMatcher->appendTail(expandedDef);
 128
 129     // Verify that the expanded set defintion is valid.
 130
 131     if (fMonkeyImpl->fDumpExpansions) {
 132         printf("epandedDef: %s\n", CStr(expandedDef)());
 133     }
 134
 135     UnicodeSet *s = new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status);
 136     if (U_FAILURE(status)) {
 137         IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__, __LINE__,
 138                                u_errorName(status), CStr(name)());
 139         return NULL;
 140     }
 141     CharClass *cclass = new CharClass(name, definition, expandedDef, s);
 142     CharClass *previousClass = static_cast<CharClass *>(uhash_put(fCharClasses.getAlias(),
 143                                                         new UnicodeString(name),   // Key, owned by hash table.
 144                                                         cclass,                    // Value, owned by hash table.
 145                                                         &status));
 146
 147     if (previousClass != NULL) {
 148         // Duplicate class def.
 149         // These are legitimate, they are adustments of an existing class.
 150         // TODO: will need to keep the old around when we handle tailorings.
 151         IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)());
 152         delete previousClass;
 153     }
 154     return cclass;
 155 }
 156
 157
 158 void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
 159     LocalPointer<BreakRule> thisRule(new BreakRule);
 160     thisRule->fName = name;
 161     thisRule->fRule = definition;
 162
 163     // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes,
 164     // This gives a numeric sort order that matches Unicode UAX rule numbering conventions.
 165     UnicodeString emptyString;
 166
 167     // Expand the char class definitions within the rule.
 168     fSetRefsMatcher->reset(definition);
 169     while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
 170         const UnicodeString name =
 171                 fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
 172         CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
 173         if (!nameClass) {
 174             IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
 175                 __FILE__, __LINE__, CStr(name)(), CStr(definition)());
 176         }
 177         const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
 178
 179         fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status);
 180         thisRule->fExpandedRule.append(expansionForName);
 181     }
 182     fSetRefsMatcher->appendTail(thisRule->fExpandedRule);
 183
 184     // Replace the divide sign (\u00f7) with a regular expression named capture.
 185     // When running the rules, a match that includes this group means we found a break position.
 186
 187     int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7);
 188     if (dividePos >= 0) {
 189         thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)"));
 190     }
 191     if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) {
 192         status = U_ILLEGAL_ARGUMENT_ERROR;   // TODO: produce a good error message.
 193     }
 194
 195     // UAX break rule set definitions can be empty, just [].
 196     // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
 197     // also matches nothing.
 198
 199     static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0};
 200     int32_t where = 0;
 201     while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) {
 202         thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
 203     }
 204     if (fMonkeyImpl->fDumpExpansions) {
 205         printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)());
 206     }
 207
 208     // Compile a regular expression for this rule.
 209     thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status));
 210     if (U_FAILURE(status)) {
 211         IntlTest::gTest->errln("%s:%d Error creating regular expression for %s",
 212                 __FILE__, __LINE__, CStr(thisRule->fExpandedRule)());
 213         return;
 214     }
 215
 216     // Put this new rule into the vector of all Rules.
 217     fBreakRules.addElement(thisRule.orphan(), status);
 218 }
 219
 220
 221 bool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) {
 222     if (keyword == UnicodeString("locale")) {
 223         CharString localeName;
 224         localeName.append(CStr(value)(), -1, status);
 225         fLocale = Locale::createFromName(localeName.data());
 226         return true;
 227     }
 228     if (keyword == UnicodeString("type")) {
 229         if (value == UnicodeString("grapheme")) {
 230             fType = UBRK_CHARACTER;
 231         } else if (value == UnicodeString("word")) {
 232             fType = UBRK_WORD;
 233         } else if (value == UnicodeString("line")) {
 234             fType = UBRK_LINE;
 235         } else if (value == UnicodeString("sentence")) {
 236             fType = UBRK_SENTENCE;
 237         } else {
 238             IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__,  CStr(value)());
 239         }
 240         return true;
 241     }
 242     // TODO: add tailoring base setting here.
 243     return false;
 244 }
 245
 246 RuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) {
 247     if (U_FAILURE(status)) {
 248         return NULL;
 249     }
 250     RuleBasedBreakIterator *bi = NULL;
 251     switch(fType) {
 252         case UBRK_CHARACTER:
 253             bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createCharacterInstance(fLocale, status));
 254             break;
 255         case UBRK_WORD:
 256             bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createWordInstance(fLocale, status));
 257             break;
 258         case UBRK_LINE:
 259             bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(fLocale, status));
 260             break;
 261         case UBRK_SENTENCE:
 262             bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createSentenceInstance(fLocale, status));
 263             break;
 264         default:
 265             IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType);
 266             status = U_ILLEGAL_ARGUMENT_ERROR;
 267     }
 268     return bi;
 269 }
 270
 271
 272 void BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) {
 273     if (U_FAILURE(status)) {
 274         return;
 275     }
 276
 277     UnicodeString emptyString;
 278     for (int32_t lineNumber=0; ;lineNumber++) {    // Loop once per input line.
 279         if (U_FAILURE(status)) {
 280             return;
 281         }
 282         int32_t lineLength = 0;
 283         const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status);
 284         if (lineBuf == NULL) {
 285             break;
 286         }
 287         UnicodeString line(lineBuf, lineLength);
 288
 289         // Strip comment lines.
 290         fCommentsMatcher->reset(line);
 291         line = fCommentsMatcher->replaceFirst(emptyString, status);
 292         if (line.isEmpty()) {
 293             continue;
 294         }
 295
 296         // Recognize character class definition and keyword lines
 297         fClassDefMatcher->reset(line);
 298         if (fClassDefMatcher->matches(status)) {
 299             UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status);
 300             UnicodeString classDef  = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status);
 301             if (fMonkeyImpl->fDumpExpansions) {
 302                 printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)());
 303             }
 304             if (setKeywordParameter(className, classDef, status)) {
 305                 // The scanned item was "type = ..." or "locale = ...", etc.
 306                 //   which are not actual character classes.
 307                 continue;
 308             }
 309             addCharClass(className, classDef, status);
 310             continue;
 311         }
 312
 313         // Recognize rule lines.
 314         fRuleDefMatcher->reset(line);
 315         if (fRuleDefMatcher->matches(status)) {
 316             UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status);
 317             UnicodeString ruleDef  = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status);
 318             if (fMonkeyImpl->fDumpExpansions) {
 319                 printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)());
 320             }
 321             addRule(ruleName, ruleDef, status);
 322             continue;
 323         }
 324
 325         IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
 326             __FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)());
 327     }
 328
 329     // Build the vector of char classes, omitting the dictionary class if there is one.
 330     // This will be used when constructing the random text to be tested.
 331
 332     // Also compute the "other" set, consisting of any characters not included in
 333     // one or more of the user defined sets.
 334
 335     UnicodeSet otherSet((UChar32)0, 0x10ffff);
 336     int32_t pos = UHASH_FIRST;
 337     const UHashElement *el = NULL;
 338     while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) {
 339         const UnicodeString *ccName = static_cast<const UnicodeString *>(el->key.pointer);
 340         CharClass *cclass = static_cast<CharClass *>(el->value.pointer);
 341         // printf("    Adding %s\n", CStr(*ccName)());
 342         if (*ccName != cclass->fName) {
 343             IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
 344                     __FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)());
 345         }
 346         const UnicodeSet *set = cclass->fSet.getAlias();
 347         otherSet.removeAll(*set);
 348         if (*ccName == UnicodeString("dictionary")) {
 349             fDictionarySet = *set;
 350         } else {
 351             fCharClassList->addElement(cclass, status);
 352         }
 353     }
 354
 355     if (!otherSet.isEmpty()) {
 356         // fprintf(stderr, "have an other set.\n");
 357         UnicodeString pattern;
 358         CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status);
 359         fCharClassList->addElement(cclass, status);
 360     }
 361 }
 362
 363
 364 const CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const {
 365    int32_t localIter = 0;
 366    int32_t &it = iter? *iter : localIter;
 367
 368    while (it < fCharClassList->size()) {
 369        const CharClass *cc = static_cast<const CharClass *>(fCharClassList->elementAt(it));
 370        ++it;
 371        if (cc->fSet->contains(c)) {
 372            return cc;
 373        }
 374     }
 375     return NULL;
 376 }
 377
 378 //---------------------------------------------------------------------------------------
 379 //
 380 //   class MonkeyTestData implementation.
 381 //
 382 //---------------------------------------------------------------------------------------
 383
 384 void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) {
 385     const int32_t dataLength = 1000;
 386
 387     // Fill the test string with random characters.
 388     // First randomly pick a char class, then randomly pick a character from that class.
 389     // Exclude any characters from the dictionary set.
 390
 391     // std::cout << "Populating Test Data" << std::endl;
 392     fRandomSeed = rand.getSeed();         // Save initial seed for use in error messages,
 393                                           // allowing recreation of failing data.
 394     fBkRules = rules;
 395     fString.remove();
 396     for (int32_t n=0; n<dataLength;) {
 397         int charClassIndex = rand() % rules->fCharClassList->size();
 398         const CharClass *cclass = static_cast<CharClass *>(rules->fCharClassList->elementAt(charClassIndex));
 399         if (cclass->fSet->size() == 0) {
 400             // Some rules or tailorings do end up with empty char classes.
 401             continue;
 402         }
 403         int32_t charIndex = rand() % cclass->fSet->size();
 404         UChar32 c = cclass->fSet->charAt(charIndex);
 405         if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) {
 406             // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
 407             // Don't let random unpaired surrogates combine in the test data because they might
 408             // produce an unwanted dictionary character.
 409             continue;
 410         }
 411
 412         if (!rules->fDictionarySet.contains(c)) {
 413             fString.append(c);
 414             ++n;
 415         }
 416     }
 417
 418     // Reset each rule matcher regex with this new string.
 419     //    (Although we are always using the same string object, ICU regular expressions
 420     //    don't like the underlying string data changing without doing a reset).
 421
 422     for (int32_t ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
 423         BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
 424             rule->fRuleMatcher->reset(fString);
 425     }
 426
 427     // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
 428     // Expected and Actual breaks are one longer than the input string; a non-zero value
 429     // will indicate a boundary preceding that position.
 430
 431     clearActualBreaks();
 432     fExpectedBreaks  = fActualBreaks;
 433     fRuleForPosition = fActualBreaks;
 434     f2ndRuleForPos   = fActualBreaks;
 435
 436     // Apply reference rules to find the expected breaks.
 437
 438     fExpectedBreaks.setCharAt(0, (UChar)1);  // Force an expected break before the start of the text.
 439                                              // ICU always reports a break there.
 440                                              // The reference rules do not have a means to do so.
 441     int32_t strIdx = 0;
 442     while (strIdx < fString.length()) {
 443         BreakRule *matchingRule = NULL;
 444         UBool      hasBreak = FALSE;
 445         int32_t ruleNum = 0;
 446         int32_t matchStart = 0;
 447         int32_t matchEnd = 0;
 448         int32_t breakGroup = 0;
 449         for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
 450             BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
 451             rule->fRuleMatcher->reset();
 452             if (rule->fRuleMatcher->lookingAt(strIdx, status)) {
 453                 // A candidate rule match, check further to see if we take it or continue to check other rules.
 454                 // Matches of zero or one codepoint count only if they also specify a break.
 455                 matchStart = rule->fRuleMatcher->start(status);
 456                 matchEnd = rule->fRuleMatcher->end(status);
 457                 breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status);
 458                 hasBreak = U_SUCCESS(status);
 459                 if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) {
 460                     status = U_ZERO_ERROR;
 461                 }
 462                 if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) {
 463                     matchingRule = rule;
 464                     break;
 465                 }
 466             }
 467         }
 468         if (matchingRule == NULL) {
 469             // No reference rule matched. This is an error in the rules that should never happen.
 470             IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
 471                  __FILE__, __LINE__, strIdx);
 472             dump(strIdx);
 473             status = U_INVALID_FORMAT_ERROR;
 474             return;
 475         }
 476         if (matchingRule->fRuleMatcher->group(status).length() == 0) {
 477             // Zero length rule match. This is also an error in the rule expressions.
 478             IntlTest::gTest->errln("%s:%d Zero length rule match.",
 479                 __FILE__, __LINE__);
 480             status =  U_INVALID_FORMAT_ERROR;
 481             return;
 482         }
 483
 484         // Record which rule matched over the length of the match.
 485         for (int i = matchStart; i < matchEnd; i++) {
 486             if (fRuleForPosition.charAt(i) == 0) {
 487                 fRuleForPosition.setCharAt(i, (UChar)ruleNum);
 488             } else {
 489                 f2ndRuleForPos.setCharAt(i, (UChar)ruleNum);
 490             }
 491         }
 492
 493         // Break positions appear in rules as a matching named capture of zero length at the break position,
 494         //   the adjusted pattern contains (?<BreakPosition>)
 495         if (hasBreak) {
 496             int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status);
 497             if (U_FAILURE(status) || breakPos < 0) {
 498                 // Rule specified a break, but that break wasn't part of the match, even
 499                 // though the rule as a whole matched.
 500                 // Can't happen with regular expressions derived from (equivalent to) ICU break rules.
 501                 // Shouldn't get here.
 502                 IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__);
 503                 status =  U_INVALID_FORMAT_ERROR;
 504                 break;
 505             }
 506             fExpectedBreaks.setCharAt(breakPos, (UChar)1);
 507             // printf("recording break at %d\n", breakPos);
 508             // For the next iteration, pick up applying rules immediately after the break,
 509             // which may differ from end of the match. The matching rule may have included
 510             // context following the boundary that needs to be looked at again.
 511             strIdx = matchingRule->fRuleMatcher->end(breakGroup, status);
 512         } else {
 513             // Original rule didn't specify a break.
 514             // Continue applying rules starting on the last code point of this match.
 515             strIdx = fString.moveIndex32(matchEnd, -1);
 516             if (strIdx == matchStart) {
 517                 // Match was only one code point, no progress if we continue.
 518                 // Shouldn't get here, case is filtered out at top of loop.
 519                 CharString ruleName;
 520                 ruleName.appendInvariantChars(matchingRule->fName, status);
 521                 IntlTest::gTest->errln("%s:%d Rule %s internal error",
 522                         __FILE__, __LINE__, ruleName.data());
 523                 status = U_INVALID_FORMAT_ERROR;
 524                 break;
 525             }
 526         }
 527         if (U_FAILURE(status)) {
 528             IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
 529                 __FILE__, __LINE__, u_errorName(status));
 530             break;
 531         }
 532     }
 533 }
 534
 535 void MonkeyTestData::clearActualBreaks() {
 536     fActualBreaks.remove();
 537     // Actual Breaks length is one longer than the data string length, allowing
 538     //    for breaks before the first and after the last character in the data.
 539     for (int32_t i=0; i<=fString.length(); i++) {
 540         fActualBreaks.append((UChar)0);
 541     }
 542 }
 543
 544 void MonkeyTestData::dump(int32_t around) const {
 545     printf("\n"
 546            "         char                        break  Rule                     Character\n"
 547            "   pos   code   class                 R I   name                     name\n"
 548            "---------------------------------------------------------------------------------------------\n");
 549
 550     int32_t start;
 551     int32_t end;
 552
 553     if (around == -1) {
 554         start = 0;
 555         end = fString.length();
 556     } else {
 557         // Display context around a failure.
 558         start = fString.moveIndex32(around, -30);
 559         end = fString.moveIndex32(around, +30);
 560     }
 561
 562     for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) {
 563         UErrorCode status = U_ZERO_ERROR;
 564         UChar32 c = fString.char32At(charIdx);
 565         const CharClass *cc = fBkRules->getClassForChar(c);
 566         CharString ccName;
 567         ccName.appendInvariantChars(cc->fName, status);
 568         CharString ruleName, secondRuleName;
 569         const BreakRule *rule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx)));
 570         ruleName.appendInvariantChars(rule->fName, status);
 571         if (f2ndRuleForPos.charAt(charIdx) > 0) {
 572             const BreakRule *secondRule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx)));
 573             secondRuleName.appendInvariantChars(secondRule->fName, status);
 574         }
 575         char cName[200];
 576         u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
 577
 578         printf("  %4.1d %6.4x   %-20s  %c %c   %-10s %-10s    %s\n",
 579             charIdx, c, ccName.data(),
 580             fExpectedBreaks.charAt(charIdx) ? '*' : '.',
 581             fActualBreaks.charAt(charIdx) ? '*' : '.',
 582             ruleName.data(), secondRuleName.data(), cName
 583         );
 584     }
 585 }
 586
 587
 588 //---------------------------------------------------------------------------------------
 589 //
 590 //   class RBBIMonkeyImpl
 591 //
 592 //---------------------------------------------------------------------------------------
 593
 594 RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status) : fDumpExpansions(FALSE), fThread(this) {
 595     (void)status;    // suppress unused parameter compiler warning.
 596 }
 597
 598
 599 // RBBIMonkeyImpl setup       does all of the setup for a single rule set - compiling the
 600 //                            reference rules and creating the icu breakiterator to test,
 601 //                            with its type and locale coming from the reference rules.
 602
 603 void RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) {
 604     fRuleFileName = ruleFile;
 605     openBreakRules(ruleFile, status);
 606     if (U_FAILURE(status)) {
 607         IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
 608         return;
 609     }
 610     fRuleSet.adoptInstead(new BreakRules(this, status));
 611     fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status);
 612     if (U_FAILURE(status)) {
 613         IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
 614         return;
 615     }
 616     fBI.adoptInstead(fRuleSet->createICUBreakIterator(status));
 617     fTestData.adoptInstead(new MonkeyTestData());
 618 }
 619
 620
 621 RBBIMonkeyImpl::~RBBIMonkeyImpl() {
 622 }
 623
 624
 625 void RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) {
 626     CharString path;
 627     path.append(IntlTest::getSourceTestData(status), status);
 628     path.append("break_rules" U_FILE_SEP_STRING, status);
 629     path.appendPathPart(fileName, status);
 630     const char *codePage = "UTF-8";
 631     fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status));
 632 }
 633
 634
 635 void RBBIMonkeyImpl::startTest() {
 636     fThread.start();   // invokes runTest() in a separate thread.
 637 }
 638
 639 void RBBIMonkeyImpl::join() {
 640     fThread.join();
 641 }
 642
 643
 644 #define MONKEY_ERROR(msg, index) { \
 645     IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
 646                     __FILE__, __LINE__, msg, index, fRuleFileName, fTestData->fRandomSeed); \
 647     if (fVerbose) { fTestData->dump(index); } \
 648     status = U_INVALID_STATE_ERROR;  \
 649 }
 650
 651 void RBBIMonkeyImpl::runTest() {
 652     UErrorCode status = U_ZERO_ERROR;
 653     int32_t errorCount = 0;
 654     for (int64_t loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
 655         status = U_ZERO_ERROR;
 656         fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status);
 657         if (fBI.isNull()) {
 658             IntlTest::gTest->dataerrln("Unable to run test because fBI is null.");
 659             return;
 660         }
 661         if ( uprv_strcmp(fRuleFileName,"line_loose_cj.txt") == 0 && fTestData->fRandomSeed==1712915859 ) {
 662             continue; // known bug around index 103-104, break expected/actual 0/1, fwd 0020 200D | FDFC, rev 1325A 0020 | 200D
 663         }
 664         // fTestData->dump();
 665         testForwards(status);
 666         testPrevious(status);
 667         testFollowing(status);
 668         testPreceding(status);
 669         testIsBoundary(status);
 670
 671         if (fLoopCount < 0 && loopCount % 100 == 0) {
 672             fprintf(stderr, ".");
 673         }
 674         if (U_FAILURE(status)) {
 675             if (++errorCount > 10) {
 676                 return;
 677             }
 678         }
 679     }
 680 }
 681
 682 void RBBIMonkeyImpl::testForwards(UErrorCode &status) {
 683     if (U_FAILURE(status)) {
 684         return;
 685     }
 686     fTestData->clearActualBreaks();
 687     fBI->setText(fTestData->fString);
 688     int32_t previousBreak = -2;
 689     for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) {
 690         if (bk <= previousBreak) {
 691             MONKEY_ERROR("Break Iterator Stall", bk);
 692             return;
 693         }
 694         if (bk < 0 || bk > fTestData->fString.length()) {
 695             MONKEY_ERROR("Boundary out of bounds", bk);
 696             return;
 697         }
 698         fTestData->fActualBreaks.setCharAt(bk, 1);
 699     }
 700     checkResults("testForwards", FORWARD, status);
 701 }
 702
 703 void RBBIMonkeyImpl::testFollowing(UErrorCode &status) {
 704     if (U_FAILURE(status)) {
 705         return;
 706     }
 707     fTestData->clearActualBreaks();
 708     fBI->setText(fTestData->fString);
 709     int32_t nextBreak = -1;
 710     for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) {
 711         int32_t bk = fBI->following(i);
 712         if (bk == BreakIterator::DONE && i == fTestData->fString.length()) {
 713             continue;
 714         }
 715         if (bk == nextBreak && bk > i) {
 716             // i is in the gap between two breaks.
 717             continue;
 718         }
 719         if (i == nextBreak && bk > nextBreak) {
 720             fTestData->fActualBreaks.setCharAt(bk, 1);
 721             nextBreak = bk;
 722             continue;
 723         }
 724         MONKEY_ERROR("following(i)", i);
 725         return;
 726     }
 727     checkResults("testFollowing", FORWARD, status);
 728 }
 729
 730
 731
 732 void RBBIMonkeyImpl::testPrevious(UErrorCode &status) {
 733     if (U_FAILURE(status)) {return;}
 734
 735     fTestData->clearActualBreaks();
 736     fBI->setText(fTestData->fString);
 737     int32_t previousBreak = INT32_MAX;
 738     for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) {
 739          if (bk >= previousBreak) {
 740             MONKEY_ERROR("Break Iterator Stall", bk);
 741             return;
 742         }
 743         if (bk < 0 || bk > fTestData->fString.length()) {
 744             MONKEY_ERROR("Boundary out of bounds", bk);
 745             return;
 746         }
 747         fTestData->fActualBreaks.setCharAt(bk, 1);
 748     }
 749     checkResults("testPrevious", REVERSE, status);
 750 }
 751
 752
 753 void RBBIMonkeyImpl::testPreceding(UErrorCode &status) {
 754     if (U_FAILURE(status)) {
 755         return;
 756     }
 757     fTestData->clearActualBreaks();
 758     fBI->setText(fTestData->fString);
 759     int32_t nextBreak = fTestData->fString.length()+1;
 760     for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) {
 761         int32_t bk = fBI->preceding(i);
 762         // printf("i:%d  bk:%d  nextBreak:%d\n", i, bk, nextBreak);
 763         if (bk == BreakIterator::DONE && i == 0) {
 764             continue;
 765         }
 766         if (bk == nextBreak && bk < i) {
 767             // i is in the gap between two breaks.
 768             continue;
 769         }
 770         if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) {
 771             // i indexes to a trailing surrogate.
 772             // Break Iterators treat an index to either half as referring to the supplemental code point,
 773             // with preceding going to some preceding code point.
 774             if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) {
 775                 MONKEY_ERROR("preceding of trailing surrogate error", i);
 776             }
 777             continue;
 778         }
 779         if (i == nextBreak && bk < nextBreak) {
 780             fTestData->fActualBreaks.setCharAt(bk, 1);
 781             nextBreak = bk;
 782             continue;
 783         }
 784         MONKEY_ERROR("preceding(i)", i);
 785         return;
 786     }
 787     checkResults("testPreceding", REVERSE, status);
 788 }
 789
 790
 791 void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) {
 792     if (U_FAILURE(status)) {
 793         return;
 794     }
 795     fTestData->clearActualBreaks();
 796     fBI->setText(fTestData->fString);
 797     for (int i=fTestData->fString.length(); i>=0; --i) {
 798         if (fBI->isBoundary(i)) {
 799             fTestData->fActualBreaks.setCharAt(i, 1);
 800         }
 801     }
 802     checkResults("testForwards", FORWARD, status);
 803 }
 804
 805 void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) {
 806     if (U_FAILURE(status)) {
 807         return;
 808     }
 809     if (direction == FORWARD) {
 810         for (int i=0; i<=fTestData->fString.length(); ++i) {
 811             if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
 812                 if (i > 1) {
 813                     IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
 814                         __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-2), fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
 815                         fRuleFileName, fTestData->fRandomSeed);
 816                 } else {
 817                     IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
 818                         __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
 819                         fRuleFileName, fTestData->fRandomSeed);
 820                 }
 821                 if (fVerbose) {
 822                     fTestData->dump(i);
 823                 }
 824                 status = U_INVALID_STATE_ERROR;   // Prevent the test from continuing, which would likely
 825                 break;                            // produce many redundant errors.
 826             }
 827         }
 828     } else {
 829         for (int i=fTestData->fString.length(); i>=0; i--) {
 830             if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
 831                 if (i > 1) {
 832                     IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
 833                         __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-2), fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
 834                         fRuleFileName, fTestData->fRandomSeed);
 835                 } else {
 836                     IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
 837                         __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
 838                         fRuleFileName, fTestData->fRandomSeed);
 839                 }
 840                 if (fVerbose) {
 841                     fTestData->dump(i);
 842                 }
 843                 status = U_INVALID_STATE_ERROR;
 844                 break;
 845             }
 846         }
 847     }
 848 }
 849
 850
 851
 852 //---------------------------------------------------------------------------------------
 853 //
 854 //   class RBBIMonkeyTest implementation.
 855 //
 856 //---------------------------------------------------------------------------------------
 857 RBBIMonkeyTest::RBBIMonkeyTest() {
 858 }
 859
 860 RBBIMonkeyTest::~RBBIMonkeyTest() {
 861 }
 862
 863
 864 //     params, taken from this->fParams.
 865 //       rules=file_name   Name of file containing the reference rules.
 866 //       seed=nnnnn        Random number starting seed.
 867 //                         Setting the seed allows errors to be reproduced.
 868 //       loop=nnn          Looping count.  Controls running time.
 869 //                         -1:  run forever.
 870 //                          0 or greater:  run length.
 871 //       expansions        debug option, show expansions of rules and sets.
 872 //       verbose           Display details of the failure.
 873 //
 874 //     Parameters on the intltest command line follow the test name, and are preceded by '@'.
 875 //     For example,
 876 //           intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
 877 //
 878 void RBBIMonkeyTest::testMonkey() {
 879     // printf("Test parameters: %s\n", fParams);
 880     UnicodeString params(fParams);
 881     UErrorCode status = U_ZERO_ERROR;
 882
 883     const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
 884                            "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
 885                            NULL };
 886     CharString testNameFromParams;
 887     if (getStringParam("rules", params, testNameFromParams, status)) {
 888         tests[0] = testNameFromParams.data();
 889         tests[1] = NULL;
 890     }
 891
 892     int64_t loopCount = quick? 100 : 5000;
 893     getIntParam("loop", params, loopCount, status);
 894
 895     UBool dumpExpansions = FALSE;
 896     getBoolParam("expansions", params, dumpExpansions, status);
 897
 898     UBool verbose = FALSE;
 899     getBoolParam("verbose", params, verbose, status);
 900
 901     int64_t seed = 0;
 902     getIntParam("seed", params, seed, status);
 903
 904     if (params.length() != 0) {
 905         // Options processing did not consume all of the parameters. Something unrecognized was present.
 906         CharString unrecognizedParameters;
 907         unrecognizedParameters.append(CStr(params)(), -1, status);
 908         errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data());
 909         return;
 910     }
 911
 912     UVector startedTests(status);
 913     if (U_FAILURE(status)) {
 914         errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status));
 915         return;
 916     }
 917
 918     // Monkey testing is multi-threaded.
 919     // Each set of break rules to be tested is run in a separate thread.
 920     // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
 921     int32_t i;
 922     for (i=0; tests[i] != NULL; ++i) {
 923         logln("beginning testing of %s", tests[i]);
 924         RBBIMonkeyImpl *test = new RBBIMonkeyImpl(status);
 925         test->fDumpExpansions = dumpExpansions;
 926         test->fVerbose = verbose;
 927         test->fRandomGenerator.seed((uint32_t)seed);
 928         test->fLoopCount = loopCount;
 929         test->setup(tests[i], status);
 930         test->startTest();
 931         startedTests.addElement(test, status);
 932         if (U_FAILURE(status)) {
 933             break;
 934         }
 935     }
 936
 937     if (U_FAILURE(status)) {
 938         dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
 939     }
 940
 941     for (i=0; i<startedTests.size(); ++i) {
 942         RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i));
 943         test->join();
 944         delete test;
 945     }
 946 }
 947
 948
 949 UBool  RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status) {
 950     name.append(" *= *(-?\\d+) *,? *");
 951     RegexMatcher m(name, params, 0, status);
 952     if (m.find()) {
 953         // The param exists.  Convert the string to an int.
 954         CharString str;
 955         str.append(CStr(m.group(1, status))(), -1, status);
 956         val = strtol(str.data(),  NULL, 10);
 957
 958         // Delete this parameter from the params string.
 959         m.reset();
 960         params = m.replaceFirst(UnicodeString(), status);
 961         return TRUE;
 962     }
 963     return FALSE;
 964 }
 965
 966 UBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status) {
 967     name.append(" *= *([^ ,]*) *,? *");
 968     RegexMatcher m(name, params, 0, status);
 969     if (m.find()) {
 970         // The param exists.
 971         dest.append(CStr(m.group(1, status))(), -1, status);
 972
 973         // Delete this parameter from the params string.
 974         m.reset();
 975         params = m.replaceFirst(UnicodeString(), status);
 976         return TRUE;
 977     }
 978     return FALSE;
 979 }
 980
 981 UBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status) {
 982     name.append("(?: *= *(true|false))? *,? *");
 983     RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status);
 984     if (m.find()) {
 985         if (m.start(1, status) > 0) {
 986             // user option included a value.
 987             dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0;
 988         } else {
 989             // No explicit user value, implies true.
 990             dest = TRUE;
 991         }
 992
 993         // Delete this parameter from the params string.
 994         m.reset();
 995         params = m.replaceFirst(UnicodeString(), status);
 996         return TRUE;
 997     }
 998     return FALSE;
 999 }
1000
1001 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */