icuSources/test/intltest/rbbimonkeytest.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /********************************************************************
   4  * Copyright (c) 2016, International Business Machines Corporation and
   5  * others. All Rights Reserved.
   6  ********************************************************************/
   7
   8
   9 #include "unicode/utypes.h"
  10
  11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
  12
  13 #include "rbbimonkeytest.h"
  14 #include "unicode/utypes.h"
  15 #include "unicode/brkiter.h"
  16 #include "unicode/utf16.h"
  17 #include "unicode/uniset.h"
  18 #include "unicode/unistr.h"
  19
  20 #include "charstr.h"
  21 #include "cmemory.h"
  22 #include "cstr.h"
  23 #include "uelement.h"
  24 #include "uhash.h"
  25 #include "cstring.h"
  26
  27 #include <iostream>
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <string>
  31
  32 using namespace icu;
  33
  34
// Test dispatch entry point for this test class.
// Saves the raw parameter string in fParams, then expands the TESTCASE_AUTO macros,
// which list testMonkey as the only test case of this class.
void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) {
    fParams = params;            // Work around TESTCASE_AUTO not being able to pass params to test function.

    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(testMonkey);
    TESTCASE_AUTO_END;
}
  42
  43 //---------------------------------------------------------------------------------------
  44 //
  45 //   class BreakRule implementation.
  46 //
  47 //---------------------------------------------------------------------------------------
  48
  49 BreakRule::BreakRule()      // :  all field default initialized.
  50 {
  51 }
  52
  53 BreakRule::~BreakRule() {}
  54
  55
  56 //---------------------------------------------------------------------------------------
  57 //
  58 //   class BreakRules implementation.
  59 //
  60 //---------------------------------------------------------------------------------------
  61 BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status)  :
  62         fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) {
  63     fCharClasses.adoptInstead(uhash_open(uhash_hashUnicodeString,
  64                                          uhash_compareUnicodeString,
  65                                          NULL,      // value comparator.
  66                                          &status));
  67     if (U_FAILURE(status)) {
  68         return;
  69     }
  70     uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
  71     uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
  72     fBreakRules.setDeleter(uprv_deleteUObject);
  73
  74     fCharClassList.adoptInstead(new UVector(status));
  75
  76     fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
  77              "(?!(?:\\{|=|\\[:)[ \\t]{0,4})"              // Negative look behind for '{' or '=' or '[:'
  78                                                           //   (the identifier is a unicode property name or value)
  79              "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"),     // The char class name
  80         0, status));
  81
  82     // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
  83     fCommentsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
  84                 "(^|(?<=;))"                    // Start either at start of line, or just after a ';' (look-behind for ';')
  85                 "[ \\t]*+"                      //   Match white space.
  86                 "(#.*)?+"                       //   Optional # plus whatever follows
  87                 "\\R$"                          //   new-line at end of line.
  88             ), 0, status));
  89
  90     // Match (initial parse) of a character class definition line.
  91     fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
  92                 "[ \\t]*"                                // leading white space
  93                 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"   // The char class name
  94                 "[ \\t]*=[ \\t]*"                        //   =
  95                 "(?<ClassDef>.*?)"                       // The char class UnicodeSet expression
  96                 "[ \\t]*;$"),                     // ; <end of line>
  97             0, status));
  98
  99     // Match (initial parse) of a break rule line.
 100     fRuleDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
 101                 "[ \\t]*"                                // leading white space
 102                 "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)"    // The rule name
 103                 "[ \\t]*:[ \\t]*"                        //   :
 104                 "(?<RuleDef>.*?)"                        // The rule definition
 105                 "[ \\t]*;$"),                            // ; <end of line>
 106             0, status));
 107
 108 }
 109
 110
 111 BreakRules::~BreakRules() {}
 112
 113
 114 CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
 115
 116     // Create the expanded definition for this char class,
 117     // replacing any set references with the corresponding definition.
 118
 119     UnicodeString expandedDef;
 120     UnicodeString emptyString;
 121     fSetRefsMatcher->reset(definition);
 122     while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
 123         const UnicodeString name =
 124                 fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
 125         CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
 126         const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
 127
 128         fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status);
 129         expandedDef.append(expansionForName);
 130     }
 131     fSetRefsMatcher->appendTail(expandedDef);
 132
 133     // Verify that the expanded set definition is valid.
 134
 135     if (fMonkeyImpl->fDumpExpansions) {
 136         printf("epandedDef: %s\n", CStr(expandedDef)());
 137     }
 138
 139     UnicodeSet *s = new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status);
 140     if (U_FAILURE(status)) {
 141         IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__, __LINE__,
 142                                u_errorName(status), CStr(name)());
 143         return NULL;
 144     }
 145     CharClass *cclass = new CharClass(name, definition, expandedDef, s);
 146     CharClass *previousClass = static_cast<CharClass *>(uhash_put(fCharClasses.getAlias(),
 147                                                         new UnicodeString(name),   // Key, owned by hash table.
 148                                                         cclass,                    // Value, owned by hash table.
 149                                                         &status));
 150
 151     if (previousClass != NULL) {
 152         // Duplicate class def.
 153         // These are legitimate, they are adjustments of an existing class.
 154         // TODO: will need to keep the old around when we handle tailorings.
 155         IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)());
 156         delete previousClass;
 157     }
 158     return cclass;
 159 }
 160
 161
 162 void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
 163     LocalPointer<BreakRule> thisRule(new BreakRule);
 164     thisRule->fName = name;
 165     thisRule->fRule = definition;
 166
 167     // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes,
 168     // This gives a numeric sort order that matches Unicode UAX rule numbering conventions.
 169     UnicodeString emptyString;
 170
 171     // Expand the char class definitions within the rule.
 172     fSetRefsMatcher->reset(definition);
 173     while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
 174         const UnicodeString name =
 175                 fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
 176         CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
 177         if (!nameClass) {
 178             IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
 179                 __FILE__, __LINE__, CStr(name)(), CStr(definition)());
 180         }
 181         const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
 182
 183         fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status);
 184         thisRule->fExpandedRule.append(expansionForName);
 185     }
 186     fSetRefsMatcher->appendTail(thisRule->fExpandedRule);
 187
 188     // If rule begins with a '^' rule chaining is disallowed.
 189     // Strip off the '^' from the rule expression, and set the flag.
 190     if (thisRule->fExpandedRule.charAt(0) == u'^') {
 191         thisRule->fInitialMatchOnly = true;
 192         thisRule->fExpandedRule.remove(0, 1);
 193         thisRule->fExpandedRule.trim();
 194     }
 195
 196     // Replace the divide sign (\u00f7) with a regular expression named capture.
 197     // When running the rules, a match that includes this group means we found a break position.
 198
 199     int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7);
 200     if (dividePos >= 0) {
 201         thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)"));
 202     }
 203     if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) {
 204         status = U_ILLEGAL_ARGUMENT_ERROR;   // TODO: produce a good error message.
 205     }
 206
 207     // UAX break rule set definitions can be empty, just [].
 208     // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
 209     // also matches nothing.
 210
 211     static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0};
 212     int32_t where = 0;
 213     while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) {
 214         thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
 215     }
 216     if (fMonkeyImpl->fDumpExpansions) {
 217         printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)());
 218     }
 219
 220     // Compile a regular expression for this rule.
 221     thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status));
 222     if (U_FAILURE(status)) {
 223         IntlTest::gTest->errln("%s:%d Error creating regular expression for %s",
 224                 __FILE__, __LINE__, CStr(thisRule->fExpandedRule)());
 225         return;
 226     }
 227
 228     // Put this new rule into the vector of all Rules.
 229     fBreakRules.addElement(thisRule.orphan(), status);
 230 }
 231
 232
 233 bool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) {
 234     if (keyword == UnicodeString("locale")) {
 235         CharString localeName;
 236         localeName.append(CStr(value)(), -1, status);
 237         fLocale = Locale::createFromName(localeName.data());
 238         return true;
 239     }
 240     if (keyword == UnicodeString("type")) {
 241         if (value == UnicodeString("grapheme")) {
 242             fType = UBRK_CHARACTER;
 243         } else if (value == UnicodeString("word")) {
 244             fType = UBRK_WORD;
 245         } else if (value == UnicodeString("line")) {
 246             fType = UBRK_LINE;
 247         } else if (value == UnicodeString("sentence")) {
 248             fType = UBRK_SENTENCE;
 249         } else {
 250             IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__,  CStr(value)());
 251         }
 252         return true;
 253     }
 254     // TODO: add tailoring base setting here.
 255     return false;
 256 }
 257
 258 RuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) {
 259     if (U_FAILURE(status)) {
 260         return NULL;
 261     }
 262     RuleBasedBreakIterator *bi = NULL;
 263     switch(fType) {
 264         case UBRK_CHARACTER:
 265             bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createCharacterInstance(fLocale, status));
 266             break;
 267         case UBRK_WORD:
 268             bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createWordInstance(fLocale, status));
 269             break;
 270         case UBRK_LINE:
 271             bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(fLocale, status));
 272             break;
 273         case UBRK_SENTENCE:
 274             bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createSentenceInstance(fLocale, status));
 275             break;
 276         default:
 277             IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType);
 278             status = U_ILLEGAL_ARGUMENT_ERROR;
 279     }
 280     return bi;
 281 }
 282
 283
 284 void BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) {
 285     if (U_FAILURE(status)) {
 286         return;
 287     }
 288
 289     UnicodeString emptyString;
 290     for (int32_t lineNumber=0; ;lineNumber++) {    // Loop once per input line.
 291         if (U_FAILURE(status)) {
 292             return;
 293         }
 294         int32_t lineLength = 0;
 295         const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status);
 296         if (lineBuf == NULL) {
 297             break;
 298         }
 299         UnicodeString line(lineBuf, lineLength);
 300
 301         // Strip comment lines.
 302         fCommentsMatcher->reset(line);
 303         line = fCommentsMatcher->replaceFirst(emptyString, status);
 304         if (line.isEmpty()) {
 305             continue;
 306         }
 307
 308         // Recognize character class definition and keyword lines
 309         fClassDefMatcher->reset(line);
 310         if (fClassDefMatcher->matches(status)) {
 311             UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status);
 312             UnicodeString classDef  = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status);
 313             if (fMonkeyImpl->fDumpExpansions) {
 314                 printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)());
 315             }
 316             if (setKeywordParameter(className, classDef, status)) {
 317                 // The scanned item was "type = ..." or "locale = ...", etc.
 318                 //   which are not actual character classes.
 319                 continue;
 320             }
 321             addCharClass(className, classDef, status);
 322             continue;
 323         }
 324
 325         // Recognize rule lines.
 326         fRuleDefMatcher->reset(line);
 327         if (fRuleDefMatcher->matches(status)) {
 328             UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status);
 329             UnicodeString ruleDef  = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status);
 330             if (fMonkeyImpl->fDumpExpansions) {
 331                 printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)());
 332             }
 333             addRule(ruleName, ruleDef, status);
 334             continue;
 335         }
 336
 337         IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
 338             __FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)());
 339     }
 340
 341     // Build the vector of char classes, omitting the dictionary class if there is one.
 342     // This will be used when constructing the random text to be tested.
 343
 344     // Also compute the "other" set, consisting of any characters not included in
 345     // one or more of the user defined sets.
 346
 347     UnicodeSet otherSet((UChar32)0, 0x10ffff);
 348     int32_t pos = UHASH_FIRST;
 349     const UHashElement *el = NULL;
 350     while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) {
 351         const UnicodeString *ccName = static_cast<const UnicodeString *>(el->key.pointer);
 352         CharClass *cclass = static_cast<CharClass *>(el->value.pointer);
 353         // printf("    Adding %s\n", CStr(*ccName)());
 354         if (*ccName != cclass->fName) {
 355             IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
 356                     __FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)());
 357         }
 358         const UnicodeSet *set = cclass->fSet.getAlias();
 359         otherSet.removeAll(*set);
 360         if (*ccName == UnicodeString("dictionary")) {
 361             fDictionarySet = *set;
 362         } else {
 363             fCharClassList->addElement(cclass, status);
 364         }
 365     }
 366
 367     if (!otherSet.isEmpty()) {
 368         // fprintf(stderr, "have an other set.\n");
 369         UnicodeString pattern;
 370         CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status);
 371         fCharClassList->addElement(cclass, status);
 372     }
 373 }
 374
 375
 376 const CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const {
 377    int32_t localIter = 0;
 378    int32_t &it = iter? *iter : localIter;
 379
 380    while (it < fCharClassList->size()) {
 381        const CharClass *cc = static_cast<const CharClass *>(fCharClassList->elementAt(it));
 382        ++it;
 383        if (cc->fSet->contains(c)) {
 384            return cc;
 385        }
 386     }
 387     return NULL;
 388 }
 389
 390 //---------------------------------------------------------------------------------------
 391 //
 392 //   class MonkeyTestData implementation.
 393 //
 394 //---------------------------------------------------------------------------------------
 395
 396 void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) {
 397     const int32_t dataLength = 1000;
 398
 399     // Fill the test string with random characters.
 400     // First randomly pick a char class, then randomly pick a character from that class.
 401     // Exclude any characters from the dictionary set.
 402
 403     // std::cout << "Populating Test Data" << std::endl;
 404     fRandomSeed = rand.getSeed();         // Save initial seed for use in error messages,
 405                                           // allowing recreation of failing data.
 406     fBkRules = rules;
 407     fString.remove();
 408     for (int32_t n=0; n<dataLength;) {
 409         int charClassIndex = rand() % rules->fCharClassList->size();
 410         const CharClass *cclass = static_cast<CharClass *>(rules->fCharClassList->elementAt(charClassIndex));
 411         if (cclass->fSet->size() == 0) {
 412             // Some rules or tailorings do end up with empty char classes.
 413             continue;
 414         }
 415         int32_t charIndex = rand() % cclass->fSet->size();
 416         UChar32 c = cclass->fSet->charAt(charIndex);
 417         if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) {
 418             // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
 419             // Don't let random unpaired surrogates combine in the test data because they might
 420             // produce an unwanted dictionary character.
 421             continue;
 422         }
 423
 424         if (!rules->fDictionarySet.contains(c)) {
 425             fString.append(c);
 426             ++n;
 427         }
 428     }
 429
 430     // Reset each rule matcher regex with this new string.
 431     //    (Although we are always using the same string object, ICU regular expressions
 432     //    don't like the underlying string data changing without doing a reset).
 433
 434     for (int32_t ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
 435         BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
 436             rule->fRuleMatcher->reset(fString);
 437     }
 438
 439     // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
 440     // Expected and Actual breaks are one longer than the input string; a non-zero value
 441     // will indicate a boundary preceding that position.
 442
 443     clearActualBreaks();
 444     fExpectedBreaks  = fActualBreaks;
 445     fRuleForPosition = fActualBreaks;
 446     f2ndRuleForPos   = fActualBreaks;
 447
 448     // Apply reference rules to find the expected breaks.
 449
 450     fExpectedBreaks.setCharAt(0, (UChar)1);  // Force an expected break before the start of the text.
 451                                              // ICU always reports a break there.
 452                                              // The reference rules do not have a means to do so.
 453     int32_t strIdx = 0;
 454     bool    initialMatch = true;             // True at start of text, and immediately after each boundary,
 455                                              // for control over rule chaining.
 456     while (strIdx < fString.length()) {
 457         BreakRule *matchingRule = NULL;
 458         UBool      hasBreak = FALSE;
 459         int32_t ruleNum = 0;
 460         int32_t matchStart = 0;
 461         int32_t matchEnd = 0;
 462         int32_t breakGroup = 0;
 463         for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
 464             BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
 465             if (rule->fInitialMatchOnly && !initialMatch) {
 466                 // Skip checking this '^' rule. (No rule chaining)
 467                 continue;
 468             }
 469             rule->fRuleMatcher->reset();
 470             if (rule->fRuleMatcher->lookingAt(strIdx, status)) {
 471                 // A candidate rule match, check further to see if we take it or continue to check other rules.
 472                 // Matches of zero or one codepoint count only if they also specify a break.
 473                 matchStart = rule->fRuleMatcher->start(status);
 474                 matchEnd = rule->fRuleMatcher->end(status);
 475                 breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status);
 476                 hasBreak = U_SUCCESS(status);
 477                 if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) {
 478                     status = U_ZERO_ERROR;
 479                 }
 480                 if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) {
 481                     matchingRule = rule;
 482                     break;
 483                 }
 484             }
 485         }
 486         if (matchingRule == NULL) {
 487             // No reference rule matched. This is an error in the rules that should never happen.
 488             IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
 489                  __FILE__, __LINE__, strIdx);
 490             dump(strIdx);
 491             status = U_INVALID_FORMAT_ERROR;
 492             return;
 493         }
 494         if (matchingRule->fRuleMatcher->group(status).length() == 0) {
 495             // Zero length rule match. This is also an error in the rule expressions.
 496             IntlTest::gTest->errln("%s:%d Zero length rule match.",
 497                 __FILE__, __LINE__);
 498             status =  U_INVALID_FORMAT_ERROR;
 499             return;
 500         }
 501
 502         // Record which rule matched over the length of the match.
 503         for (int i = matchStart; i < matchEnd; i++) {
 504             if (fRuleForPosition.charAt(i) == 0) {
 505                 fRuleForPosition.setCharAt(i, (UChar)ruleNum);
 506             } else {
 507                 f2ndRuleForPos.setCharAt(i, (UChar)ruleNum);
 508             }
 509         }
 510
 511         // Break positions appear in rules as a matching named capture of zero length at the break position,
 512         //   the adjusted pattern contains (?<BreakPosition>)
 513         if (hasBreak) {
 514             int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status);
 515             if (U_FAILURE(status) || breakPos < 0) {
 516                 // Rule specified a break, but that break wasn't part of the match, even
 517                 // though the rule as a whole matched.
 518                 // Can't happen with regular expressions derived from (equivalent to) ICU break rules.
 519                 // Shouldn't get here.
 520                 IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__);
 521                 status =  U_INVALID_FORMAT_ERROR;
 522                 break;
 523             }
 524             fExpectedBreaks.setCharAt(breakPos, (UChar)1);
 525             // printf("recording break at %d\n", breakPos);
 526             // For the next iteration, pick up applying rules immediately after the break,
 527             // which may differ from end of the match. The matching rule may have included
 528             // context following the boundary that needs to be looked at again.
 529             strIdx = matchingRule->fRuleMatcher->end(breakGroup, status);
 530             initialMatch = true;
 531         } else {
 532             // Original rule didn't specify a break.
 533             // Continue applying rules starting on the last code point of this match.
 534             strIdx = fString.moveIndex32(matchEnd, -1);
 535             initialMatch = false;
 536             if (strIdx == matchStart) {
 537                 // Match was only one code point, no progress if we continue.
 538                 // Shouldn't get here, case is filtered out at top of loop.
 539                 CharString ruleName;
 540                 ruleName.appendInvariantChars(matchingRule->fName, status);
 541                 IntlTest::gTest->errln("%s:%d Rule %s internal error",
 542                         __FILE__, __LINE__, ruleName.data());
 543                 status = U_INVALID_FORMAT_ERROR;
 544                 break;
 545             }
 546         }
 547         if (U_FAILURE(status)) {
 548             IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
 549                 __FILE__, __LINE__, u_errorName(status));
 550             break;
 551         }
 552     }
 553 }
 554
 555 void MonkeyTestData::clearActualBreaks() {
 556     fActualBreaks.remove();
 557     // Actual Breaks length is one longer than the data string length, allowing
 558     //    for breaks before the first and after the last character in the data.
 559     for (int32_t i=0; i<=fString.length(); i++) {
 560         fActualBreaks.append((UChar)0);
 561     }
 562 }
 563
 564 void MonkeyTestData::dump(int32_t around) const {
 565     printf("\n"
 566            "         char                        break  Rule                     Character\n"
 567            "   pos   code   class                 R I   name                     name\n"
 568            "---------------------------------------------------------------------------------------------\n");
 569
 570     int32_t start;
 571     int32_t end;
 572
 573     if (around == -1) {
 574         start = 0;
 575         end = fString.length();
 576     } else {
 577         // Display context around a failure.
 578         start = fString.moveIndex32(around, -30);
 579         end = fString.moveIndex32(around, +30);
 580     }
 581
 582     for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) {
 583         UErrorCode status = U_ZERO_ERROR;
 584         UChar32 c = fString.char32At(charIdx);
 585         const CharClass *cc = fBkRules->getClassForChar(c);
 586         CharString ccName;
 587         ccName.appendInvariantChars(cc->fName, status);
 588         CharString ruleName, secondRuleName;
 589         const BreakRule *rule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx)));
 590         ruleName.appendInvariantChars(rule->fName, status);
 591         if (f2ndRuleForPos.charAt(charIdx) > 0) {
 592             const BreakRule *secondRule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx)));
 593             secondRuleName.appendInvariantChars(secondRule->fName, status);
 594         }
 595         char cName[200];
 596         u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
 597
 598         printf("  %4.1d %6.4x   %-20s  %c %c   %-10s %-10s    %s\n",
 599             charIdx, c, ccName.data(),
 600             fExpectedBreaks.charAt(charIdx) ? '*' : '.',
 601             fActualBreaks.charAt(charIdx) ? '*' : '.',
 602             ruleName.data(), secondRuleName.data(), cName
 603         );
 604     }
 605 }
 606
 607
 608 //---------------------------------------------------------------------------------------
 609 //
 610 //   class RBBIMonkeyImpl
 611 //
 612 //---------------------------------------------------------------------------------------
 613
 614 RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status) : fDumpExpansions(FALSE), fThread(this) {
 615     (void)status;    // suppress unused parameter compiler warning.
 616 }
 617
 618
 619 // RBBIMonkeyImpl setup       does all of the setup for a single rule set - compiling the
 620 //                            reference rules and creating the icu breakiterator to test,
 621 //                            with its type and locale coming from the reference rules.
 622
 623 void RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) {
 624     fRuleFileName = ruleFile;
 625     openBreakRules(ruleFile, status);
 626     if (U_FAILURE(status)) {
 627         IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
 628         return;
 629     }
 630     fRuleSet.adoptInstead(new BreakRules(this, status));
 631     fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status);
 632     if (U_FAILURE(status)) {
 633         IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
 634         return;
 635     }
 636     fBI.adoptInstead(fRuleSet->createICUBreakIterator(status));
 637     fTestData.adoptInstead(new MonkeyTestData());
 638 }
 639
 640
 641 RBBIMonkeyImpl::~RBBIMonkeyImpl() {
 642 }
 643
 644
 645 void RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) {
 646     CharString path;
 647     path.append(IntlTest::getSourceTestData(status), status);
 648     path.append("break_rules" U_FILE_SEP_STRING, status);
 649     path.appendPathPart(fileName, status);
 650     const char *codePage = "UTF-8";
 651     fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status));
 652 }
 653
 654
// Starts the test on this instance's worker thread object (fThread).
void RBBIMonkeyImpl::startTest() {
    fThread.start();   // invokes runTest() in a separate thread.
}
 658
// Joins this instance's worker thread object (fThread).
// NOTE(review): presumably blocks until the thread started by startTest() has finished - confirm.
void RBBIMonkeyImpl::join() {
    fThread.join();
}
 662
 663
// Reports a test failure found at string position 'index'.
// The message includes the source location, msg, the index, and the rule file name and
// random seed as parameters for reproducing the failure. When fVerbose is set, the test
// data around the index is dumped as well. Finally 'status' is set to U_INVALID_STATE_ERROR.
//
// For use inside RBBIMonkeyImpl member functions only: the expansion refers to a variable
// named 'status' and to the members fRuleFileName, fTestData and fVerbose.
#define MONKEY_ERROR(msg, index) { \
    IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
                    __FILE__, __LINE__, msg, index, fRuleFileName, fTestData->fRandomSeed); \
    if (fVerbose) { fTestData->dump(index); } \
    status = U_INVALID_STATE_ERROR;  \
}
 670
 671 void RBBIMonkeyImpl::runTest() {
 672     UErrorCode status = U_ZERO_ERROR;
 673     int32_t errorCount = 0;
 674     for (int64_t loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
 675         status = U_ZERO_ERROR;
 676         fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status);
 677         if (fBI.isNull()) {
 678             IntlTest::gTest->dataerrln("Unable to run test because fBI is null.");
 679             return;
 680         }
 681         if ( uprv_strcmp(fRuleFileName,"line_loose_cj.txt") == 0 && fTestData->fRandomSeed==1712915859 ) {
 682             continue; // known bug around index 103-104, break expected/actual 0/1, fwd 0020 200D | FDFC, rev 1325A 0020 | 200D
 683         }
 684         // fTestData->dump();
 685         testForwards(status);
 686         testPrevious(status);
 687         testFollowing(status);
 688         testPreceding(status);
 689         testIsBoundary(status);
 690         testIsBoundaryRandom(status);
 691
 692         if (fLoopCount < 0 && loopCount % 100 == 0) {
 693             fprintf(stderr, ".");
 694         }
 695         if (U_FAILURE(status)) {
 696             if (++errorCount > 10) {
 697                 return;
 698             }
 699         }
 700     }
 701 }
 702
 703 void RBBIMonkeyImpl::testForwards(UErrorCode &status) {
 704     if (U_FAILURE(status)) {
 705         return;
 706     }
 707     fTestData->clearActualBreaks();
 708     fBI->setText(fTestData->fString);
 709     int32_t previousBreak = -2;
 710     for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) {
 711         if (bk <= previousBreak) {
 712             MONKEY_ERROR("Break Iterator Stall", bk);
 713             return;
 714         }
 715         if (bk < 0 || bk > fTestData->fString.length()) {
 716             MONKEY_ERROR("Boundary out of bounds", bk);
 717             return;
 718         }
 719         fTestData->fActualBreaks.setCharAt(bk, 1);
 720     }
 721     checkResults("testForwards", FORWARD, status);
 722 }
 723
 724 void RBBIMonkeyImpl::testFollowing(UErrorCode &status) {
 725     if (U_FAILURE(status)) {
 726         return;
 727     }
 728     fTestData->clearActualBreaks();
 729     fBI->setText(fTestData->fString);
 730     int32_t nextBreak = -1;
 731     for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) {
 732         int32_t bk = fBI->following(i);
 733         if (bk == BreakIterator::DONE && i == fTestData->fString.length()) {
 734             continue;
 735         }
 736         if (bk == nextBreak && bk > i) {
 737             // i is in the gap between two breaks.
 738             continue;
 739         }
 740         if (i == nextBreak && bk > nextBreak) {
 741             fTestData->fActualBreaks.setCharAt(bk, 1);
 742             nextBreak = bk;
 743             continue;
 744         }
 745         MONKEY_ERROR("following(i)", i);
 746         return;
 747     }
 748     checkResults("testFollowing", FORWARD, status);
 749 }
 750
 751
 752
 753 void RBBIMonkeyImpl::testPrevious(UErrorCode &status) {
 754     if (U_FAILURE(status)) {return;}
 755
 756     fTestData->clearActualBreaks();
 757     fBI->setText(fTestData->fString);
 758     int32_t previousBreak = INT32_MAX;
 759     for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) {
 760          if (bk >= previousBreak) {
 761             MONKEY_ERROR("Break Iterator Stall", bk);
 762             return;
 763         }
 764         if (bk < 0 || bk > fTestData->fString.length()) {
 765             MONKEY_ERROR("Boundary out of bounds", bk);
 766             return;
 767         }
 768         fTestData->fActualBreaks.setCharAt(bk, 1);
 769     }
 770     checkResults("testPrevious", REVERSE, status);
 771 }
 772
 773
 774 void RBBIMonkeyImpl::testPreceding(UErrorCode &status) {
 775     if (U_FAILURE(status)) {
 776         return;
 777     }
 778     fTestData->clearActualBreaks();
 779     fBI->setText(fTestData->fString);
 780     int32_t nextBreak = fTestData->fString.length()+1;
 781     for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) {
 782         int32_t bk = fBI->preceding(i);
 783         // printf("i:%d  bk:%d  nextBreak:%d\n", i, bk, nextBreak);
 784         if (bk == BreakIterator::DONE && i == 0) {
 785             continue;
 786         }
 787         if (bk == nextBreak && bk < i) {
 788             // i is in the gap between two breaks.
 789             continue;
 790         }
 791         if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) {
 792             // i indexes to a trailing surrogate.
 793             // Break Iterators treat an index to either half as referring to the supplemental code point,
 794             // with preceding going to some preceding code point.
 795             if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) {
 796                 MONKEY_ERROR("preceding of trailing surrogate error", i);
 797             }
 798             continue;
 799         }
 800         if (i == nextBreak && bk < nextBreak) {
 801             fTestData->fActualBreaks.setCharAt(bk, 1);
 802             nextBreak = bk;
 803             continue;
 804         }
 805         MONKEY_ERROR("preceding(i)", i);
 806         return;
 807     }
 808     checkResults("testPreceding", REVERSE, status);
 809 }
 810
 811
 812 void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) {
 813     if (U_FAILURE(status)) {
 814         return;
 815     }
 816     fTestData->clearActualBreaks();
 817     fBI->setText(fTestData->fString);
 818     for (int i=fTestData->fString.length(); i>=0; --i) {
 819         if (fBI->isBoundary(i)) {
 820             fTestData->fActualBreaks.setCharAt(i, 1);
 821         }
 822     }
 823     checkResults("testForwards", FORWARD, status);
 824 }
 825
 826 void RBBIMonkeyImpl::testIsBoundaryRandom(UErrorCode &status) {
 827     if (U_FAILURE(status)) {
 828         return;
 829     }
 830     fBI->setText(fTestData->fString);
 831
 832     int stringLen = fTestData->fString.length();
 833     for (int i=stringLen; i>=0; --i) {
 834         int strIdx = fRandomGenerator() % stringLen;
 835         if (fTestData->fExpectedBreaks.charAt(strIdx) != fBI->isBoundary(strIdx)) {
 836             IntlTest::gTest->errln("%s:%d testIsBoundaryRandom failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
 837                     __FILE__, __LINE__, strIdx, fRuleFileName, fTestData->fRandomSeed);
 838             if (fVerbose) {
 839                 fTestData->dump(i);
 840             }
 841             status = U_INVALID_STATE_ERROR;
 842             break;
 843         }
 844     }
 845 }
 846
 847
 848
 849 void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) {
 850     if (U_FAILURE(status)) {
 851         return;
 852     }
 853     if (direction == FORWARD) {
 854         for (int i=0; i<=fTestData->fString.length(); ++i) {
 855             if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
 856                 if (i > 1) {
 857                     IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
 858                         __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-2), fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
 859                         fRuleFileName, fTestData->fRandomSeed);
 860                 } else {
 861                     IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
 862                         __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
 863                         fRuleFileName, fTestData->fRandomSeed);
 864                 }
 865                 if (fVerbose) {
 866                     fTestData->dump(i);
 867                 }
 868                 status = U_INVALID_STATE_ERROR;   // Prevent the test from continuing, which would likely
 869                 break;                            // produce many redundant errors.
 870             }
 871         }
 872     } else {
 873         for (int i=fTestData->fString.length(); i>=0; i--) {
 874             if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
 875                 if (i > 1) {
 876                     IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
 877                         __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-2), fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
 878                         fRuleFileName, fTestData->fRandomSeed);
 879                 } else {
 880                     IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
 881                         __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
 882                         fRuleFileName, fTestData->fRandomSeed);
 883                 }
 884                 if (fVerbose) {
 885                     fTestData->dump(i);
 886                 }
 887                 status = U_INVALID_STATE_ERROR;
 888                 break;
 889             }
 890         }
 891     }
 892 }
 893
 894
 895
 896 //---------------------------------------------------------------------------------------
 897 //
 898 //   class RBBIMonkeyTest implementation.
 899 //
 900 //---------------------------------------------------------------------------------------
 901 RBBIMonkeyTest::RBBIMonkeyTest() {
 902 }
 903
 904 RBBIMonkeyTest::~RBBIMonkeyTest() {
 905 }
 906
 907
 908 //     params, taken from this->fParams.
 909 //       rules=file_name   Name of file containing the reference rules.
 910 //       seed=nnnnn        Random number starting seed.
 911 //                         Setting the seed allows errors to be reproduced.
 912 //       loop=nnn          Looping count.  Controls running time.
 913 //                         -1:  run forever.
 914 //                          0 or greater:  run length.
 915 //       expansions        debug option, show expansions of rules and sets.
 916 //       verbose           Display details of the failure.
 917 //
 918 //     Parameters on the intltest command line follow the test name, and are preceded by '@'.
 919 //     For example,
 920 //           intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
 921 //
 922 void RBBIMonkeyTest::testMonkey() {
 923     // printf("Test parameters: %s\n", fParams);
 924     UnicodeString params(fParams);
 925     UErrorCode status = U_ZERO_ERROR;
 926
 927     const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "line_cj.txt", "sentence.txt", "line_normal.txt",
 928                            "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
 929                            NULL };
 930     CharString testNameFromParams;
 931     if (getStringParam("rules", params, testNameFromParams, status)) {
 932         tests[0] = testNameFromParams.data();
 933         tests[1] = NULL;
 934     }
 935
 936     int64_t loopCount = quick? 100 : 5000;
 937     getIntParam("loop", params, loopCount, status);
 938
 939     UBool dumpExpansions = FALSE;
 940     getBoolParam("expansions", params, dumpExpansions, status);
 941
 942     UBool verbose = FALSE;
 943     getBoolParam("verbose", params, verbose, status);
 944
 945     int64_t seed = 0;
 946     getIntParam("seed", params, seed, status);
 947
 948     if (params.length() != 0) {
 949         // Options processing did not consume all of the parameters. Something unrecognized was present.
 950         CharString unrecognizedParameters;
 951         unrecognizedParameters.append(CStr(params)(), -1, status);
 952         errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data());
 953         return;
 954     }
 955
 956     UVector startedTests(status);
 957     if (U_FAILURE(status)) {
 958         errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status));
 959         return;
 960     }
 961
 962     // Monkey testing is multi-threaded.
 963     // Each set of break rules to be tested is run in a separate thread.
 964     // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
 965     int32_t i;
 966     for (i=0; tests[i] != NULL; ++i) {
 967         logln("beginning testing of %s", tests[i]);
 968         LocalPointer<RBBIMonkeyImpl> test(new RBBIMonkeyImpl(status));
 969         if (U_FAILURE(status)) {
 970             dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
 971             break;
 972         }
 973         test->fDumpExpansions = dumpExpansions;
 974         test->fVerbose = verbose;
 975         test->fRandomGenerator.seed(static_cast<uint32_t>(seed));
 976         test->fLoopCount = static_cast<int32_t>(loopCount);
 977         test->setup(tests[i], status);
 978         if (U_FAILURE(status)) {
 979             dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
 980             break;
 981         }
 982         test->startTest();
 983         startedTests.addElement(test.orphan(), status);
 984         if (U_FAILURE(status)) {
 985             errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
 986             break;
 987         }
 988     }
 989
 990     for (i=0; i<startedTests.size(); ++i) {
 991         RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i));
 992         test->join();
 993         delete test;
 994     }
 995 }
 996
 997
 998 UBool  RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status) {
 999     name.append(" *= *(-?\\d+) *,? *");
1000     RegexMatcher m(name, params, 0, status);
1001     if (m.find()) {
1002         // The param exists.  Convert the string to an int.
1003         CharString str;
1004         str.append(CStr(m.group(1, status))(), -1, status);
1005         val = strtol(str.data(),  NULL, 10);
1006
1007         // Delete this parameter from the params string.
1008         m.reset();
1009         params = m.replaceFirst(UnicodeString(), status);
1010         return TRUE;
1011     }
1012     return FALSE;
1013 }
1014
1015 UBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status) {
1016     name.append(" *= *([^ ,]*) *,? *");
1017     RegexMatcher m(name, params, 0, status);
1018     if (m.find()) {
1019         // The param exists.
1020         dest.append(CStr(m.group(1, status))(), -1, status);
1021
1022         // Delete this parameter from the params string.
1023         m.reset();
1024         params = m.replaceFirst(UnicodeString(), status);
1025         return TRUE;
1026     }
1027     return FALSE;
1028 }
1029
1030 UBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status) {
1031     name.append("(?: *= *(true|false))? *,? *");
1032     RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status);
1033     if (m.find()) {
1034         if (m.start(1, status) > 0) {
1035             // user option included a value.
1036             dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0;
1037         } else {
1038             // No explicit user value, implies true.
1039             dest = TRUE;
1040         }
1041
1042         // Delete this parameter from the params string.
1043         m.reset();
1044         params = m.replaceFirst(UnicodeString(), status);
1045         return TRUE;
1046     }
1047     return FALSE;
1048 }
1049
1050 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */