]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/rbbimonkeytest.cpp
ICU-57165.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbimonkeytest.cpp
1 /********************************************************************
2 * Copyright (c) 2016, International Business Machines Corporation and
3 * others. All Rights Reserved.
4 ********************************************************************/
5
6
7 #include "unicode/utypes.h"
8
9 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
10
11 #include "rbbimonkeytest.h"
12 #include "unicode/utypes.h"
13 #include "unicode/brkiter.h"
14 #include "unicode/utf16.h"
15 #include "unicode/uniset.h"
16 #include "unicode/unistr.h"
17
18 #include "charstr.h"
19 #include "cmemory.h"
20 #include "cstr.h"
21 #include "uelement.h"
22 #include "uhash.h"
23 #include "cstring.h"
24
25 #include "iostream"
26 #include "string"
27
28 using namespace icu;
29
30
31 void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) {
32 fParams = params; // Work around TESTCASE_AUTO not being able to pass params to test function.
33
34 TESTCASE_AUTO_BEGIN;
35 TESTCASE_AUTO(testMonkey);
36 TESTCASE_AUTO_END;
37 }
38
39 //---------------------------------------------------------------------------------------
40 //
41 // class BreakRule implementation.
42 //
43 //---------------------------------------------------------------------------------------
44
45 BreakRule::BreakRule() // : all field default initialized.
46 {
47 }
48
49 BreakRule::~BreakRule() {}
50
51
52 //---------------------------------------------------------------------------------------
53 //
54 // class BreakRules implementation.
55 //
56 //---------------------------------------------------------------------------------------
57 BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status) :
58 fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) {
59 fCharClasses.adoptInstead(uhash_open(uhash_hashUnicodeString,
60 uhash_compareUnicodeString,
61 NULL, // value comparator.
62 &status));
63 if (U_FAILURE(status)) {
64 return;
65 }
66 uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
67 uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
68 fBreakRules.setDeleter(uprv_deleteUObject);
69
70 fCharClassList.adoptInstead(new UVector(status));
71
72 fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
73 "(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative lookbehind for '{' or '=' or '[:'
74 // (the identifier is a unicode property name or value)
75 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name
76 0, status));
77
78 // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
79 fCommentsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
80 "(^|(?<=;))" // Start either at start of line, or just after a ';' (look-behind for ';')
81 "[ \\t]*+" // Match white space.
82 "(#.*)?+" // Optional # plus whatever follows
83 "\\R$" // new-line at end of line.
84 ), 0, status));
85
86 // Match (initial parse) of a character class defintion line.
87 fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
88 "[ \\t]*" // leading white space
89 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name
90 "[ \\t]*=[ \\t]*" // =
91 "(?<ClassDef>.*?)" // The char class UnicodeSet expression
92 "[ \\t]*;$"), // ; <end of line>
93 0, status));
94
95 // Match (initial parse) of a break rule line.
96 fRuleDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
97 "[ \\t]*" // leading white space
98 "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)" // The rule name
99 "[ \\t]*:[ \\t]*" // :
100 "(?<RuleDef>.*?)" // The rule definition
101 "[ \\t]*;$"), // ; <end of line>
102 0, status));
103
104 }
105
106
107 BreakRules::~BreakRules() {}
108
109
110 CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
111
112 // Create the expanded definition for this char class,
113 // replacing any set references with the corresponding definition.
114
115 UnicodeString expandedDef;
116 UnicodeString emptyString;
117 fSetRefsMatcher->reset(definition);
118 while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
119 const UnicodeString name =
120 fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
121 CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
122 const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
123
124 fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status);
125 expandedDef.append(expansionForName);
126 }
127 fSetRefsMatcher->appendTail(expandedDef);
128
129 // Verify that the expanded set defintion is valid.
130
131 if (fMonkeyImpl->fDumpExpansions) {
132 printf("epandedDef: %s\n", CStr(expandedDef)());
133 }
134
135 UnicodeSet *s = new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status);
136 if (U_FAILURE(status)) {
137 IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__, __LINE__,
138 u_errorName(status), CStr(name)());
139 return NULL;
140 }
141 CharClass *cclass = new CharClass(name, definition, expandedDef, s);
142 CharClass *previousClass = static_cast<CharClass *>(uhash_put(fCharClasses.getAlias(),
143 new UnicodeString(name), // Key, owned by hash table.
144 cclass, // Value, owned by hash table.
145 &status));
146
147 if (previousClass != NULL) {
148 // Duplicate class def.
149 // These are legitimate, they are adustments of an existing class.
150 // TODO: will need to keep the old around when we handle tailorings.
151 IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)());
152 delete previousClass;
153 }
154 return cclass;
155 }
156
157
158 void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
159 LocalPointer<BreakRule> thisRule(new BreakRule);
160 thisRule->fName = name;
161 thisRule->fRule = definition;
162
163 // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes,
164 // This gives a numeric sort order that matches Unicode UAX rule numbering conventions.
165 UnicodeString emptyString;
166
167 // Expand the char class definitions within the rule.
168 fSetRefsMatcher->reset(definition);
169 while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
170 const UnicodeString name =
171 fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
172 CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
173 if (!nameClass) {
174 IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
175 __FILE__, __LINE__, CStr(name)(), CStr(definition)());
176 }
177 const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
178
179 fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status);
180 thisRule->fExpandedRule.append(expansionForName);
181 }
182 fSetRefsMatcher->appendTail(thisRule->fExpandedRule);
183
184 // Replace the divide sign (\u00f7) with a regular expression named capture.
185 // When running the rules, a match that includes this group means we found a break position.
186
187 int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7);
188 if (dividePos >= 0) {
189 thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)"));
190 }
191 if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) {
192 status = U_ILLEGAL_ARGUMENT_ERROR; // TODO: produce a good error message.
193 }
194
195 // UAX break rule set definitions can be empty, just [].
196 // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
197 // also matches nothing.
198
199 static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0};
200 int32_t where = 0;
201 while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) {
202 thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
203 }
204 if (fMonkeyImpl->fDumpExpansions) {
205 printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)());
206 }
207
208 // Compile a regular expression for this rule.
209 thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status));
210 if (U_FAILURE(status)) {
211 IntlTest::gTest->errln("%s:%d Error creating regular expression for %s",
212 __FILE__, __LINE__, CStr(thisRule->fExpandedRule)());
213 return;
214 }
215
216 // Put this new rule into the vector of all Rules.
217 fBreakRules.addElement(thisRule.orphan(), status);
218 }
219
220
221 bool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) {
222 if (keyword == UnicodeString("locale")) {
223 CharString localeName;
224 localeName.append(CStr(value)(), -1, status);
225 fLocale = Locale::createFromName(localeName.data());
226 return true;
227 }
228 if (keyword == UnicodeString("type")) {
229 if (value == UnicodeString("grapheme")) {
230 fType = UBRK_CHARACTER;
231 } else if (value == UnicodeString("word")) {
232 fType = UBRK_WORD;
233 } else if (value == UnicodeString("line")) {
234 fType = UBRK_LINE;
235 } else if (value == UnicodeString("sentence")) {
236 fType = UBRK_SENTENCE;
237 } else {
238 IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__, CStr(value)());
239 }
240 return true;
241 }
242 // TODO: add tailoring base setting here.
243 return false;
244 }
245
246 RuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) {
247 if (U_FAILURE(status)) {
248 return NULL;
249 }
250 RuleBasedBreakIterator *bi = NULL;
251 switch(fType) {
252 case UBRK_CHARACTER:
253 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createCharacterInstance(fLocale, status));
254 break;
255 case UBRK_WORD:
256 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createWordInstance(fLocale, status));
257 break;
258 case UBRK_LINE:
259 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(fLocale, status));
260 break;
261 case UBRK_SENTENCE:
262 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createSentenceInstance(fLocale, status));
263 break;
264 default:
265 IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType);
266 status = U_ILLEGAL_ARGUMENT_ERROR;
267 }
268 return bi;
269 }
270
271
272 void BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) {
273 if (U_FAILURE(status)) {
274 return;
275 }
276
277 UnicodeString emptyString;
278 for (int32_t lineNumber=0; ;lineNumber++) { // Loop once per input line.
279 if (U_FAILURE(status)) {
280 return;
281 }
282 int32_t lineLength = 0;
283 const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status);
284 if (lineBuf == NULL) {
285 break;
286 }
287 UnicodeString line(lineBuf, lineLength);
288
289 // Strip comment lines.
290 fCommentsMatcher->reset(line);
291 line = fCommentsMatcher->replaceFirst(emptyString, status);
292 if (line.isEmpty()) {
293 continue;
294 }
295
296 // Recognize character class definition and keyword lines
297 fClassDefMatcher->reset(line);
298 if (fClassDefMatcher->matches(status)) {
299 UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status);
300 UnicodeString classDef = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status);
301 if (fMonkeyImpl->fDumpExpansions) {
302 printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)());
303 }
304 if (setKeywordParameter(className, classDef, status)) {
305 // The scanned item was "type = ..." or "locale = ...", etc.
306 // which are not actual character classes.
307 continue;
308 }
309 addCharClass(className, classDef, status);
310 continue;
311 }
312
313 // Recognize rule lines.
314 fRuleDefMatcher->reset(line);
315 if (fRuleDefMatcher->matches(status)) {
316 UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status);
317 UnicodeString ruleDef = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status);
318 if (fMonkeyImpl->fDumpExpansions) {
319 printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)());
320 }
321 addRule(ruleName, ruleDef, status);
322 continue;
323 }
324
325 IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
326 __FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)());
327 }
328
329 // Build the vector of char classes, omitting the dictionary class if there is one.
330 // This will be used when constructing the random text to be tested.
331
332 // Also compute the "other" set, consisting of any characters not included in
333 // one or more of the user defined sets.
334
335 UnicodeSet otherSet((UChar32)0, 0x10ffff);
336 int32_t pos = UHASH_FIRST;
337 const UHashElement *el = NULL;
338 while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) {
339 const UnicodeString *ccName = static_cast<const UnicodeString *>(el->key.pointer);
340 CharClass *cclass = static_cast<CharClass *>(el->value.pointer);
341 // printf(" Adding %s\n", CStr(*ccName)());
342 if (*ccName != cclass->fName) {
343 IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
344 __FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)());
345 }
346 const UnicodeSet *set = cclass->fSet.getAlias();
347 otherSet.removeAll(*set);
348 if (*ccName == UnicodeString("dictionary")) {
349 fDictionarySet = *set;
350 } else {
351 fCharClassList->addElement(cclass, status);
352 }
353 }
354
355 if (!otherSet.isEmpty()) {
356 // fprintf(stderr, "have an other set.\n");
357 UnicodeString pattern;
358 CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status);
359 fCharClassList->addElement(cclass, status);
360 }
361 }
362
363
364 const CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const {
365 int32_t localIter = 0;
366 int32_t &it = iter? *iter : localIter;
367
368 while (it < fCharClassList->size()) {
369 const CharClass *cc = static_cast<const CharClass *>(fCharClassList->elementAt(it));
370 ++it;
371 if (cc->fSet->contains(c)) {
372 return cc;
373 }
374 }
375 return NULL;
376 }
377
378 //---------------------------------------------------------------------------------------
379 //
380 // class MonkeyTestData implementation.
381 //
382 //---------------------------------------------------------------------------------------
383
384 void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) {
385 const int32_t dataLength = 1000;
386
387 // Fill the test string with random characters.
388 // First randomly pick a char class, then randomly pick a character from that class.
389 // Exclude any characters from the dictionary set.
390
391 // std::cout << "Populating Test Data" << std::endl;
392 fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages,
393 // allowing recreation of failing data.
394 fBkRules = rules;
395 fString.remove();
396 for (int32_t n=0; n<dataLength;) {
397 int charClassIndex = rand() % rules->fCharClassList->size();
398 const CharClass *cclass = static_cast<CharClass *>(rules->fCharClassList->elementAt(charClassIndex));
399 if (cclass->fSet->size() == 0) {
400 // Some rules or tailorings do end up with empty char classes.
401 continue;
402 }
403 int32_t charIndex = rand() % cclass->fSet->size();
404 UChar32 c = cclass->fSet->charAt(charIndex);
405 if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) {
406 // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
407 // Don't let random unpaired surrogates combine in the test data because they might
408 // produce an unwanted dictionary character.
409 continue;
410 }
411
412 if (!rules->fDictionarySet.contains(c)) {
413 fString.append(c);
414 ++n;
415 }
416 }
417
418 // Reset each rule matcher regex with this new string.
419 // (Although we are always using the same string object, ICU regular expressions
420 // don't like the underlying string data changing without doing a reset).
421
422 for (int32_t ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
423 BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
424 rule->fRuleMatcher->reset(fString);
425 }
426
427 // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
428 // Expected and Actual breaks are one longer than the input string; a non-zero value
429 // will indicate a boundary preceding that position.
430
431 clearActualBreaks();
432 fExpectedBreaks = fActualBreaks;
433 fRuleForPosition = fActualBreaks;
434 f2ndRuleForPos = fActualBreaks;
435
436 // Apply reference rules to find the expected breaks.
437
438 fExpectedBreaks.setCharAt(0, (UChar)1); // Force an expected break before the start of the text.
439 // ICU always reports a break there.
440 // The reference rules do not have a means to do so.
441 int32_t strIdx = 0;
442 while (strIdx < fString.length()) {
443 BreakRule *matchingRule = NULL;
444 UBool hasBreak = FALSE;
445 int32_t ruleNum = 0;
446 int32_t matchStart = 0;
447 int32_t matchEnd = 0;
448 int32_t breakGroup = 0;
449 for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
450 BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
451 rule->fRuleMatcher->reset();
452 if (rule->fRuleMatcher->lookingAt(strIdx, status)) {
453 // A candidate rule match, check further to see if we take it or continue to check other rules.
454 // Matches of zero or one codepoint count only if they also specify a break.
455 matchStart = rule->fRuleMatcher->start(status);
456 matchEnd = rule->fRuleMatcher->end(status);
457 breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status);
458 hasBreak = U_SUCCESS(status);
459 if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) {
460 status = U_ZERO_ERROR;
461 }
462 if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) {
463 matchingRule = rule;
464 break;
465 }
466 }
467 }
468 if (matchingRule == NULL) {
469 // No reference rule matched. This is an error in the rules that should never happen.
470 IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
471 __FILE__, __LINE__, strIdx);
472 dump(strIdx);
473 status = U_INVALID_FORMAT_ERROR;
474 return;
475 }
476 if (matchingRule->fRuleMatcher->group(status).length() == 0) {
477 // Zero length rule match. This is also an error in the rule expressions.
478 IntlTest::gTest->errln("%s:%d Zero length rule match.",
479 __FILE__, __LINE__);
480 status = U_INVALID_FORMAT_ERROR;
481 return;
482 }
483
484 // Record which rule matched over the length of the match.
485 for (int i = matchStart; i < matchEnd; i++) {
486 if (fRuleForPosition.charAt(i) == 0) {
487 fRuleForPosition.setCharAt(i, (UChar)ruleNum);
488 } else {
489 f2ndRuleForPos.setCharAt(i, (UChar)ruleNum);
490 }
491 }
492
493 // Break positions appear in rules as a matching named capture of zero length at the break position,
494 // the adjusted pattern contains (?<BreakPosition>)
495 if (hasBreak) {
496 int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status);
497 if (U_FAILURE(status) || breakPos < 0) {
498 // Rule specified a break, but that break wasn't part of the match, even
499 // though the rule as a whole matched.
500 // Can't happen with regular expressions derived from (equivalent to) ICU break rules.
501 // Shouldn't get here.
502 IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__);
503 status = U_INVALID_FORMAT_ERROR;
504 break;
505 }
506 fExpectedBreaks.setCharAt(breakPos, (UChar)1);
507 // printf("recording break at %d\n", breakPos);
508 // For the next iteration, pick up applying rules immediately after the break,
509 // which may differ from end of the match. The matching rule may have included
510 // context following the boundary that needs to be looked at again.
511 strIdx = matchingRule->fRuleMatcher->end(breakGroup, status);
512 } else {
513 // Original rule didn't specify a break.
514 // Continue applying rules starting on the last code point of this match.
515 strIdx = fString.moveIndex32(matchEnd, -1);
516 if (strIdx == matchStart) {
517 // Match was only one code point, no progress if we continue.
518 // Shouldn't get here, case is filtered out at top of loop.
519 CharString ruleName;
520 ruleName.appendInvariantChars(matchingRule->fName, status);
521 IntlTest::gTest->errln("%s:%d Rule %s internal error",
522 __FILE__, __LINE__, ruleName.data());
523 status = U_INVALID_FORMAT_ERROR;
524 break;
525 }
526 }
527 if (U_FAILURE(status)) {
528 IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
529 __FILE__, __LINE__, u_errorName(status));
530 break;
531 }
532 }
533 }
534
535 void MonkeyTestData::clearActualBreaks() {
536 fActualBreaks.remove();
537 // Actual Breaks length is one longer than the data string length, allowing
538 // for breaks before the first and after the last character in the data.
539 for (int32_t i=0; i<=fString.length(); i++) {
540 fActualBreaks.append((UChar)0);
541 }
542 }
543
544 void MonkeyTestData::dump(int32_t around) const {
545 printf("\n"
546 " char break Rule Character\n"
547 " pos code class R I name name\n"
548 "---------------------------------------------------------------------------------------------\n");
549
550 int32_t start;
551 int32_t end;
552
553 if (around == -1) {
554 start = 0;
555 end = fString.length();
556 } else {
557 // Display context around a failure.
558 start = fString.moveIndex32(around, -30);
559 end = fString.moveIndex32(around, +30);
560 }
561
562 for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) {
563 UErrorCode status = U_ZERO_ERROR;
564 UChar32 c = fString.char32At(charIdx);
565 const CharClass *cc = fBkRules->getClassForChar(c);
566 CharString ccName;
567 ccName.appendInvariantChars(cc->fName, status);
568 CharString ruleName, secondRuleName;
569 const BreakRule *rule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx)));
570 ruleName.appendInvariantChars(rule->fName, status);
571 if (f2ndRuleForPos.charAt(charIdx) > 0) {
572 const BreakRule *secondRule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx)));
573 secondRuleName.appendInvariantChars(secondRule->fName, status);
574 }
575 char cName[200];
576 u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
577
578 printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n",
579 charIdx, c, ccName.data(),
580 fExpectedBreaks.charAt(charIdx) ? '*' : '.',
581 fActualBreaks.charAt(charIdx) ? '*' : '.',
582 ruleName.data(), secondRuleName.data(), cName
583 );
584 }
585 }
586
587
588 //---------------------------------------------------------------------------------------
589 //
590 // class RBBIMonkeyImpl
591 //
592 //---------------------------------------------------------------------------------------
593
594 RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status) : fDumpExpansions(FALSE), fThread(this) {
595 (void)status; // suppress unused parameter compiler warning.
596 }
597
598
599 // RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the
600 // reference rules and creating the icu breakiterator to test,
601 // with its type and locale coming from the reference rules.
602
603 void RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) {
604 fRuleFileName = ruleFile;
605 openBreakRules(ruleFile, status);
606 if (U_FAILURE(status)) {
607 IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
608 return;
609 }
610 fRuleSet.adoptInstead(new BreakRules(this, status));
611 fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status);
612 if (U_FAILURE(status)) {
613 IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
614 return;
615 }
616 fBI.adoptInstead(fRuleSet->createICUBreakIterator(status));
617 fTestData.adoptInstead(new MonkeyTestData());
618 }
619
620
621 RBBIMonkeyImpl::~RBBIMonkeyImpl() {
622 }
623
624
625 void RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) {
626 CharString path;
627 path.append(IntlTest::getSourceTestData(status), status);
628 path.append("break_rules" U_FILE_SEP_STRING, status);
629 path.appendPathPart(fileName, status);
630 const char *codePage = "UTF-8";
631 fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status));
632 }
633
634
635 void RBBIMonkeyImpl::startTest() {
636 fThread.start(); // invokes runTest() in a separate thread.
637 }
638
639 void RBBIMonkeyImpl::join() {
640 fThread.join();
641 }
642
643
// Report a monkey test failure at a given string index, including the
// parameters needed to reproduce it, dump the surrounding test data when
// running verbose, and set the enclosing function's 'status' to
// U_INVALID_STATE_ERROR.
// For use inside RBBIMonkeyImpl member functions that have a UErrorCode
// variable named 'status' in scope.
// Wrapped in do { } while (0) so that  MONKEY_ERROR(...);  behaves as a
// single statement everywhere, including as the body of an if with an else.
#define MONKEY_ERROR(msg, index) do { \
    IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
                    __FILE__, __LINE__, (msg), (index), fRuleFileName, fTestData->fRandomSeed); \
    if (fVerbose) { fTestData->dump(index); } \
    status = U_INVALID_STATE_ERROR;  \
} while (0)
650
651 void RBBIMonkeyImpl::runTest() {
652 UErrorCode status = U_ZERO_ERROR;
653 int32_t errorCount = 0;
654 for (int64_t loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
655 status = U_ZERO_ERROR;
656 fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status);
657 if (fBI.isNull()) {
658 IntlTest::gTest->dataerrln("Unable to run test because fBI is null.");
659 return;
660 }
661 if ( uprv_strcmp(fRuleFileName,"line_loose_cj.txt") == 0 && fTestData->fRandomSeed==1712915859 ) {
662 continue; // known bug around index 103-104, break expected/actual 0/1, fwd 0020 200D | FDFC, rev 1325A 0020 | 200D
663 }
664 // fTestData->dump();
665 testForwards(status);
666 testPrevious(status);
667 testFollowing(status);
668 testPreceding(status);
669 testIsBoundary(status);
670
671 if (fLoopCount < 0 && loopCount % 100 == 0) {
672 fprintf(stderr, ".");
673 }
674 if (U_FAILURE(status)) {
675 if (++errorCount > 10) {
676 return;
677 }
678 }
679 }
680 }
681
682 void RBBIMonkeyImpl::testForwards(UErrorCode &status) {
683 if (U_FAILURE(status)) {
684 return;
685 }
686 fTestData->clearActualBreaks();
687 fBI->setText(fTestData->fString);
688 int32_t previousBreak = -2;
689 for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) {
690 if (bk <= previousBreak) {
691 MONKEY_ERROR("Break Iterator Stall", bk);
692 return;
693 }
694 if (bk < 0 || bk > fTestData->fString.length()) {
695 MONKEY_ERROR("Boundary out of bounds", bk);
696 return;
697 }
698 fTestData->fActualBreaks.setCharAt(bk, 1);
699 }
700 checkResults("testForwards", FORWARD, status);
701 }
702
703 void RBBIMonkeyImpl::testFollowing(UErrorCode &status) {
704 if (U_FAILURE(status)) {
705 return;
706 }
707 fTestData->clearActualBreaks();
708 fBI->setText(fTestData->fString);
709 int32_t nextBreak = -1;
710 for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) {
711 int32_t bk = fBI->following(i);
712 if (bk == BreakIterator::DONE && i == fTestData->fString.length()) {
713 continue;
714 }
715 if (bk == nextBreak && bk > i) {
716 // i is in the gap between two breaks.
717 continue;
718 }
719 if (i == nextBreak && bk > nextBreak) {
720 fTestData->fActualBreaks.setCharAt(bk, 1);
721 nextBreak = bk;
722 continue;
723 }
724 MONKEY_ERROR("following(i)", i);
725 return;
726 }
727 checkResults("testFollowing", FORWARD, status);
728 }
729
730
731
732 void RBBIMonkeyImpl::testPrevious(UErrorCode &status) {
733 if (U_FAILURE(status)) {return;}
734
735 fTestData->clearActualBreaks();
736 fBI->setText(fTestData->fString);
737 int32_t previousBreak = INT32_MAX;
738 for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) {
739 if (bk >= previousBreak) {
740 MONKEY_ERROR("Break Iterator Stall", bk);
741 return;
742 }
743 if (bk < 0 || bk > fTestData->fString.length()) {
744 MONKEY_ERROR("Boundary out of bounds", bk);
745 return;
746 }
747 fTestData->fActualBreaks.setCharAt(bk, 1);
748 }
749 checkResults("testPrevious", REVERSE, status);
750 }
751
752
753 void RBBIMonkeyImpl::testPreceding(UErrorCode &status) {
754 if (U_FAILURE(status)) {
755 return;
756 }
757 fTestData->clearActualBreaks();
758 fBI->setText(fTestData->fString);
759 int32_t nextBreak = fTestData->fString.length()+1;
760 for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) {
761 int32_t bk = fBI->preceding(i);
762 // printf("i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak);
763 if (bk == BreakIterator::DONE && i == 0) {
764 continue;
765 }
766 if (bk == nextBreak && bk < i) {
767 // i is in the gap between two breaks.
768 continue;
769 }
770 if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) {
771 // i indexes to a trailing surrogate.
772 // Break Iterators treat an index to either half as referring to the supplemental code point,
773 // with preceding going to some preceding code point.
774 if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) {
775 MONKEY_ERROR("preceding of trailing surrogate error", i);
776 }
777 continue;
778 }
779 if (i == nextBreak && bk < nextBreak) {
780 fTestData->fActualBreaks.setCharAt(bk, 1);
781 nextBreak = bk;
782 continue;
783 }
784 MONKEY_ERROR("preceding(i)", i);
785 return;
786 }
787 checkResults("testPreceding", REVERSE, status);
788 }
789
790
791 void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) {
792 if (U_FAILURE(status)) {
793 return;
794 }
795 fTestData->clearActualBreaks();
796 fBI->setText(fTestData->fString);
797 for (int i=fTestData->fString.length(); i>=0; --i) {
798 if (fBI->isBoundary(i)) {
799 fTestData->fActualBreaks.setCharAt(i, 1);
800 }
801 }
802 checkResults("testForwards", FORWARD, status);
803 }
804
805 void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) {
806 if (U_FAILURE(status)) {
807 return;
808 }
809 if (direction == FORWARD) {
810 for (int i=0; i<=fTestData->fString.length(); ++i) {
811 if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
812 if (i > 1) {
813 IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
814 __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-2), fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
815 fRuleFileName, fTestData->fRandomSeed);
816 } else {
817 IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
818 __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
819 fRuleFileName, fTestData->fRandomSeed);
820 }
821 if (fVerbose) {
822 fTestData->dump(i);
823 }
824 status = U_INVALID_STATE_ERROR; // Prevent the test from continuing, which would likely
825 break; // produce many redundant errors.
826 }
827 }
828 } else {
829 for (int i=fTestData->fString.length(); i>=0; i--) {
830 if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
831 if (i > 1) {
832 IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
833 __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-2), fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
834 fRuleFileName, fTestData->fRandomSeed);
835 } else {
836 IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
837 __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
838 fRuleFileName, fTestData->fRandomSeed);
839 }
840 if (fVerbose) {
841 fTestData->dump(i);
842 }
843 status = U_INVALID_STATE_ERROR;
844 break;
845 }
846 }
847 }
848 }
849
850
851
852 //---------------------------------------------------------------------------------------
853 //
854 // class RBBIMonkeyTest implementation.
855 //
856 //---------------------------------------------------------------------------------------
857 RBBIMonkeyTest::RBBIMonkeyTest() {
858 }
859
860 RBBIMonkeyTest::~RBBIMonkeyTest() {
861 }
862
863
864 // params, taken from this->fParams.
865 // rules=file_name Name of file containing the reference rules.
866 // seed=nnnnn Random number starting seed.
867 // Setting the seed allows errors to be reproduced.
868 // loop=nnn Looping count. Controls running time.
869 // -1: run forever.
870 // 0 or greater: run length.
871 // expansions debug option, show expansions of rules and sets.
872 // verbose Display details of the failure.
873 //
874 // Parameters on the intltest command line follow the test name, and are preceded by '@'.
875 // For example,
876 // intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
877 //
878 void RBBIMonkeyTest::testMonkey() {
879 // printf("Test parameters: %s\n", fParams);
880 UnicodeString params(fParams);
881 UErrorCode status = U_ZERO_ERROR;
882
883 const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
884 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
885 NULL };
886 CharString testNameFromParams;
887 if (getStringParam("rules", params, testNameFromParams, status)) {
888 tests[0] = testNameFromParams.data();
889 tests[1] = NULL;
890 }
891
892 int64_t loopCount = quick? 100 : 5000;
893 getIntParam("loop", params, loopCount, status);
894
895 UBool dumpExpansions = FALSE;
896 getBoolParam("expansions", params, dumpExpansions, status);
897
898 UBool verbose = FALSE;
899 getBoolParam("verbose", params, verbose, status);
900
901 int64_t seed = 0;
902 getIntParam("seed", params, seed, status);
903
904 if (params.length() != 0) {
905 // Options processing did not consume all of the parameters. Something unrecognized was present.
906 CharString unrecognizedParameters;
907 unrecognizedParameters.append(CStr(params)(), -1, status);
908 errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data());
909 return;
910 }
911
912 UVector startedTests(status);
913 if (U_FAILURE(status)) {
914 errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status));
915 return;
916 }
917
918 // Monkey testing is multi-threaded.
919 // Each set of break rules to be tested is run in a separate thread.
920 // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
921 int32_t i;
922 for (i=0; tests[i] != NULL; ++i) {
923 logln("beginning testing of %s", tests[i]);
924 RBBIMonkeyImpl *test = new RBBIMonkeyImpl(status);
925 test->fDumpExpansions = dumpExpansions;
926 test->fVerbose = verbose;
927 test->fRandomGenerator.seed((uint32_t)seed);
928 test->fLoopCount = loopCount;
929 test->setup(tests[i], status);
930 test->startTest();
931 startedTests.addElement(test, status);
932 if (U_FAILURE(status)) {
933 break;
934 }
935 }
936
937 if (U_FAILURE(status)) {
938 dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
939 }
940
941 for (i=0; i<startedTests.size(); ++i) {
942 RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i));
943 test->join();
944 delete test;
945 }
946 }
947
948
949 UBool RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status) {
950 name.append(" *= *(-?\\d+) *,? *");
951 RegexMatcher m(name, params, 0, status);
952 if (m.find()) {
953 // The param exists. Convert the string to an int.
954 CharString str;
955 str.append(CStr(m.group(1, status))(), -1, status);
956 val = strtol(str.data(), NULL, 10);
957
958 // Delete this parameter from the params string.
959 m.reset();
960 params = m.replaceFirst(UnicodeString(), status);
961 return TRUE;
962 }
963 return FALSE;
964 }
965
966 UBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status) {
967 name.append(" *= *([^ ,]*) *,? *");
968 RegexMatcher m(name, params, 0, status);
969 if (m.find()) {
970 // The param exists.
971 dest.append(CStr(m.group(1, status))(), -1, status);
972
973 // Delete this parameter from the params string.
974 m.reset();
975 params = m.replaceFirst(UnicodeString(), status);
976 return TRUE;
977 }
978 return FALSE;
979 }
980
981 UBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status) {
982 name.append("(?: *= *(true|false))? *,? *");
983 RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status);
984 if (m.find()) {
985 if (m.start(1, status) > 0) {
986 // user option included a value.
987 dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0;
988 } else {
989 // No explicit user value, implies true.
990 dest = TRUE;
991 }
992
993 // Delete this parameter from the params string.
994 m.reset();
995 params = m.replaceFirst(UnicodeString(), status);
996 return TRUE;
997 }
998 return FALSE;
999 }
1000
1001 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */