]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/rbbimonkeytest.cpp
ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbimonkeytest.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * Copyright (c) 2016, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 ********************************************************************/
7
8
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
12
13 #include "rbbimonkeytest.h"
14 #include "unicode/utypes.h"
15 #include "unicode/brkiter.h"
16 #include "unicode/utf16.h"
17 #include "unicode/uniset.h"
18 #include "unicode/unistr.h"
19
20 #include "charstr.h"
21 #include "cmemory.h"
22 #include "cstr.h"
23 #include "uelement.h"
24 #include "uhash.h"
25 #include "cstring.h"
26
27 #include <iostream>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string>
31
32 using namespace icu;
33
34
35 void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) {
36 fParams = params; // Work around TESTCASE_AUTO not being able to pass params to test function.
37
38 TESTCASE_AUTO_BEGIN;
39 TESTCASE_AUTO(testMonkey);
40 TESTCASE_AUTO_END;
41 }
42
43 //---------------------------------------------------------------------------------------
44 //
45 // class BreakRule implementation.
46 //
47 //---------------------------------------------------------------------------------------
48
49 BreakRule::BreakRule() // : all field default initialized.
50 {
51 }
52
53 BreakRule::~BreakRule() {}
54
55
56 //---------------------------------------------------------------------------------------
57 //
58 // class BreakRules implementation.
59 //
60 //---------------------------------------------------------------------------------------
61 BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status) :
62 fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) {
63 fCharClasses.adoptInstead(uhash_open(uhash_hashUnicodeString,
64 uhash_compareUnicodeString,
65 NULL, // value comparator.
66 &status));
67 if (U_FAILURE(status)) {
68 return;
69 }
70 uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
71 uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
72 fBreakRules.setDeleter(uprv_deleteUObject);
73
74 fCharClassList.adoptInstead(new UVector(status));
75
76 fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
77 "(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative look behind for '{' or '=' or '[:'
78 // (the identifier is a unicode property name or value)
79 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name
80 0, status));
81
82 // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
83 fCommentsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
84 "(^|(?<=;))" // Start either at start of line, or just after a ';' (look-behind for ';')
85 "[ \\t]*+" // Match white space.
86 "(#.*)?+" // Optional # plus whatever follows
87 "\\R$" // new-line at end of line.
88 ), 0, status));
89
90 // Match (initial parse) of a character class definition line.
91 fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
92 "[ \\t]*" // leading white space
93 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name
94 "[ \\t]*=[ \\t]*" // =
95 "(?<ClassDef>.*?)" // The char class UnicodeSet expression
96 "[ \\t]*;$"), // ; <end of line>
97 0, status));
98
99 // Match (initial parse) of a break rule line.
100 fRuleDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
101 "[ \\t]*" // leading white space
102 "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)" // The rule name
103 "[ \\t]*:[ \\t]*" // :
104 "(?<RuleDef>.*?)" // The rule definition
105 "[ \\t]*;$"), // ; <end of line>
106 0, status));
107
108 }
109
110
111 BreakRules::~BreakRules() {}
112
113
114 CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
115
116 // Create the expanded definition for this char class,
117 // replacing any set references with the corresponding definition.
118
119 UnicodeString expandedDef;
120 UnicodeString emptyString;
121 fSetRefsMatcher->reset(definition);
122 while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
123 const UnicodeString name =
124 fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
125 CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
126 const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
127
128 fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status);
129 expandedDef.append(expansionForName);
130 }
131 fSetRefsMatcher->appendTail(expandedDef);
132
133 // Verify that the expanded set definition is valid.
134
135 if (fMonkeyImpl->fDumpExpansions) {
136 printf("epandedDef: %s\n", CStr(expandedDef)());
137 }
138
139 UnicodeSet *s = new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status);
140 if (U_FAILURE(status)) {
141 IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__, __LINE__,
142 u_errorName(status), CStr(name)());
143 return NULL;
144 }
145 CharClass *cclass = new CharClass(name, definition, expandedDef, s);
146 CharClass *previousClass = static_cast<CharClass *>(uhash_put(fCharClasses.getAlias(),
147 new UnicodeString(name), // Key, owned by hash table.
148 cclass, // Value, owned by hash table.
149 &status));
150
151 if (previousClass != NULL) {
152 // Duplicate class def.
153 // These are legitimate, they are adjustments of an existing class.
154 // TODO: will need to keep the old around when we handle tailorings.
155 IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)());
156 delete previousClass;
157 }
158 return cclass;
159 }
160
161
162 void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
163 LocalPointer<BreakRule> thisRule(new BreakRule);
164 thisRule->fName = name;
165 thisRule->fRule = definition;
166
167 // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes,
168 // This gives a numeric sort order that matches Unicode UAX rule numbering conventions.
169 UnicodeString emptyString;
170
171 // Expand the char class definitions within the rule.
172 fSetRefsMatcher->reset(definition);
173 while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
174 const UnicodeString name =
175 fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
176 CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
177 if (!nameClass) {
178 IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
179 __FILE__, __LINE__, CStr(name)(), CStr(definition)());
180 }
181 const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
182
183 fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status);
184 thisRule->fExpandedRule.append(expansionForName);
185 }
186 fSetRefsMatcher->appendTail(thisRule->fExpandedRule);
187
188 // If rule begins with a '^' rule chaining is disallowed.
189 // Strip off the '^' from the rule expression, and set the flag.
190 if (thisRule->fExpandedRule.charAt(0) == u'^') {
191 thisRule->fInitialMatchOnly = true;
192 thisRule->fExpandedRule.remove(0, 1);
193 thisRule->fExpandedRule.trim();
194 }
195
196 // Replace the divide sign (\u00f7) with a regular expression named capture.
197 // When running the rules, a match that includes this group means we found a break position.
198
199 int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7);
200 if (dividePos >= 0) {
201 thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)"));
202 }
203 if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) {
204 status = U_ILLEGAL_ARGUMENT_ERROR; // TODO: produce a good error message.
205 }
206
207 // UAX break rule set definitions can be empty, just [].
208 // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
209 // also matches nothing.
210
211 static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0};
212 int32_t where = 0;
213 while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) {
214 thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
215 }
216 if (fMonkeyImpl->fDumpExpansions) {
217 printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)());
218 }
219
220 // Compile a regular expression for this rule.
221 thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status));
222 if (U_FAILURE(status)) {
223 IntlTest::gTest->errln("%s:%d Error creating regular expression for %s",
224 __FILE__, __LINE__, CStr(thisRule->fExpandedRule)());
225 return;
226 }
227
228 // Put this new rule into the vector of all Rules.
229 fBreakRules.addElement(thisRule.orphan(), status);
230 }
231
232
233 bool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) {
234 if (keyword == UnicodeString("locale")) {
235 CharString localeName;
236 localeName.append(CStr(value)(), -1, status);
237 fLocale = Locale::createFromName(localeName.data());
238 return true;
239 }
240 if (keyword == UnicodeString("type")) {
241 if (value == UnicodeString("grapheme")) {
242 fType = UBRK_CHARACTER;
243 } else if (value == UnicodeString("word")) {
244 fType = UBRK_WORD;
245 } else if (value == UnicodeString("line")) {
246 fType = UBRK_LINE;
247 } else if (value == UnicodeString("sentence")) {
248 fType = UBRK_SENTENCE;
249 } else {
250 IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__, CStr(value)());
251 }
252 return true;
253 }
254 // TODO: add tailoring base setting here.
255 return false;
256 }
257
258 RuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) {
259 if (U_FAILURE(status)) {
260 return NULL;
261 }
262 RuleBasedBreakIterator *bi = NULL;
263 switch(fType) {
264 case UBRK_CHARACTER:
265 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createCharacterInstance(fLocale, status));
266 break;
267 case UBRK_WORD:
268 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createWordInstance(fLocale, status));
269 break;
270 case UBRK_LINE:
271 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(fLocale, status));
272 break;
273 case UBRK_SENTENCE:
274 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createSentenceInstance(fLocale, status));
275 break;
276 default:
277 IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType);
278 status = U_ILLEGAL_ARGUMENT_ERROR;
279 }
280 return bi;
281 }
282
283
284 void BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) {
285 if (U_FAILURE(status)) {
286 return;
287 }
288
289 UnicodeString emptyString;
290 for (int32_t lineNumber=0; ;lineNumber++) { // Loop once per input line.
291 if (U_FAILURE(status)) {
292 return;
293 }
294 int32_t lineLength = 0;
295 const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status);
296 if (lineBuf == NULL) {
297 break;
298 }
299 UnicodeString line(lineBuf, lineLength);
300
301 // Strip comment lines.
302 fCommentsMatcher->reset(line);
303 line = fCommentsMatcher->replaceFirst(emptyString, status);
304 if (line.isEmpty()) {
305 continue;
306 }
307
308 // Recognize character class definition and keyword lines
309 fClassDefMatcher->reset(line);
310 if (fClassDefMatcher->matches(status)) {
311 UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status);
312 UnicodeString classDef = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status);
313 if (fMonkeyImpl->fDumpExpansions) {
314 printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)());
315 }
316 if (setKeywordParameter(className, classDef, status)) {
317 // The scanned item was "type = ..." or "locale = ...", etc.
318 // which are not actual character classes.
319 continue;
320 }
321 addCharClass(className, classDef, status);
322 continue;
323 }
324
325 // Recognize rule lines.
326 fRuleDefMatcher->reset(line);
327 if (fRuleDefMatcher->matches(status)) {
328 UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status);
329 UnicodeString ruleDef = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status);
330 if (fMonkeyImpl->fDumpExpansions) {
331 printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)());
332 }
333 addRule(ruleName, ruleDef, status);
334 continue;
335 }
336
337 IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
338 __FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)());
339 }
340
341 // Build the vector of char classes, omitting the dictionary class if there is one.
342 // This will be used when constructing the random text to be tested.
343
344 // Also compute the "other" set, consisting of any characters not included in
345 // one or more of the user defined sets.
346
347 UnicodeSet otherSet((UChar32)0, 0x10ffff);
348 int32_t pos = UHASH_FIRST;
349 const UHashElement *el = NULL;
350 while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) {
351 const UnicodeString *ccName = static_cast<const UnicodeString *>(el->key.pointer);
352 CharClass *cclass = static_cast<CharClass *>(el->value.pointer);
353 // printf(" Adding %s\n", CStr(*ccName)());
354 if (*ccName != cclass->fName) {
355 IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
356 __FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)());
357 }
358 const UnicodeSet *set = cclass->fSet.getAlias();
359 otherSet.removeAll(*set);
360 if (*ccName == UnicodeString("dictionary")) {
361 fDictionarySet = *set;
362 } else {
363 fCharClassList->addElement(cclass, status);
364 }
365 }
366
367 if (!otherSet.isEmpty()) {
368 // fprintf(stderr, "have an other set.\n");
369 UnicodeString pattern;
370 CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status);
371 fCharClassList->addElement(cclass, status);
372 }
373 }
374
375
376 const CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const {
377 int32_t localIter = 0;
378 int32_t &it = iter? *iter : localIter;
379
380 while (it < fCharClassList->size()) {
381 const CharClass *cc = static_cast<const CharClass *>(fCharClassList->elementAt(it));
382 ++it;
383 if (cc->fSet->contains(c)) {
384 return cc;
385 }
386 }
387 return NULL;
388 }
389
390 //---------------------------------------------------------------------------------------
391 //
392 // class MonkeyTestData implementation.
393 //
394 //---------------------------------------------------------------------------------------
395
396 void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) {
397 const int32_t dataLength = 1000;
398
399 // Fill the test string with random characters.
400 // First randomly pick a char class, then randomly pick a character from that class.
401 // Exclude any characters from the dictionary set.
402
403 // std::cout << "Populating Test Data" << std::endl;
404 fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages,
405 // allowing recreation of failing data.
406 fBkRules = rules;
407 fString.remove();
408 for (int32_t n=0; n<dataLength;) {
409 int charClassIndex = rand() % rules->fCharClassList->size();
410 const CharClass *cclass = static_cast<CharClass *>(rules->fCharClassList->elementAt(charClassIndex));
411 if (cclass->fSet->size() == 0) {
412 // Some rules or tailorings do end up with empty char classes.
413 continue;
414 }
415 int32_t charIndex = rand() % cclass->fSet->size();
416 UChar32 c = cclass->fSet->charAt(charIndex);
417 if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) {
418 // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
419 // Don't let random unpaired surrogates combine in the test data because they might
420 // produce an unwanted dictionary character.
421 continue;
422 }
423
424 if (!rules->fDictionarySet.contains(c)) {
425 fString.append(c);
426 ++n;
427 }
428 }
429
430 // Reset each rule matcher regex with this new string.
431 // (Although we are always using the same string object, ICU regular expressions
432 // don't like the underlying string data changing without doing a reset).
433
434 for (int32_t ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
435 BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
436 rule->fRuleMatcher->reset(fString);
437 }
438
439 // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
440 // Expected and Actual breaks are one longer than the input string; a non-zero value
441 // will indicate a boundary preceding that position.
442
443 clearActualBreaks();
444 fExpectedBreaks = fActualBreaks;
445 fRuleForPosition = fActualBreaks;
446 f2ndRuleForPos = fActualBreaks;
447
448 // Apply reference rules to find the expected breaks.
449
450 fExpectedBreaks.setCharAt(0, (UChar)1); // Force an expected break before the start of the text.
451 // ICU always reports a break there.
452 // The reference rules do not have a means to do so.
453 int32_t strIdx = 0;
454 bool initialMatch = true; // True at start of text, and immediately after each boundary,
455 // for control over rule chaining.
456 while (strIdx < fString.length()) {
457 BreakRule *matchingRule = NULL;
458 UBool hasBreak = FALSE;
459 int32_t ruleNum = 0;
460 int32_t matchStart = 0;
461 int32_t matchEnd = 0;
462 int32_t breakGroup = 0;
463 for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
464 BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
465 if (rule->fInitialMatchOnly && !initialMatch) {
466 // Skip checking this '^' rule. (No rule chaining)
467 continue;
468 }
469 rule->fRuleMatcher->reset();
470 if (rule->fRuleMatcher->lookingAt(strIdx, status)) {
471 // A candidate rule match, check further to see if we take it or continue to check other rules.
472 // Matches of zero or one codepoint count only if they also specify a break.
473 matchStart = rule->fRuleMatcher->start(status);
474 matchEnd = rule->fRuleMatcher->end(status);
475 breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status);
476 hasBreak = U_SUCCESS(status);
477 if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) {
478 status = U_ZERO_ERROR;
479 }
480 if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) {
481 matchingRule = rule;
482 break;
483 }
484 }
485 }
486 if (matchingRule == NULL) {
487 // No reference rule matched. This is an error in the rules that should never happen.
488 IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
489 __FILE__, __LINE__, strIdx);
490 dump(strIdx);
491 status = U_INVALID_FORMAT_ERROR;
492 return;
493 }
494 if (matchingRule->fRuleMatcher->group(status).length() == 0) {
495 // Zero length rule match. This is also an error in the rule expressions.
496 IntlTest::gTest->errln("%s:%d Zero length rule match.",
497 __FILE__, __LINE__);
498 status = U_INVALID_FORMAT_ERROR;
499 return;
500 }
501
502 // Record which rule matched over the length of the match.
503 for (int i = matchStart; i < matchEnd; i++) {
504 if (fRuleForPosition.charAt(i) == 0) {
505 fRuleForPosition.setCharAt(i, (UChar)ruleNum);
506 } else {
507 f2ndRuleForPos.setCharAt(i, (UChar)ruleNum);
508 }
509 }
510
511 // Break positions appear in rules as a matching named capture of zero length at the break position,
512 // the adjusted pattern contains (?<BreakPosition>)
513 if (hasBreak) {
514 int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status);
515 if (U_FAILURE(status) || breakPos < 0) {
516 // Rule specified a break, but that break wasn't part of the match, even
517 // though the rule as a whole matched.
518 // Can't happen with regular expressions derived from (equivalent to) ICU break rules.
519 // Shouldn't get here.
520 IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__);
521 status = U_INVALID_FORMAT_ERROR;
522 break;
523 }
524 fExpectedBreaks.setCharAt(breakPos, (UChar)1);
525 // printf("recording break at %d\n", breakPos);
526 // For the next iteration, pick up applying rules immediately after the break,
527 // which may differ from end of the match. The matching rule may have included
528 // context following the boundary that needs to be looked at again.
529 strIdx = matchingRule->fRuleMatcher->end(breakGroup, status);
530 initialMatch = true;
531 } else {
532 // Original rule didn't specify a break.
533 // Continue applying rules starting on the last code point of this match.
534 strIdx = fString.moveIndex32(matchEnd, -1);
535 initialMatch = false;
536 if (strIdx == matchStart) {
537 // Match was only one code point, no progress if we continue.
538 // Shouldn't get here, case is filtered out at top of loop.
539 CharString ruleName;
540 ruleName.appendInvariantChars(matchingRule->fName, status);
541 IntlTest::gTest->errln("%s:%d Rule %s internal error",
542 __FILE__, __LINE__, ruleName.data());
543 status = U_INVALID_FORMAT_ERROR;
544 break;
545 }
546 }
547 if (U_FAILURE(status)) {
548 IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
549 __FILE__, __LINE__, u_errorName(status));
550 break;
551 }
552 }
553 }
554
555 void MonkeyTestData::clearActualBreaks() {
556 fActualBreaks.remove();
557 // Actual Breaks length is one longer than the data string length, allowing
558 // for breaks before the first and after the last character in the data.
559 for (int32_t i=0; i<=fString.length(); i++) {
560 fActualBreaks.append((UChar)0);
561 }
562 }
563
564 void MonkeyTestData::dump(int32_t around) const {
565 printf("\n"
566 " char break Rule Character\n"
567 " pos code class R I name name\n"
568 "---------------------------------------------------------------------------------------------\n");
569
570 int32_t start;
571 int32_t end;
572
573 if (around == -1) {
574 start = 0;
575 end = fString.length();
576 } else {
577 // Display context around a failure.
578 start = fString.moveIndex32(around, -30);
579 end = fString.moveIndex32(around, +30);
580 }
581
582 for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) {
583 UErrorCode status = U_ZERO_ERROR;
584 UChar32 c = fString.char32At(charIdx);
585 const CharClass *cc = fBkRules->getClassForChar(c);
586 CharString ccName;
587 ccName.appendInvariantChars(cc->fName, status);
588 CharString ruleName, secondRuleName;
589 const BreakRule *rule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx)));
590 ruleName.appendInvariantChars(rule->fName, status);
591 if (f2ndRuleForPos.charAt(charIdx) > 0) {
592 const BreakRule *secondRule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx)));
593 secondRuleName.appendInvariantChars(secondRule->fName, status);
594 }
595 char cName[200];
596 u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
597
598 printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n",
599 charIdx, c, ccName.data(),
600 fExpectedBreaks.charAt(charIdx) ? '*' : '.',
601 fActualBreaks.charAt(charIdx) ? '*' : '.',
602 ruleName.data(), secondRuleName.data(), cName
603 );
604 }
605 }
606
607
608 //---------------------------------------------------------------------------------------
609 //
610 // class RBBIMonkeyImpl
611 //
612 //---------------------------------------------------------------------------------------
613
614 RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status) : fDumpExpansions(FALSE), fThread(this) {
615 (void)status; // suppress unused parameter compiler warning.
616 }
617
618
619 // RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the
620 // reference rules and creating the icu breakiterator to test,
621 // with its type and locale coming from the reference rules.
622
623 void RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) {
624 fRuleFileName = ruleFile;
625 openBreakRules(ruleFile, status);
626 if (U_FAILURE(status)) {
627 IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
628 return;
629 }
630 fRuleSet.adoptInstead(new BreakRules(this, status));
631 fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status);
632 if (U_FAILURE(status)) {
633 IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
634 return;
635 }
636 fBI.adoptInstead(fRuleSet->createICUBreakIterator(status));
637 fTestData.adoptInstead(new MonkeyTestData());
638 }
639
640
641 RBBIMonkeyImpl::~RBBIMonkeyImpl() {
642 }
643
644
645 void RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) {
646 CharString path;
647 path.append(IntlTest::getSourceTestData(status), status);
648 path.append("break_rules" U_FILE_SEP_STRING, status);
649 path.appendPathPart(fileName, status);
650 const char *codePage = "UTF-8";
651 fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status));
652 }
653
654
655 void RBBIMonkeyImpl::startTest() {
656 fThread.start(); // invokes runTest() in a separate thread.
657 }
658
659 void RBBIMonkeyImpl::join() {
660 fThread.join();
661 }
662
663
// Report a test failure found at a text index, together with the parameters needed to
// reproduce it, dump the surrounding test data when verbose, and set the local 'status'
// to U_INVALID_STATE_ERROR.
// Must be used inside RBBIMonkeyImpl member functions that have a 'status' variable in scope.
// Wrapped in do { } while (false) so that "MONKEY_ERROR(...);" is a single statement and
// stays safe in unbraced if/else bodies. Note that 'index' is evaluated more than once.
#define MONKEY_ERROR(msg, index) do { \
    IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
                    __FILE__, __LINE__, (msg), (index), fRuleFileName, fTestData->fRandomSeed); \
    if (fVerbose) { fTestData->dump(index); } \
    status = U_INVALID_STATE_ERROR; \
} while (false)
670
671 void RBBIMonkeyImpl::runTest() {
672 UErrorCode status = U_ZERO_ERROR;
673 int32_t errorCount = 0;
674 for (int64_t loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
675 status = U_ZERO_ERROR;
676 fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status);
677 if (fBI.isNull()) {
678 IntlTest::gTest->dataerrln("Unable to run test because fBI is null.");
679 return;
680 }
681 if ( uprv_strcmp(fRuleFileName,"line_loose_cj.txt") == 0 && fTestData->fRandomSeed==1712915859 ) {
682 continue; // known bug around index 103-104, break expected/actual 0/1, fwd 0020 200D | FDFC, rev 1325A 0020 | 200D
683 }
684 // fTestData->dump();
685 testForwards(status);
686 testPrevious(status);
687 testFollowing(status);
688 testPreceding(status);
689 testIsBoundary(status);
690 testIsBoundaryRandom(status);
691
692 if (fLoopCount < 0 && loopCount % 100 == 0) {
693 fprintf(stderr, ".");
694 }
695 if (U_FAILURE(status)) {
696 if (++errorCount > 10) {
697 return;
698 }
699 }
700 }
701 }
702
703 void RBBIMonkeyImpl::testForwards(UErrorCode &status) {
704 if (U_FAILURE(status)) {
705 return;
706 }
707 fTestData->clearActualBreaks();
708 fBI->setText(fTestData->fString);
709 int32_t previousBreak = -2;
710 for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) {
711 if (bk <= previousBreak) {
712 MONKEY_ERROR("Break Iterator Stall", bk);
713 return;
714 }
715 if (bk < 0 || bk > fTestData->fString.length()) {
716 MONKEY_ERROR("Boundary out of bounds", bk);
717 return;
718 }
719 fTestData->fActualBreaks.setCharAt(bk, 1);
720 }
721 checkResults("testForwards", FORWARD, status);
722 }
723
724 void RBBIMonkeyImpl::testFollowing(UErrorCode &status) {
725 if (U_FAILURE(status)) {
726 return;
727 }
728 fTestData->clearActualBreaks();
729 fBI->setText(fTestData->fString);
730 int32_t nextBreak = -1;
731 for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) {
732 int32_t bk = fBI->following(i);
733 if (bk == BreakIterator::DONE && i == fTestData->fString.length()) {
734 continue;
735 }
736 if (bk == nextBreak && bk > i) {
737 // i is in the gap between two breaks.
738 continue;
739 }
740 if (i == nextBreak && bk > nextBreak) {
741 fTestData->fActualBreaks.setCharAt(bk, 1);
742 nextBreak = bk;
743 continue;
744 }
745 MONKEY_ERROR("following(i)", i);
746 return;
747 }
748 checkResults("testFollowing", FORWARD, status);
749 }
750
751
752
753 void RBBIMonkeyImpl::testPrevious(UErrorCode &status) {
754 if (U_FAILURE(status)) {return;}
755
756 fTestData->clearActualBreaks();
757 fBI->setText(fTestData->fString);
758 int32_t previousBreak = INT32_MAX;
759 for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) {
760 if (bk >= previousBreak) {
761 MONKEY_ERROR("Break Iterator Stall", bk);
762 return;
763 }
764 if (bk < 0 || bk > fTestData->fString.length()) {
765 MONKEY_ERROR("Boundary out of bounds", bk);
766 return;
767 }
768 fTestData->fActualBreaks.setCharAt(bk, 1);
769 }
770 checkResults("testPrevious", REVERSE, status);
771 }
772
773
774 void RBBIMonkeyImpl::testPreceding(UErrorCode &status) {
775 if (U_FAILURE(status)) {
776 return;
777 }
778 fTestData->clearActualBreaks();
779 fBI->setText(fTestData->fString);
780 int32_t nextBreak = fTestData->fString.length()+1;
781 for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) {
782 int32_t bk = fBI->preceding(i);
783 // printf("i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak);
784 if (bk == BreakIterator::DONE && i == 0) {
785 continue;
786 }
787 if (bk == nextBreak && bk < i) {
788 // i is in the gap between two breaks.
789 continue;
790 }
791 if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) {
792 // i indexes to a trailing surrogate.
793 // Break Iterators treat an index to either half as referring to the supplemental code point,
794 // with preceding going to some preceding code point.
795 if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) {
796 MONKEY_ERROR("preceding of trailing surrogate error", i);
797 }
798 continue;
799 }
800 if (i == nextBreak && bk < nextBreak) {
801 fTestData->fActualBreaks.setCharAt(bk, 1);
802 nextBreak = bk;
803 continue;
804 }
805 MONKEY_ERROR("preceding(i)", i);
806 return;
807 }
808 checkResults("testPreceding", REVERSE, status);
809 }
810
811
812 void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) {
813 if (U_FAILURE(status)) {
814 return;
815 }
816 fTestData->clearActualBreaks();
817 fBI->setText(fTestData->fString);
818 for (int i=fTestData->fString.length(); i>=0; --i) {
819 if (fBI->isBoundary(i)) {
820 fTestData->fActualBreaks.setCharAt(i, 1);
821 }
822 }
823 checkResults("testForwards", FORWARD, status);
824 }
825
826 void RBBIMonkeyImpl::testIsBoundaryRandom(UErrorCode &status) {
827 if (U_FAILURE(status)) {
828 return;
829 }
830 fBI->setText(fTestData->fString);
831
832 int stringLen = fTestData->fString.length();
833 for (int i=stringLen; i>=0; --i) {
834 int strIdx = fRandomGenerator() % stringLen;
835 if (fTestData->fExpectedBreaks.charAt(strIdx) != fBI->isBoundary(strIdx)) {
836 IntlTest::gTest->errln("%s:%d testIsBoundaryRandom failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
837 __FILE__, __LINE__, strIdx, fRuleFileName, fTestData->fRandomSeed);
838 if (fVerbose) {
839 fTestData->dump(i);
840 }
841 status = U_INVALID_STATE_ERROR;
842 break;
843 }
844 }
845 }
846
847
848
849 void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) {
850 if (U_FAILURE(status)) {
851 return;
852 }
853 if (direction == FORWARD) {
854 for (int i=0; i<=fTestData->fString.length(); ++i) {
855 if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
856 if (i > 1) {
857 IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
858 __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-2), fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
859 fRuleFileName, fTestData->fRandomSeed);
860 } else {
861 IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
862 __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
863 fRuleFileName, fTestData->fRandomSeed);
864 }
865 if (fVerbose) {
866 fTestData->dump(i);
867 }
868 status = U_INVALID_STATE_ERROR; // Prevent the test from continuing, which would likely
869 break; // produce many redundant errors.
870 }
871 }
872 } else {
873 for (int i=fTestData->fString.length(); i>=0; i--) {
874 if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
875 if (i > 1) {
876 IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
877 __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-2), fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
878 fRuleFileName, fTestData->fRandomSeed);
879 } else {
880 IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
881 __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
882 fRuleFileName, fTestData->fRandomSeed);
883 }
884 if (fVerbose) {
885 fTestData->dump(i);
886 }
887 status = U_INVALID_STATE_ERROR;
888 break;
889 }
890 }
891 }
892 }
893
894
895
896 //---------------------------------------------------------------------------------------
897 //
898 // class RBBIMonkeyTest implementation.
899 //
900 //---------------------------------------------------------------------------------------
901 RBBIMonkeyTest::RBBIMonkeyTest() {
902 }
903
904 RBBIMonkeyTest::~RBBIMonkeyTest() {
905 }
906
907
908 // params, taken from this->fParams.
909 // rules=file_name Name of file containing the reference rules.
910 // seed=nnnnn Random number starting seed.
911 // Setting the seed allows errors to be reproduced.
912 // loop=nnn Looping count. Controls running time.
913 // -1: run forever.
914 // 0 or greater: run length.
915 // expansions debug option, show expansions of rules and sets.
916 // verbose Display details of the failure.
917 //
918 // Parameters on the intltest command line follow the test name, and are preceded by '@'.
919 // For example,
920 // intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
921 //
922 void RBBIMonkeyTest::testMonkey() {
923 // printf("Test parameters: %s\n", fParams);
924 UnicodeString params(fParams);
925 UErrorCode status = U_ZERO_ERROR;
926
927 const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "line_cj.txt", "sentence.txt", "line_normal.txt",
928 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
929 NULL };
930 CharString testNameFromParams;
931 if (getStringParam("rules", params, testNameFromParams, status)) {
932 tests[0] = testNameFromParams.data();
933 tests[1] = NULL;
934 }
935
936 int64_t loopCount = quick? 100 : 5000;
937 getIntParam("loop", params, loopCount, status);
938
939 UBool dumpExpansions = FALSE;
940 getBoolParam("expansions", params, dumpExpansions, status);
941
942 UBool verbose = FALSE;
943 getBoolParam("verbose", params, verbose, status);
944
945 int64_t seed = 0;
946 getIntParam("seed", params, seed, status);
947
948 if (params.length() != 0) {
949 // Options processing did not consume all of the parameters. Something unrecognized was present.
950 CharString unrecognizedParameters;
951 unrecognizedParameters.append(CStr(params)(), -1, status);
952 errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data());
953 return;
954 }
955
956 UVector startedTests(status);
957 if (U_FAILURE(status)) {
958 errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status));
959 return;
960 }
961
962 // Monkey testing is multi-threaded.
963 // Each set of break rules to be tested is run in a separate thread.
964 // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
965 int32_t i;
966 for (i=0; tests[i] != NULL; ++i) {
967 logln("beginning testing of %s", tests[i]);
968 LocalPointer<RBBIMonkeyImpl> test(new RBBIMonkeyImpl(status));
969 if (U_FAILURE(status)) {
970 dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
971 break;
972 }
973 test->fDumpExpansions = dumpExpansions;
974 test->fVerbose = verbose;
975 test->fRandomGenerator.seed(static_cast<uint32_t>(seed));
976 test->fLoopCount = static_cast<int32_t>(loopCount);
977 test->setup(tests[i], status);
978 if (U_FAILURE(status)) {
979 dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
980 break;
981 }
982 test->startTest();
983 startedTests.addElement(test.orphan(), status);
984 if (U_FAILURE(status)) {
985 errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
986 break;
987 }
988 }
989
990 for (i=0; i<startedTests.size(); ++i) {
991 RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i));
992 test->join();
993 delete test;
994 }
995 }
996
997
998 UBool RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status) {
999 name.append(" *= *(-?\\d+) *,? *");
1000 RegexMatcher m(name, params, 0, status);
1001 if (m.find()) {
1002 // The param exists. Convert the string to an int.
1003 CharString str;
1004 str.append(CStr(m.group(1, status))(), -1, status);
1005 val = strtol(str.data(), NULL, 10);
1006
1007 // Delete this parameter from the params string.
1008 m.reset();
1009 params = m.replaceFirst(UnicodeString(), status);
1010 return TRUE;
1011 }
1012 return FALSE;
1013 }
1014
1015 UBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status) {
1016 name.append(" *= *([^ ,]*) *,? *");
1017 RegexMatcher m(name, params, 0, status);
1018 if (m.find()) {
1019 // The param exists.
1020 dest.append(CStr(m.group(1, status))(), -1, status);
1021
1022 // Delete this parameter from the params string.
1023 m.reset();
1024 params = m.replaceFirst(UnicodeString(), status);
1025 return TRUE;
1026 }
1027 return FALSE;
1028 }
1029
1030 UBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status) {
1031 name.append("(?: *= *(true|false))? *,? *");
1032 RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status);
1033 if (m.find()) {
1034 if (m.start(1, status) > 0) {
1035 // user option included a value.
1036 dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0;
1037 } else {
1038 // No explicit user value, implies true.
1039 dest = TRUE;
1040 }
1041
1042 // Delete this parameter from the params string.
1043 m.reset();
1044 params = m.replaceFirst(UnicodeString(), status);
1045 return TRUE;
1046 }
1047 return FALSE;
1048 }
1049
1050 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */