]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/rbbimonkeytest.cpp
ICU-62135.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbimonkeytest.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * Copyright (c) 2016, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 ********************************************************************/
7
8
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
12
13 #include "rbbimonkeytest.h"
14 #include "unicode/utypes.h"
15 #include "unicode/brkiter.h"
16 #include "unicode/utf16.h"
17 #include "unicode/uniset.h"
18 #include "unicode/unistr.h"
19
20 #include "charstr.h"
21 #include "cmemory.h"
22 #include "cstr.h"
23 #include "uelement.h"
24 #include "uhash.h"
25 #include "cstring.h"
26
27 #include <iostream>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string>
31
32 using namespace icu;
33
34
35 void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) {
36 fParams = params; // Work around TESTCASE_AUTO not being able to pass params to test function.
37
38 TESTCASE_AUTO_BEGIN;
39 TESTCASE_AUTO(testMonkey);
40 TESTCASE_AUTO_END;
41 }
42
43 //---------------------------------------------------------------------------------------
44 //
45 // class BreakRule implementation.
46 //
47 //---------------------------------------------------------------------------------------
48
49 BreakRule::BreakRule() // : all field default initialized.
50 {
51 }
52
53 BreakRule::~BreakRule() {}
54
55
56 //---------------------------------------------------------------------------------------
57 //
58 // class BreakRules implementation.
59 //
60 //---------------------------------------------------------------------------------------
61 BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status) :
62 fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) {
63 fCharClasses.adoptInstead(uhash_open(uhash_hashUnicodeString,
64 uhash_compareUnicodeString,
65 NULL, // value comparator.
66 &status));
67 if (U_FAILURE(status)) {
68 return;
69 }
70 uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
71 uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
72 fBreakRules.setDeleter(uprv_deleteUObject);
73
74 fCharClassList.adoptInstead(new UVector(status));
75
76 fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
77 "(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative look behind for '{' or '=' or '[:'
78 // (the identifier is a unicode property name or value)
79 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name
80 0, status));
81
82 // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
83 fCommentsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
84 "(^|(?<=;))" // Start either at start of line, or just after a ';' (look-behind for ';')
85 "[ \\t]*+" // Match white space.
86 "(#.*)?+" // Optional # plus whatever follows
87 "\\R$" // new-line at end of line.
88 ), 0, status));
89
90 // Match (initial parse) of a character class definition line.
91 fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
92 "[ \\t]*" // leading white space
93 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name
94 "[ \\t]*=[ \\t]*" // =
95 "(?<ClassDef>.*?)" // The char class UnicodeSet expression
96 "[ \\t]*;$"), // ; <end of line>
97 0, status));
98
99 // Match (initial parse) of a break rule line.
100 fRuleDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
101 "[ \\t]*" // leading white space
102 "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)" // The rule name
103 "[ \\t]*:[ \\t]*" // :
104 "(?<RuleDef>.*?)" // The rule definition
105 "[ \\t]*;$"), // ; <end of line>
106 0, status));
107
108 }
109
110
111 BreakRules::~BreakRules() {}
112
113
114 CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
115
116 // Create the expanded definition for this char class,
117 // replacing any set references with the corresponding definition.
118
119 UnicodeString expandedDef;
120 UnicodeString emptyString;
121 fSetRefsMatcher->reset(definition);
122 while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
123 const UnicodeString name =
124 fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
125 CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
126 const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
127
128 fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status);
129 expandedDef.append(expansionForName);
130 }
131 fSetRefsMatcher->appendTail(expandedDef);
132
133 // Verify that the expanded set definition is valid.
134
135 if (fMonkeyImpl->fDumpExpansions) {
136 printf("epandedDef: %s\n", CStr(expandedDef)());
137 }
138
139 UnicodeSet *s = new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status);
140 if (U_FAILURE(status)) {
141 IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__, __LINE__,
142 u_errorName(status), CStr(name)());
143 return NULL;
144 }
145 CharClass *cclass = new CharClass(name, definition, expandedDef, s);
146 CharClass *previousClass = static_cast<CharClass *>(uhash_put(fCharClasses.getAlias(),
147 new UnicodeString(name), // Key, owned by hash table.
148 cclass, // Value, owned by hash table.
149 &status));
150
151 if (previousClass != NULL) {
152 // Duplicate class def.
153 // These are legitimate, they are adjustments of an existing class.
154 // TODO: will need to keep the old around when we handle tailorings.
155 IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)());
156 delete previousClass;
157 }
158 return cclass;
159 }
160
161
162 void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
163 LocalPointer<BreakRule> thisRule(new BreakRule);
164 thisRule->fName = name;
165 thisRule->fRule = definition;
166
167 // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes,
168 // This gives a numeric sort order that matches Unicode UAX rule numbering conventions.
169 UnicodeString emptyString;
170
171 // Expand the char class definitions within the rule.
172 fSetRefsMatcher->reset(definition);
173 while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
174 const UnicodeString name =
175 fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
176 CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
177 if (!nameClass) {
178 IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
179 __FILE__, __LINE__, CStr(name)(), CStr(definition)());
180 }
181 const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
182
183 fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status);
184 thisRule->fExpandedRule.append(expansionForName);
185 }
186 fSetRefsMatcher->appendTail(thisRule->fExpandedRule);
187
188 // Replace the divide sign (\u00f7) with a regular expression named capture.
189 // When running the rules, a match that includes this group means we found a break position.
190
191 int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7);
192 if (dividePos >= 0) {
193 thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)"));
194 }
195 if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) {
196 status = U_ILLEGAL_ARGUMENT_ERROR; // TODO: produce a good error message.
197 }
198
199 // UAX break rule set definitions can be empty, just [].
200 // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
201 // also matches nothing.
202
203 static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0};
204 int32_t where = 0;
205 while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) {
206 thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
207 }
208 if (fMonkeyImpl->fDumpExpansions) {
209 printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)());
210 }
211
212 // Compile a regular expression for this rule.
213 thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status));
214 if (U_FAILURE(status)) {
215 IntlTest::gTest->errln("%s:%d Error creating regular expression for %s",
216 __FILE__, __LINE__, CStr(thisRule->fExpandedRule)());
217 return;
218 }
219
220 // Put this new rule into the vector of all Rules.
221 fBreakRules.addElement(thisRule.orphan(), status);
222 }
223
224
225 bool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) {
226 if (keyword == UnicodeString("locale")) {
227 CharString localeName;
228 localeName.append(CStr(value)(), -1, status);
229 fLocale = Locale::createFromName(localeName.data());
230 return true;
231 }
232 if (keyword == UnicodeString("type")) {
233 if (value == UnicodeString("grapheme")) {
234 fType = UBRK_CHARACTER;
235 } else if (value == UnicodeString("word")) {
236 fType = UBRK_WORD;
237 } else if (value == UnicodeString("line")) {
238 fType = UBRK_LINE;
239 } else if (value == UnicodeString("sentence")) {
240 fType = UBRK_SENTENCE;
241 } else {
242 IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__, CStr(value)());
243 }
244 return true;
245 }
246 // TODO: add tailoring base setting here.
247 return false;
248 }
249
250 RuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) {
251 if (U_FAILURE(status)) {
252 return NULL;
253 }
254 RuleBasedBreakIterator *bi = NULL;
255 switch(fType) {
256 case UBRK_CHARACTER:
257 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createCharacterInstance(fLocale, status));
258 break;
259 case UBRK_WORD:
260 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createWordInstance(fLocale, status));
261 break;
262 case UBRK_LINE:
263 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(fLocale, status));
264 break;
265 case UBRK_SENTENCE:
266 bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createSentenceInstance(fLocale, status));
267 break;
268 default:
269 IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType);
270 status = U_ILLEGAL_ARGUMENT_ERROR;
271 }
272 return bi;
273 }
274
275
276 void BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) {
277 if (U_FAILURE(status)) {
278 return;
279 }
280
281 UnicodeString emptyString;
282 for (int32_t lineNumber=0; ;lineNumber++) { // Loop once per input line.
283 if (U_FAILURE(status)) {
284 return;
285 }
286 int32_t lineLength = 0;
287 const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status);
288 if (lineBuf == NULL) {
289 break;
290 }
291 UnicodeString line(lineBuf, lineLength);
292
293 // Strip comment lines.
294 fCommentsMatcher->reset(line);
295 line = fCommentsMatcher->replaceFirst(emptyString, status);
296 if (line.isEmpty()) {
297 continue;
298 }
299
300 // Recognize character class definition and keyword lines
301 fClassDefMatcher->reset(line);
302 if (fClassDefMatcher->matches(status)) {
303 UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status);
304 UnicodeString classDef = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status);
305 if (fMonkeyImpl->fDumpExpansions) {
306 printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)());
307 }
308 if (setKeywordParameter(className, classDef, status)) {
309 // The scanned item was "type = ..." or "locale = ...", etc.
310 // which are not actual character classes.
311 continue;
312 }
313 addCharClass(className, classDef, status);
314 continue;
315 }
316
317 // Recognize rule lines.
318 fRuleDefMatcher->reset(line);
319 if (fRuleDefMatcher->matches(status)) {
320 UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status);
321 UnicodeString ruleDef = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status);
322 if (fMonkeyImpl->fDumpExpansions) {
323 printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)());
324 }
325 addRule(ruleName, ruleDef, status);
326 continue;
327 }
328
329 IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
330 __FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)());
331 }
332
333 // Build the vector of char classes, omitting the dictionary class if there is one.
334 // This will be used when constructing the random text to be tested.
335
336 // Also compute the "other" set, consisting of any characters not included in
337 // one or more of the user defined sets.
338
339 UnicodeSet otherSet((UChar32)0, 0x10ffff);
340 int32_t pos = UHASH_FIRST;
341 const UHashElement *el = NULL;
342 while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) {
343 const UnicodeString *ccName = static_cast<const UnicodeString *>(el->key.pointer);
344 CharClass *cclass = static_cast<CharClass *>(el->value.pointer);
345 // printf(" Adding %s\n", CStr(*ccName)());
346 if (*ccName != cclass->fName) {
347 IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
348 __FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)());
349 }
350 const UnicodeSet *set = cclass->fSet.getAlias();
351 otherSet.removeAll(*set);
352 if (*ccName == UnicodeString("dictionary")) {
353 fDictionarySet = *set;
354 } else {
355 fCharClassList->addElement(cclass, status);
356 }
357 }
358
359 if (!otherSet.isEmpty()) {
360 // fprintf(stderr, "have an other set.\n");
361 UnicodeString pattern;
362 CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status);
363 fCharClassList->addElement(cclass, status);
364 }
365 }
366
367
368 const CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const {
369 int32_t localIter = 0;
370 int32_t &it = iter? *iter : localIter;
371
372 while (it < fCharClassList->size()) {
373 const CharClass *cc = static_cast<const CharClass *>(fCharClassList->elementAt(it));
374 ++it;
375 if (cc->fSet->contains(c)) {
376 return cc;
377 }
378 }
379 return NULL;
380 }
381
382 //---------------------------------------------------------------------------------------
383 //
384 // class MonkeyTestData implementation.
385 //
386 //---------------------------------------------------------------------------------------
387
388 void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) {
389 const int32_t dataLength = 1000;
390
391 // Fill the test string with random characters.
392 // First randomly pick a char class, then randomly pick a character from that class.
393 // Exclude any characters from the dictionary set.
394
395 // std::cout << "Populating Test Data" << std::endl;
396 fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages,
397 // allowing recreation of failing data.
398 fBkRules = rules;
399 fString.remove();
400 for (int32_t n=0; n<dataLength;) {
401 int charClassIndex = rand() % rules->fCharClassList->size();
402 const CharClass *cclass = static_cast<CharClass *>(rules->fCharClassList->elementAt(charClassIndex));
403 if (cclass->fSet->size() == 0) {
404 // Some rules or tailorings do end up with empty char classes.
405 continue;
406 }
407 int32_t charIndex = rand() % cclass->fSet->size();
408 UChar32 c = cclass->fSet->charAt(charIndex);
409 if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) {
410 // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
411 // Don't let random unpaired surrogates combine in the test data because they might
412 // produce an unwanted dictionary character.
413 continue;
414 }
415
416 if (!rules->fDictionarySet.contains(c)) {
417 fString.append(c);
418 ++n;
419 }
420 }
421
422 // Reset each rule matcher regex with this new string.
423 // (Although we are always using the same string object, ICU regular expressions
424 // don't like the underlying string data changing without doing a reset).
425
426 for (int32_t ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
427 BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
428 rule->fRuleMatcher->reset(fString);
429 }
430
431 // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
432 // Expected and Actual breaks are one longer than the input string; a non-zero value
433 // will indicate a boundary preceding that position.
434
435 clearActualBreaks();
436 fExpectedBreaks = fActualBreaks;
437 fRuleForPosition = fActualBreaks;
438 f2ndRuleForPos = fActualBreaks;
439
440 // Apply reference rules to find the expected breaks.
441
442 fExpectedBreaks.setCharAt(0, (UChar)1); // Force an expected break before the start of the text.
443 // ICU always reports a break there.
444 // The reference rules do not have a means to do so.
445 int32_t strIdx = 0;
446 while (strIdx < fString.length()) {
447 BreakRule *matchingRule = NULL;
448 UBool hasBreak = FALSE;
449 int32_t ruleNum = 0;
450 int32_t matchStart = 0;
451 int32_t matchEnd = 0;
452 int32_t breakGroup = 0;
453 for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
454 BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
455 rule->fRuleMatcher->reset();
456 if (rule->fRuleMatcher->lookingAt(strIdx, status)) {
457 // A candidate rule match, check further to see if we take it or continue to check other rules.
458 // Matches of zero or one codepoint count only if they also specify a break.
459 matchStart = rule->fRuleMatcher->start(status);
460 matchEnd = rule->fRuleMatcher->end(status);
461 breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status);
462 hasBreak = U_SUCCESS(status);
463 if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) {
464 status = U_ZERO_ERROR;
465 }
466 if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) {
467 matchingRule = rule;
468 break;
469 }
470 }
471 }
472 if (matchingRule == NULL) {
473 // No reference rule matched. This is an error in the rules that should never happen.
474 IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
475 __FILE__, __LINE__, strIdx);
476 dump(strIdx);
477 status = U_INVALID_FORMAT_ERROR;
478 return;
479 }
480 if (matchingRule->fRuleMatcher->group(status).length() == 0) {
481 // Zero length rule match. This is also an error in the rule expressions.
482 IntlTest::gTest->errln("%s:%d Zero length rule match.",
483 __FILE__, __LINE__);
484 status = U_INVALID_FORMAT_ERROR;
485 return;
486 }
487
488 // Record which rule matched over the length of the match.
489 for (int i = matchStart; i < matchEnd; i++) {
490 if (fRuleForPosition.charAt(i) == 0) {
491 fRuleForPosition.setCharAt(i, (UChar)ruleNum);
492 } else {
493 f2ndRuleForPos.setCharAt(i, (UChar)ruleNum);
494 }
495 }
496
497 // Break positions appear in rules as a matching named capture of zero length at the break position,
498 // the adjusted pattern contains (?<BreakPosition>)
499 if (hasBreak) {
500 int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status);
501 if (U_FAILURE(status) || breakPos < 0) {
502 // Rule specified a break, but that break wasn't part of the match, even
503 // though the rule as a whole matched.
504 // Can't happen with regular expressions derived from (equivalent to) ICU break rules.
505 // Shouldn't get here.
506 IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__);
507 status = U_INVALID_FORMAT_ERROR;
508 break;
509 }
510 fExpectedBreaks.setCharAt(breakPos, (UChar)1);
511 // printf("recording break at %d\n", breakPos);
512 // For the next iteration, pick up applying rules immediately after the break,
513 // which may differ from end of the match. The matching rule may have included
514 // context following the boundary that needs to be looked at again.
515 strIdx = matchingRule->fRuleMatcher->end(breakGroup, status);
516 } else {
517 // Original rule didn't specify a break.
518 // Continue applying rules starting on the last code point of this match.
519 strIdx = fString.moveIndex32(matchEnd, -1);
520 if (strIdx == matchStart) {
521 // Match was only one code point, no progress if we continue.
522 // Shouldn't get here, case is filtered out at top of loop.
523 CharString ruleName;
524 ruleName.appendInvariantChars(matchingRule->fName, status);
525 IntlTest::gTest->errln("%s:%d Rule %s internal error",
526 __FILE__, __LINE__, ruleName.data());
527 status = U_INVALID_FORMAT_ERROR;
528 break;
529 }
530 }
531 if (U_FAILURE(status)) {
532 IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
533 __FILE__, __LINE__, u_errorName(status));
534 break;
535 }
536 }
537 }
538
539 void MonkeyTestData::clearActualBreaks() {
540 fActualBreaks.remove();
541 // Actual Breaks length is one longer than the data string length, allowing
542 // for breaks before the first and after the last character in the data.
543 for (int32_t i=0; i<=fString.length(); i++) {
544 fActualBreaks.append((UChar)0);
545 }
546 }
547
548 void MonkeyTestData::dump(int32_t around) const {
549 printf("\n"
550 " char break Rule Character\n"
551 " pos code class R I name name\n"
552 "---------------------------------------------------------------------------------------------\n");
553
554 int32_t start;
555 int32_t end;
556
557 if (around == -1) {
558 start = 0;
559 end = fString.length();
560 } else {
561 // Display context around a failure.
562 start = fString.moveIndex32(around, -30);
563 end = fString.moveIndex32(around, +30);
564 }
565
566 for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) {
567 UErrorCode status = U_ZERO_ERROR;
568 UChar32 c = fString.char32At(charIdx);
569 const CharClass *cc = fBkRules->getClassForChar(c);
570 CharString ccName;
571 ccName.appendInvariantChars(cc->fName, status);
572 CharString ruleName, secondRuleName;
573 const BreakRule *rule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx)));
574 ruleName.appendInvariantChars(rule->fName, status);
575 if (f2ndRuleForPos.charAt(charIdx) > 0) {
576 const BreakRule *secondRule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx)));
577 secondRuleName.appendInvariantChars(secondRule->fName, status);
578 }
579 char cName[200];
580 u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
581
582 printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n",
583 charIdx, c, ccName.data(),
584 fExpectedBreaks.charAt(charIdx) ? '*' : '.',
585 fActualBreaks.charAt(charIdx) ? '*' : '.',
586 ruleName.data(), secondRuleName.data(), cName
587 );
588 }
589 }
590
591
592 //---------------------------------------------------------------------------------------
593 //
594 // class RBBIMonkeyImpl
595 //
596 //---------------------------------------------------------------------------------------
597
598 RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status) : fDumpExpansions(FALSE), fThread(this) {
599 (void)status; // suppress unused parameter compiler warning.
600 }
601
602
603 // RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the
604 // reference rules and creating the icu breakiterator to test,
605 // with its type and locale coming from the reference rules.
606
607 void RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) {
608 fRuleFileName = ruleFile;
609 openBreakRules(ruleFile, status);
610 if (U_FAILURE(status)) {
611 IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
612 return;
613 }
614 fRuleSet.adoptInstead(new BreakRules(this, status));
615 fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status);
616 if (U_FAILURE(status)) {
617 IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
618 return;
619 }
620 fBI.adoptInstead(fRuleSet->createICUBreakIterator(status));
621 fTestData.adoptInstead(new MonkeyTestData());
622 }
623
624
625 RBBIMonkeyImpl::~RBBIMonkeyImpl() {
626 }
627
628
629 void RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) {
630 CharString path;
631 path.append(IntlTest::getSourceTestData(status), status);
632 path.append("break_rules" U_FILE_SEP_STRING, status);
633 path.appendPathPart(fileName, status);
634 const char *codePage = "UTF-8";
635 fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status));
636 }
637
638
639 void RBBIMonkeyImpl::startTest() {
640 fThread.start(); // invokes runTest() in a separate thread.
641 }
642
643 void RBBIMonkeyImpl::join() {
644 fThread.join();
645 }
646
647
// Report a monkey test failure, together with the parameters needed to reproduce it.
//   msg    C string describing the failure.
//   index  string index at which the failure was seen.
// For use inside RBBIMonkeyImpl member functions with a UErrorCode named "status" in
// scope: reads fRuleFileName, fTestData and fVerbose, and sets status to
// U_INVALID_STATE_ERROR. In verbose mode the test data around index is dumped as well.
// Wrapped in do { } while (0) so that "MONKEY_ERROR(...);" is a single statement,
// which keeps it safe in an if / else without braces.
#define MONKEY_ERROR(msg, index) do { \
    IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
                    __FILE__, __LINE__, (msg), (index), fRuleFileName, fTestData->fRandomSeed); \
    if (fVerbose) { fTestData->dump(index); } \
    status = U_INVALID_STATE_ERROR; \
} while (0)
654
655 void RBBIMonkeyImpl::runTest() {
656 UErrorCode status = U_ZERO_ERROR;
657 int32_t errorCount = 0;
658 for (int64_t loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
659 status = U_ZERO_ERROR;
660 fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status);
661 if (fBI.isNull()) {
662 IntlTest::gTest->dataerrln("Unable to run test because fBI is null.");
663 return;
664 }
665 if ( uprv_strcmp(fRuleFileName,"line_loose_cj.txt") == 0 && fTestData->fRandomSeed==1712915859 ) {
666 continue; // known bug around index 103-104, break expected/actual 0/1, fwd 0020 200D | FDFC, rev 1325A 0020 | 200D
667 }
668 // fTestData->dump();
669 testForwards(status);
670 testPrevious(status);
671 testFollowing(status);
672 testPreceding(status);
673 testIsBoundary(status);
674 testIsBoundaryRandom(status);
675
676 if (fLoopCount < 0 && loopCount % 100 == 0) {
677 fprintf(stderr, ".");
678 }
679 if (U_FAILURE(status)) {
680 if (++errorCount > 10) {
681 return;
682 }
683 }
684 }
685 }
686
687 void RBBIMonkeyImpl::testForwards(UErrorCode &status) {
688 if (U_FAILURE(status)) {
689 return;
690 }
691 fTestData->clearActualBreaks();
692 fBI->setText(fTestData->fString);
693 int32_t previousBreak = -2;
694 for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) {
695 if (bk <= previousBreak) {
696 MONKEY_ERROR("Break Iterator Stall", bk);
697 return;
698 }
699 if (bk < 0 || bk > fTestData->fString.length()) {
700 MONKEY_ERROR("Boundary out of bounds", bk);
701 return;
702 }
703 fTestData->fActualBreaks.setCharAt(bk, 1);
704 }
705 checkResults("testForwards", FORWARD, status);
706 }
707
708 void RBBIMonkeyImpl::testFollowing(UErrorCode &status) {
709 if (U_FAILURE(status)) {
710 return;
711 }
712 fTestData->clearActualBreaks();
713 fBI->setText(fTestData->fString);
714 int32_t nextBreak = -1;
715 for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) {
716 int32_t bk = fBI->following(i);
717 if (bk == BreakIterator::DONE && i == fTestData->fString.length()) {
718 continue;
719 }
720 if (bk == nextBreak && bk > i) {
721 // i is in the gap between two breaks.
722 continue;
723 }
724 if (i == nextBreak && bk > nextBreak) {
725 fTestData->fActualBreaks.setCharAt(bk, 1);
726 nextBreak = bk;
727 continue;
728 }
729 MONKEY_ERROR("following(i)", i);
730 return;
731 }
732 checkResults("testFollowing", FORWARD, status);
733 }
734
735
736
737 void RBBIMonkeyImpl::testPrevious(UErrorCode &status) {
738 if (U_FAILURE(status)) {return;}
739
740 fTestData->clearActualBreaks();
741 fBI->setText(fTestData->fString);
742 int32_t previousBreak = INT32_MAX;
743 for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) {
744 if (bk >= previousBreak) {
745 MONKEY_ERROR("Break Iterator Stall", bk);
746 return;
747 }
748 if (bk < 0 || bk > fTestData->fString.length()) {
749 MONKEY_ERROR("Boundary out of bounds", bk);
750 return;
751 }
752 fTestData->fActualBreaks.setCharAt(bk, 1);
753 }
754 checkResults("testPrevious", REVERSE, status);
755 }
756
757
758 void RBBIMonkeyImpl::testPreceding(UErrorCode &status) {
759 if (U_FAILURE(status)) {
760 return;
761 }
762 fTestData->clearActualBreaks();
763 fBI->setText(fTestData->fString);
764 int32_t nextBreak = fTestData->fString.length()+1;
765 for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) {
766 int32_t bk = fBI->preceding(i);
767 // printf("i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak);
768 if (bk == BreakIterator::DONE && i == 0) {
769 continue;
770 }
771 if (bk == nextBreak && bk < i) {
772 // i is in the gap between two breaks.
773 continue;
774 }
775 if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) {
776 // i indexes to a trailing surrogate.
777 // Break Iterators treat an index to either half as referring to the supplemental code point,
778 // with preceding going to some preceding code point.
779 if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) {
780 MONKEY_ERROR("preceding of trailing surrogate error", i);
781 }
782 continue;
783 }
784 if (i == nextBreak && bk < nextBreak) {
785 fTestData->fActualBreaks.setCharAt(bk, 1);
786 nextBreak = bk;
787 continue;
788 }
789 MONKEY_ERROR("preceding(i)", i);
790 return;
791 }
792 checkResults("testPreceding", REVERSE, status);
793 }
794
795
796 void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) {
797 if (U_FAILURE(status)) {
798 return;
799 }
800 fTestData->clearActualBreaks();
801 fBI->setText(fTestData->fString);
802 for (int i=fTestData->fString.length(); i>=0; --i) {
803 if (fBI->isBoundary(i)) {
804 fTestData->fActualBreaks.setCharAt(i, 1);
805 }
806 }
807 checkResults("testForwards", FORWARD, status);
808 }
809
810 void RBBIMonkeyImpl::testIsBoundaryRandom(UErrorCode &status) {
811 if (U_FAILURE(status)) {
812 return;
813 }
814 fBI->setText(fTestData->fString);
815
816 int stringLen = fTestData->fString.length();
817 for (int i=stringLen; i>=0; --i) {
818 int strIdx = fRandomGenerator() % stringLen;
819 if (fTestData->fExpectedBreaks.charAt(strIdx) != fBI->isBoundary(strIdx)) {
820 IntlTest::gTest->errln("%s:%d testIsBoundaryRandom failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
821 __FILE__, __LINE__, strIdx, fRuleFileName, fTestData->fRandomSeed);
822 if (fVerbose) {
823 fTestData->dump(i);
824 }
825 status = U_INVALID_STATE_ERROR;
826 break;
827 }
828 }
829 }
830
831
832
833 void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) {
834 if (U_FAILURE(status)) {
835 return;
836 }
837 if (direction == FORWARD) {
838 for (int i=0; i<=fTestData->fString.length(); ++i) {
839 if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
840 if (i > 1) {
841 IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
842 __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-2), fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
843 fRuleFileName, fTestData->fRandomSeed);
844 } else {
845 IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
846 __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
847 fRuleFileName, fTestData->fRandomSeed);
848 }
849 if (fVerbose) {
850 fTestData->dump(i);
851 }
852 status = U_INVALID_STATE_ERROR; // Prevent the test from continuing, which would likely
853 break; // produce many redundant errors.
854 }
855 }
856 } else {
857 for (int i=fTestData->fString.length(); i>=0; i--) {
858 if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
859 if (i > 1) {
860 IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
861 __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-2), fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
862 fRuleFileName, fTestData->fRandomSeed);
863 } else {
864 IntlTest::gTest->errln("%s:%d %s failure at index %d, %04X | %04X, break expected/actual %d/%d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
865 __FILE__, __LINE__, msg, i, fTestData->fString.char32At(i-1), fTestData->fString.char32At(i), fTestData->fExpectedBreaks.charAt(i), fTestData->fActualBreaks.charAt(i),
866 fRuleFileName, fTestData->fRandomSeed);
867 }
868 if (fVerbose) {
869 fTestData->dump(i);
870 }
871 status = U_INVALID_STATE_ERROR;
872 break;
873 }
874 }
875 }
876 }
877
878
879
880 //---------------------------------------------------------------------------------------
881 //
882 // class RBBIMonkeyTest implementation.
883 //
884 //---------------------------------------------------------------------------------------
885 RBBIMonkeyTest::RBBIMonkeyTest() {
886 }
887
888 RBBIMonkeyTest::~RBBIMonkeyTest() {
889 }
890
891
892 // params, taken from this->fParams.
893 // rules=file_name Name of file containing the reference rules.
894 // seed=nnnnn Random number starting seed.
895 // Setting the seed allows errors to be reproduced.
896 // loop=nnn Looping count. Controls running time.
897 // -1: run forever.
898 // 0 or greater: run length.
899 // expansions debug option, show expansions of rules and sets.
900 // verbose Display details of the failure.
901 //
902 // Parameters on the intltest command line follow the test name, and are preceded by '@'.
903 // For example,
904 // intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
905 //
906 void RBBIMonkeyTest::testMonkey() {
907 // printf("Test parameters: %s\n", fParams);
908 UnicodeString params(fParams);
909 UErrorCode status = U_ZERO_ERROR;
910
911 const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
912 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
913 NULL };
914 CharString testNameFromParams;
915 if (getStringParam("rules", params, testNameFromParams, status)) {
916 tests[0] = testNameFromParams.data();
917 tests[1] = NULL;
918 }
919
920 int64_t loopCount = quick? 100 : 5000;
921 getIntParam("loop", params, loopCount, status);
922
923 UBool dumpExpansions = FALSE;
924 getBoolParam("expansions", params, dumpExpansions, status);
925
926 UBool verbose = FALSE;
927 getBoolParam("verbose", params, verbose, status);
928
929 int64_t seed = 0;
930 getIntParam("seed", params, seed, status);
931
932 if (params.length() != 0) {
933 // Options processing did not consume all of the parameters. Something unrecognized was present.
934 CharString unrecognizedParameters;
935 unrecognizedParameters.append(CStr(params)(), -1, status);
936 errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data());
937 return;
938 }
939
940 UVector startedTests(status);
941 if (U_FAILURE(status)) {
942 errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status));
943 return;
944 }
945
946 // Monkey testing is multi-threaded.
947 // Each set of break rules to be tested is run in a separate thread.
948 // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
949 int32_t i;
950 for (i=0; tests[i] != NULL; ++i) {
951 logln("beginning testing of %s", tests[i]);
952 LocalPointer<RBBIMonkeyImpl> test(new RBBIMonkeyImpl(status));
953 if (U_FAILURE(status)) {
954 dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
955 break;
956 }
957 test->fDumpExpansions = dumpExpansions;
958 test->fVerbose = verbose;
959 test->fRandomGenerator.seed((uint32_t)seed);
960 test->fLoopCount = loopCount;
961 test->setup(tests[i], status);
962 if (U_FAILURE(status)) {
963 dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
964 break;
965 }
966 test->startTest();
967 startedTests.addElement(test.orphan(), status);
968 if (U_FAILURE(status)) {
969 errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
970 break;
971 }
972 }
973
974 for (i=0; i<startedTests.size(); ++i) {
975 RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i));
976 test->join();
977 delete test;
978 }
979 }
980
981
982 UBool RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status) {
983 name.append(" *= *(-?\\d+) *,? *");
984 RegexMatcher m(name, params, 0, status);
985 if (m.find()) {
986 // The param exists. Convert the string to an int.
987 CharString str;
988 str.append(CStr(m.group(1, status))(), -1, status);
989 val = strtol(str.data(), NULL, 10);
990
991 // Delete this parameter from the params string.
992 m.reset();
993 params = m.replaceFirst(UnicodeString(), status);
994 return TRUE;
995 }
996 return FALSE;
997 }
998
999 UBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status) {
1000 name.append(" *= *([^ ,]*) *,? *");
1001 RegexMatcher m(name, params, 0, status);
1002 if (m.find()) {
1003 // The param exists.
1004 dest.append(CStr(m.group(1, status))(), -1, status);
1005
1006 // Delete this parameter from the params string.
1007 m.reset();
1008 params = m.replaceFirst(UnicodeString(), status);
1009 return TRUE;
1010 }
1011 return FALSE;
1012 }
1013
1014 UBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status) {
1015 name.append("(?: *= *(true|false))? *,? *");
1016 RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status);
1017 if (m.find()) {
1018 if (m.start(1, status) > 0) {
1019 // user option included a value.
1020 dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0;
1021 } else {
1022 // No explicit user value, implies true.
1023 dest = TRUE;
1024 }
1025
1026 // Delete this parameter from the params string.
1027 m.reset();
1028 params = m.replaceFirst(UnicodeString(), status);
1029 return TRUE;
1030 }
1031 return FALSE;
1032 }
1033
1034 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */