]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/rbbitst.cpp
ICU-491.11.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
CommitLineData
73c04bcf
A
1/********************************************************************
2 * COPYRIGHT:
4388f060 3 * Copyright (c) 1999-2012, International Business Machines Corporation and
73c04bcf
A
4 * others. All Rights Reserved.
5 ********************************************************************/
6/************************************************************************
7* Date Name Description
8* 12/15/99 Madhu Creation.
9* 01/12/2000 Madhu Updated for changed API and added new tests
10************************************************************************/
11
729e4ab9
A
12#include <typeinfo> // for 'typeid' to work
13
73c04bcf
A
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/utypes.h"
19#include "unicode/brkiter.h"
20#include "unicode/rbbi.h"
21#include "unicode/uchar.h"
22#include "unicode/utf16.h"
23#include "unicode/ucnv.h"
24#include "unicode/schriter.h"
25#include "unicode/uniset.h"
4388f060
A
26#if !UCONFIG_NO_REGULAR_EXPRESSIONS
27#include "unicode/regex.h"
28#endif
73c04bcf
A
29#include "unicode/ustring.h"
30#include "unicode/utext.h"
31#include "intltest.h"
32#include "rbbitst.h"
33#include <string.h>
34#include "uvector.h"
35#include "uvectr32.h"
36#include "triedict.h"
37#include <string.h>
38#include <stdio.h>
39#include <stdlib.h>
40
41#define TEST_ASSERT(x) {if (!(x)) { \
42 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
43
46f4442e 44#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
729e4ab9 45 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
73c04bcf
A
46
47
46f4442e
A
48//---------------------------------------------
49// runIndexedTest
50//---------------------------------------------
51
4388f060
A
52
// Note:  Before adding new tests to this file, check whether the desired test data can
//        simply be added to the file testdata/rbbitest.txt.  In most cases it can;
//        it's much less work than writing a new test, diagnostic output in the event of failures
//        is good, and the test data file is shared with ICU4J, so eventually the test
//        will run there as well, without additional effort.
58
46f4442e
A
59void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
60{
61 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
62
63 switch (index) {
729e4ab9 64#if !UCONFIG_NO_FILE_IO
46f4442e
A
65 case 0: name = "TestBug4153072";
66 if(exec) TestBug4153072(); break;
729e4ab9
A
67#else
68 case 0: name = "skip";
69 break;
70#endif
71
4388f060
A
72 case 1: name = "skip";
73 break;
46f4442e
A
74 case 2: name = "TestStatusReturn";
75 if(exec) TestStatusReturn(); break;
729e4ab9
A
76
77#if !UCONFIG_NO_FILE_IO
46f4442e
A
78 case 3: name = "TestUnicodeFiles";
79 if(exec) TestUnicodeFiles(); break;
80 case 4: name = "TestEmptyString";
81 if(exec) TestEmptyString(); break;
729e4ab9
A
82#else
83 case 3: case 4: name = "skip";
84 break;
85#endif
46f4442e
A
86
87 case 5: name = "TestGetAvailableLocales";
88 if(exec) TestGetAvailableLocales(); break;
89
90 case 6: name = "TestGetDisplayName";
91 if(exec) TestGetDisplayName(); break;
92
729e4ab9 93#if !UCONFIG_NO_FILE_IO
46f4442e
A
94 case 7: name = "TestEndBehaviour";
95 if(exec) TestEndBehaviour(); break;
4388f060
A
96 case 8: case 9: case 10: name = "skip";
97 break;
46f4442e
A
98 case 11: name = "TestWordBreaks";
99 if(exec) TestWordBreaks(); break;
100 case 12: name = "TestWordBoundary";
101 if(exec) TestWordBoundary(); break;
102 case 13: name = "TestLineBreaks";
103 if(exec) TestLineBreaks(); break;
104 case 14: name = "TestSentBreaks";
105 if(exec) TestSentBreaks(); break;
106 case 15: name = "TestExtended";
107 if(exec) TestExtended(); break;
729e4ab9
A
108#else
109 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
110 break;
111#endif
112
4388f060
A
113#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
114 case 16: name = "TestMonkey";
115 if(exec) TestMonkey(params); break;
116#else
729e4ab9 117 case 16:
4388f060
A
118 name = "skip"; break;
119#endif
729e4ab9
A
120
121#if !UCONFIG_NO_FILE_IO
46f4442e
A
122 case 17: name = "TestBug3818";
123 if(exec) TestBug3818(); break;
729e4ab9 124#else
4388f060 125 case 17: name = "skip";
729e4ab9
A
126 break;
127#endif
128
4388f060
A
129 case 18: name = "skip";
130 break;
46f4442e
A
131 case 19: name = "TestDebug";
132 if(exec) TestDebug(); break;
133 case 20: name = "TestTrieDict";
134 if(exec) TestTrieDict(); break;
729e4ab9
A
135
136#if !UCONFIG_NO_FILE_IO
46f4442e 137 case 21: name = "TestBug5775";
729e4ab9 138 if (exec) TestBug5775(); break;
729e4ab9 139#else
4388f060 140 case 21: name = "skip";
729e4ab9
A
141 break;
142#endif
4388f060
A
143
144 case 22: name = "skip";
145 break;
146 case 23: name = "TestDictRules";
729e4ab9 147 if (exec) TestDictRules(); break;
4388f060 148 case 24: name = "TestBug5532";
729e4ab9 149 if (exec) TestBug5532(); break;
46f4442e
A
150 default: name = ""; break; //needed to end loop
151 }
152}
153
154
73c04bcf
A
155//---------------------------------------------------------------------------
156//
157// class BITestData Holds a set of Break iterator test data and results
158// Includes
159// - the string data to be broken
160// - a vector of the expected break positions.
161// - a vector of source line numbers for the data,
162// (to help see where errors occured.)
163// - The expected break tag values.
164// - Vectors of actual break positions and tag values.
165// - Functions for comparing actual with expected and
166// reporting errors.
167//
168//----------------------------------------------------------------------------
169class BITestData {
170public:
171 UnicodeString fDataToBreak;
172 UVector fExpectedBreakPositions;
173 UVector fExpectedTags;
174 UVector fLineNum;
175 UVector fActualBreakPositions; // Test Results.
176 UVector fActualTags;
177
178 BITestData(UErrorCode &status);
179 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
180 void checkResults(const char *heading, RBBITest *test);
181 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
182 void clearResults();
183};
184
185//
186// Constructor.
187//
188BITestData::BITestData(UErrorCode &status)
189: fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
190 fActualTags(status)
191{
192}
193
194//
195// addDataChunk. Add a section (non-breaking) piece if data to the test data.
196// The macro form collects the line number, which is helpful
197// when tracking down failures.
198//
199// A null data item is inserted at the start of each test's data
200// to put the starting zero into the data list. The position saved for
201// each non-null item is its ending position.
202//
203#define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
204void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
205 if (U_FAILURE(status)) {return;}
206 if (data != NULL) {
207 fDataToBreak.append(CharsToUnicodeString(data));
208 }
209 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
210 fExpectedTags.addElement(tag, status);
211 fLineNum.addElement(lineNum, status);
212}
213
214
215//
216// checkResults. Compare the actual and expected break positions, report any differences.
217//
218void BITestData::checkResults(const char *heading, RBBITest *test) {
219 int32_t expectedIndex = 0;
220 int32_t actualIndex = 0;
221
222 for (;;) {
223 // If we've run through both the expected and actual results vectors, we're done.
224 // break out of the loop.
225 if (expectedIndex >= fExpectedBreakPositions.size() &&
226 actualIndex >= fActualBreakPositions.size()) {
227 break;
228 }
229
230
231 if (expectedIndex >= fExpectedBreakPositions.size()) {
232 err(heading, test, expectedIndex-1, actualIndex);
233 actualIndex++;
234 continue;
235 }
236
237 if (actualIndex >= fActualBreakPositions.size()) {
238 err(heading, test, expectedIndex, actualIndex-1);
239 expectedIndex++;
240 continue;
241 }
242
243 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
244 err(heading, test, expectedIndex, actualIndex);
245 // Try to resync the positions of the indices, to avoid a rash of spurious erros.
246 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
247 actualIndex++;
248 } else {
249 expectedIndex++;
250 }
251 continue;
252 }
253
254 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
255 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
256 heading, fLineNum.elementAt(expectedIndex),
257 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
258 }
259
260 actualIndex++;
261 expectedIndex++;
262 }
263}
264
265//
266// err - An error was found. Report it, along with information about where the
267// incorrectly broken test data appeared in the source file.
268//
269void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
270{
271 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
272 int32_t actual = fActualBreakPositions.elementAti(actualIdx);
273 int32_t o = 0;
274 int32_t line = fLineNum.elementAti(expectedIdx);
275 if (expectedIdx > 0) {
276 // The line numbers are off by one because a premature break occurs somewhere
277 // within the previous item, rather than at the start of the current (expected) item.
278 // We want to report the offset of the unexpected break from the start of
279 // this previous item.
280 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
281 }
282 if (actual < expected) {
46f4442e 283 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);
73c04bcf 284 } else {
46f4442e 285 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);
73c04bcf
A
286 }
287}
288
289
290void BITestData::clearResults() {
291 fActualBreakPositions.removeAllElements();
292 fActualTags.removeAllElements();
293}
294
295
73c04bcf
A
296//--------------------------------------------------------------------------------------
297//
298// RBBITest constructor and destructor
299//
300//--------------------------------------------------------------------------------------
301
302RBBITest::RBBITest() {
73c04bcf
A
303}
304
305
306RBBITest::~RBBITest() {
73c04bcf
A
307}
308
73c04bcf
A
309//-----------------------------------------------------------------------------------
310//
311// Test for status {tag} return value from break rules.
312// TODO: a more thorough test.
313//
314//-----------------------------------------------------------------------------------
315void RBBITest::TestStatusReturn() {
46f4442e 316 UnicodeString rulesString1("$Letters = [:L:];\n"
73c04bcf
A
317 "$Numbers = [:N:];\n"
318 "$Letters+{1};\n"
319 "$Numbers+{2};\n"
320 "Help\\ {4}/me\\!;\n"
321 "[^$Letters $Numbers];\n"
46f4442e 322 "!.*;\n", -1, US_INV);
73c04bcf
A
323 UnicodeString testString1 = "abc123..abc Help me Help me!";
324 // 01234567890123456789012345678
325 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
326 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
327
328 UErrorCode status=U_ZERO_ERROR;
329 UParseError parseError;
330
331 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
332 if(U_FAILURE(status)) {
729e4ab9 333 dataerrln("FAIL : in construction - %s", u_errorName(status));
73c04bcf
A
334 } else {
335 int32_t pos;
336 int32_t i = 0;
337 bi->setText(testString1);
338 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
339 if (pos != bounds1[i]) {
340 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);
341 break;
342 }
343
344 int tag = bi->getRuleStatus();
345 if (tag != brkStatus[i]) {
346 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
347 break;
348 }
349 i++;
350 }
351 }
352 delete bi;
353}
354
355
356static void printStringBreaks(UnicodeString ustr, int expected[],
357 int expectedcount)
358{
359 UErrorCode status = U_ZERO_ERROR;
360 char name[100];
361 printf("code alpha extend alphanum type word sent line name\n");
362 int j;
363 for (j = 0; j < ustr.length(); j ++) {
364 if (expectedcount > 0) {
365 int k;
366 for (k = 0; k < expectedcount; k ++) {
367 if (j == expected[k]) {
368 printf("------------------------------------------------ %d\n",
369 j);
370 }
371 }
372 }
373 UChar32 c = ustr.char32At(j);
374 if (c > 0xffff) {
375 j ++;
376 }
377 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
378 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
379 u_isUAlphabetic(c),
380 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
381 u_isalnum(c),
382 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
383 u_charType(c),
384 U_SHORT_PROPERTY_NAME),
385 u_getPropertyValueName(UCHAR_WORD_BREAK,
386 u_getIntPropertyValue(c,
387 UCHAR_WORD_BREAK),
388 U_SHORT_PROPERTY_NAME),
389 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
390 u_getIntPropertyValue(c,
391 UCHAR_SENTENCE_BREAK),
392 U_SHORT_PROPERTY_NAME),
393 u_getPropertyValueName(UCHAR_LINE_BREAK,
394 u_getIntPropertyValue(c,
395 UCHAR_LINE_BREAK),
396 U_SHORT_PROPERTY_NAME),
397 name);
398 }
399}
400
73c04bcf
A
401
402void RBBITest::TestBug3818() {
403 UErrorCode status = U_ZERO_ERROR;
404
405 // Four Thai words...
406 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
407 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
408 UnicodeString thaiStr(thaiWordData);
409
410 RuleBasedBreakIterator* bi =
411 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
412 if (U_FAILURE(status) || bi == NULL) {
729e4ab9 413 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
73c04bcf
A
414 return;
415 }
416 bi->setText(thaiStr);
417
418 int32_t startOfSecondWord = bi->following(1);
419 if (startOfSecondWord != 4) {
420 errln("Fail at file %s, line %d expected start of word at 4, got %d",
421 __FILE__, __LINE__, startOfSecondWord);
422 }
423 startOfSecondWord = bi->following(0);
424 if (startOfSecondWord != 4) {
425 errln("Fail at file %s, line %d expected start of word at 4, got %d",
426 __FILE__, __LINE__, startOfSecondWord);
427 }
428 delete bi;
429}
430
431
73c04bcf
A
432void RBBITest::TestTrieDict() {
433 UErrorCode status = U_ZERO_ERROR;
434
435 //
436 // Open and read the test data file.
437 //
438 const char *testDataDirectory = IntlTest::getSourceTestData(status);
439 char testFileName[1000];
440 if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
441 errln("Can't open test data. Path too long.");
442 return;
443 }
444 strcpy(testFileName, testDataDirectory);
445 strcat(testFileName, "riwords.txt");
446
447 // Items needing deleting at the end
448 MutableTrieDictionary *mutableDict = NULL;
449 CompactTrieDictionary *compactDict = NULL;
450 UnicodeSet *breaks = NULL;
451 UChar *testFile = NULL;
46f4442e
A
452 StringEnumeration *enumer1 = NULL;
453 StringEnumeration *enumer2 = NULL;
73c04bcf
A
454 MutableTrieDictionary *mutable2 = NULL;
455 StringEnumeration *cloneEnum = NULL;
456 CompactTrieDictionary *compact2 = NULL;
457
46f4442e 458
73c04bcf
A
459 const UnicodeString *originalWord = NULL;
460 const UnicodeString *cloneWord = NULL;
461 UChar *current;
462 UChar *word;
463 UChar uc;
464 int32_t wordLen;
465 int32_t wordCount;
466 int32_t testCount;
46f4442e 467
73c04bcf 468 int len;
46f4442e 469 testFile = ReadAndConvertFile(testFileName, len, NULL, status);
73c04bcf
A
470 if (U_FAILURE(status)) {
471 goto cleanup; /* something went wrong, error already output */
472 }
473
474 mutableDict = new MutableTrieDictionary(0x0E1C, status);
475 if (U_FAILURE(status)) {
476 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
477 goto cleanup;
478 }
46f4442e 479
73c04bcf
A
480 breaks = new UnicodeSet;
481 breaks->add(0x000A); // Line Feed
482 breaks->add(0x000D); // Carriage Return
483 breaks->add(0x2028); // Line Separator
484 breaks->add(0x2029); // Paragraph Separator
485
486 // Now add each non-comment line of the file as a word.
487 current = testFile;
488 word = current;
489 uc = *current++;
490 wordLen = 0;
491 wordCount = 0;
46f4442e 492
73c04bcf
A
493 while (uc) {
494 if (uc == 0x0023) { // #comment line, skip
495 while (uc && !breaks->contains(uc)) {
496 uc = *current++;
497 }
498 }
499 else while (uc && !breaks->contains(uc)) {
500 ++wordLen;
501 uc = *current++;
502 }
503 if (wordLen > 0) {
504 mutableDict->addWord(word, wordLen, status);
505 if (U_FAILURE(status)) {
506 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
507 goto cleanup;
508 }
509 wordCount += 1;
510 }
46f4442e 511
73c04bcf
A
512 // Find beginning of next line
513 while (uc && breaks->contains(uc)) {
514 uc = *current++;
515 }
516 word = current-1;
517 wordLen = 0;
518 }
46f4442e 519
73c04bcf
A
520 if (wordCount < 50) {
521 errln("Word count (%d) unreasonably small\n", wordCount);
522 goto cleanup;
523 }
524
46f4442e 525 enumer1 = mutableDict->openWords(status);
73c04bcf
A
526 if (U_FAILURE(status)) {
527 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
528 goto cleanup;
529 }
530
531 testCount = 0;
46f4442e 532 if (wordCount != (testCount = enumer1->count(status))) {
73c04bcf
A
533 errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
534 testCount, wordCount, u_errorName(status));
535 goto cleanup;
536 }
46f4442e 537
73c04bcf
A
538 // Now compact it
539 compactDict = new CompactTrieDictionary(*mutableDict, status);
540 if (U_FAILURE(status)) {
541 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
542 goto cleanup;
543 }
46f4442e
A
544
545 enumer2 = compactDict->openWords(status);
73c04bcf
A
546 if (U_FAILURE(status)) {
547 errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
548 goto cleanup;
549 }
46f4442e
A
550
551 if (wordCount != (testCount = enumer2->count(status))) {
73c04bcf
A
552 errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
553 testCount, wordCount, u_errorName(status));
554 goto cleanup;
555 }
46f4442e 556
729e4ab9
A
557 if (typeid(*enumer1) == typeid(*enumer2)) {
558 errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same");
46f4442e
A
559 }
560 delete enumer1;
561 enumer1 = NULL;
562 delete enumer2;
563 enumer2 = NULL;
564
73c04bcf
A
565 // Now un-compact it
566 mutable2 = compactDict->cloneMutable(status);
567 if (U_FAILURE(status)) {
568 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
569 goto cleanup;
570 }
46f4442e 571
73c04bcf
A
572 cloneEnum = mutable2->openWords(status);
573 if (U_FAILURE(status)) {
574 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
575 goto cleanup;
576 }
46f4442e 577
73c04bcf
A
578 if (wordCount != (testCount = cloneEnum->count(status))) {
579 errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
580 testCount, wordCount, u_errorName(status));
581 goto cleanup;
582 }
46f4442e 583
73c04bcf
A
584 // Compact original dictionary to clone. Note that we can only compare the same kind of
585 // dictionary as the order of the enumerators is not guaranteed to be the same between
586 // different kinds
46f4442e 587 enumer1 = mutableDict->openWords(status);
73c04bcf
A
588 if (U_FAILURE(status)) {
589 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
590 goto cleanup;
591 }
46f4442e
A
592
593 originalWord = enumer1->snext(status);
73c04bcf
A
594 cloneWord = cloneEnum->snext(status);
595 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
596 if (*originalWord != *cloneWord) {
597 errln("Original and cloned MutableTrieDictionary word mismatch\n");
598 goto cleanup;
599 }
46f4442e 600 originalWord = enumer1->snext(status);
73c04bcf
A
601 cloneWord = cloneEnum->snext(status);
602 }
46f4442e 603
73c04bcf
A
604 if (U_FAILURE(status)) {
605 errln("Enumeration failed: %s\n", u_errorName(status));
606 goto cleanup;
607 }
46f4442e 608
73c04bcf
A
609 if (originalWord != cloneWord) {
610 errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
611 goto cleanup;
612 }
613
614 // Test the data copying constructor for CompactTrieDict, and the data access APIs.
615 compact2 = new CompactTrieDictionary(compactDict->data(), status);
616 if (U_FAILURE(status)) {
617 errln("CompactTrieDictionary(const void *,...) failed\n");
618 goto cleanup;
619 }
46f4442e 620
73c04bcf
A
621 if (compact2->dataSize() == 0) {
622 errln("CompactTrieDictionary->dataSize() == 0\n");
623 goto cleanup;
624 }
46f4442e 625
73c04bcf 626 // Now count the words via the second dictionary
46f4442e
A
627 delete enumer1;
628 enumer1 = compact2->openWords(status);
73c04bcf
A
629 if (U_FAILURE(status)) {
630 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
631 goto cleanup;
632 }
46f4442e
A
633
634 if (wordCount != (testCount = enumer1->count(status))) {
73c04bcf
A
635 errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
636 testCount, wordCount, u_errorName(status));
637 goto cleanup;
638 }
46f4442e 639
73c04bcf
A
640cleanup:
641 delete compactDict;
642 delete mutableDict;
643 delete breaks;
644 delete[] testFile;
46f4442e 645 delete enumer1;
73c04bcf
A
646 delete mutable2;
647 delete cloneEnum;
648 delete compact2;
649}
650
73c04bcf
A
651
652//----------------------------------------------------------------------------
653//
654// generalIteratorTest Given a break iterator and a set of test data,
655// Run the tests and report the results.
656//
657//----------------------------------------------------------------------------
658void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
659{
660
661 bi.setText(td.fDataToBreak);
662
663 testFirstAndNext(bi, td);
664
665 testLastAndPrevious(bi, td);
666
667 testFollowing(bi, td);
668 testPreceding(bi, td);
669 testIsBoundary(bi, td);
670 doMultipleSelectionTest(bi, td);
671}
672
673
674//
675// testFirstAndNext. Run the iterator forwards in the obvious first(), next()
676// kind of loop.
677//
678void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
679{
680 UErrorCode status = U_ZERO_ERROR;
681 int32_t p;
682 int32_t lastP = -1;
683 int32_t tag;
684
685 logln("Test first and next");
686 bi.setText(td.fDataToBreak);
687 td.clearResults();
688
689 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
690 td.fActualBreakPositions.addElement(p, status); // Save result.
691 tag = bi.getRuleStatus();
692 td.fActualTags.addElement(tag, status);
693 if (p <= lastP) {
694 // If the iterator is not making forward progress, stop.
695 // No need to raise an error here, it'll be detected in the normal check of results.
696 break;
697 }
698 lastP = p;
699 }
700 td.checkResults("testFirstAndNext", this);
701}
702
703
704//
705// TestLastAndPrevious. Run the iterator backwards, starting with last().
706//
707void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
708{
709 UErrorCode status = U_ZERO_ERROR;
710 int32_t p;
711 int32_t lastP = 0x7ffffffe;
712 int32_t tag;
713
46f4442e 714 logln("Test last and previous");
73c04bcf
A
715 bi.setText(td.fDataToBreak);
716 td.clearResults();
717
718 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
719 // Save break position. Insert it at start of vector of results, shoving
720 // already-saved results further towards the end.
721 td.fActualBreakPositions.insertElementAt(p, 0, status);
722 // bi.previous(); // TODO: Why does this fix things up????
723 // bi.next();
724 tag = bi.getRuleStatus();
725 td.fActualTags.insertElementAt(tag, 0, status);
726 if (p >= lastP) {
727 // If the iterator is not making progress, stop.
728 // No need to raise an error here, it'll be detected in the normal check of results.
729 break;
730 }
731 lastP = p;
732 }
733 td.checkResults("testLastAndPrevious", this);
734}
735
736
737void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
738{
739 UErrorCode status = U_ZERO_ERROR;
740 int32_t p;
741 int32_t tag;
742 int32_t lastP = -2; // A value that will never be returned as a break position.
743 // cannot be -1; that is returned for DONE.
744 int i;
745
746 logln("testFollowing():");
747 bi.setText(td.fDataToBreak);
748 td.clearResults();
749
750 // Save the starting point, since we won't get that out of following.
751 p = bi.first();
752 td.fActualBreakPositions.addElement(p, status); // Save result.
753 tag = bi.getRuleStatus();
754 td.fActualTags.addElement(tag, status);
755
756 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
757 p = bi.following(i);
758 if (p != lastP) {
759 if (p == RuleBasedBreakIterator::DONE) {
760 break;
761 }
762 // We've reached a new break position. Save it.
763 td.fActualBreakPositions.addElement(p, status); // Save result.
764 tag = bi.getRuleStatus();
765 td.fActualTags.addElement(tag, status);
766 lastP = p;
767 }
768 }
769 // The loop normally exits by means of the break in the middle.
770 // Make sure that the index was at the correct position for the break iterator to have
771 // returned DONE.
772 if (i != td.fDataToBreak.length()) {
773 errln("testFollowing(): iterator returned DONE prematurely.");
774 }
775
776 // Full check of all results.
777 td.checkResults("testFollowing", this);
778}
779
780
781
782void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
783 UErrorCode status = U_ZERO_ERROR;
784 int32_t p;
785 int32_t tag;
786 int32_t lastP = 0x7ffffffe;
787 int i;
788
789 logln("testPreceding():");
790 bi.setText(td.fDataToBreak);
791 td.clearResults();
792
793 p = bi.last();
794 td.fActualBreakPositions.addElement(p, status);
795 tag = bi.getRuleStatus();
796 td.fActualTags.addElement(tag, status);
797
798 for (i = td.fDataToBreak.length(); i>=-1; i--) {
799 p = bi.preceding(i);
800 if (p != lastP) {
801 if (p == RuleBasedBreakIterator::DONE) {
802 break;
803 }
804 // We've reached a new break position. Save it.
805 td.fActualBreakPositions.insertElementAt(p, 0, status);
806 lastP = p;
807 tag = bi.getRuleStatus();
808 td.fActualTags.insertElementAt(tag, 0, status);
809 }
810 }
811 // The loop normally exits by means of the break in the middle.
812 // Make sure that the index was at the correct position for the break iterator to have
813 // returned DONE.
814 if (i != 0) {
815 errln("testPreceding(): iterator returned DONE prematurely.");
816 }
817
818 // Full check of all results.
819 td.checkResults("testPreceding", this);
820}
821
822
823
824void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
825 UErrorCode status = U_ZERO_ERROR;
826 int i;
827 int32_t tag;
828
829 logln("testIsBoundary():");
830 bi.setText(td.fDataToBreak);
831 td.clearResults();
832
833 for (i = 0; i <= td.fDataToBreak.length(); i++) {
834 if (bi.isBoundary(i)) {
835 td.fActualBreakPositions.addElement(i, status); // Save result.
836 tag = bi.getRuleStatus();
837 td.fActualTags.addElement(tag, status);
838 }
839 }
840 td.checkResults("testIsBoundary: ", this);
841}
842
843
844
845void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
846{
847 iterator.setText(td.fDataToBreak);
848
849 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
850 int32_t offset = iterator.first();
851 int32_t testOffset;
852 int32_t count = 0;
853
854 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
855
856 if (*testIterator != iterator)
857 errln("clone() or operator!= failed: two clones compared unequal");
858
859 do {
860 testOffset = testIterator->first();
861 testOffset = testIterator->next(count);
862 if (offset != testOffset)
863 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
864
865 if (offset != RuleBasedBreakIterator::DONE) {
866 count++;
867 offset = iterator.next();
868
869 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
870 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
871 if (count > 10000 || offset == -1) {
872 errln("operator== failed too many times. Stopping test.");
873 if (offset == -1) {
874 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
875 }
876 return;
877 }
878 }
879 }
880 } while (offset != RuleBasedBreakIterator::DONE);
881
882 // now do it backwards...
883 offset = iterator.last();
884 count = 0;
885
886 do {
887 testOffset = testIterator->last();
888 testOffset = testIterator->next(count); // next() with a negative arg is same as previous
889 if (offset != testOffset)
890 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
891
892 if (offset != RuleBasedBreakIterator::DONE) {
893 count--;
894 offset = iterator.previous();
895 }
896 } while (offset != RuleBasedBreakIterator::DONE);
897
898 delete testIterator;
899}
900
901
902//---------------------------------------------
903//
904// other tests
905//
906//---------------------------------------------
907void RBBITest::TestEmptyString()
908{
909 UnicodeString text = "";
910 UErrorCode status = U_ZERO_ERROR;
911
912 BITestData x(status);
913 ADD_DATACHUNK(x, "", 0, status); // Break at start of data
914 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
915 if (U_FAILURE(status))
916 {
729e4ab9 917 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
73c04bcf
A
918 return;
919 }
920 generalIteratorTest(*bi, x);
921 delete bi;
922}
923
924void RBBITest::TestGetAvailableLocales()
925{
926 int32_t locCount = 0;
927 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
928
929 if (locCount == 0)
729e4ab9 930 dataerrln("getAvailableLocales() returned an empty list!");
73c04bcf
A
931 // Just make sure that it's returning good memory.
932 int32_t i;
933 for (i = 0; i < locCount; ++i) {
934 logln(locList[i].getName());
935 }
936}
937
938//Testing the BreakIterator::getDisplayName() function
939void RBBITest::TestGetDisplayName()
940{
941 UnicodeString result;
942
943 BreakIterator::getDisplayName(Locale::getUS(), result);
944 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
729e4ab9 945 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
73c04bcf
A
946 + result);
947
948 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
949 if (result != "French (France)")
729e4ab9 950 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
73c04bcf
A
951 + result);
952}
953/**
954 * Test End Behaviour
955 * @bug 4068137
956 */
957void RBBITest::TestEndBehaviour()
958{
959 UErrorCode status = U_ZERO_ERROR;
960 UnicodeString testString("boo.");
961 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
962 if (U_FAILURE(status))
963 {
729e4ab9 964 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
73c04bcf
A
965 return;
966 }
967 wb->setText(testString);
968
969 if (wb->first() != 0)
970 errln("Didn't get break at beginning of string.");
971 if (wb->next() != 3)
972 errln("Didn't get break before period in \"boo.\"");
973 if (wb->current() != 4 && wb->next() != 4)
974 errln("Didn't get break at end of string.");
975 delete wb;
976}
977/*
978 * @bug 4153072
979 */
980void RBBITest::TestBug4153072() {
981 UErrorCode status = U_ZERO_ERROR;
982 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
983 if (U_FAILURE(status))
984 {
729e4ab9 985 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
73c04bcf
A
986 return;
987 }
988 UnicodeString str("...Hello, World!...");
989 int32_t begin = 3;
990 int32_t end = str.length() - 3;
991 UBool onBoundary;
992
993 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
994 iter->adoptText(textIterator);
995 int index;
996 // Note: with the switch to UText, there is no way to restrict the
997 // iteration range to begin at an index other than zero.
998 // String character iterators created with a non-zero bound are
999 // treated by RBBI as being empty.
1000 for (index = -1; index < begin + 1; ++index) {
1001 onBoundary = iter->isBoundary(index);
1002 if (index == 0? !onBoundary : onBoundary) {
1003 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
1004 " and begin index = " + begin);
1005 }
1006 }
1007 delete iter;
1008}
1009
1010
46f4442e
A
1011//
1012// Test for problem reported by Ashok Matoria on 9 July 2007
1013// One.<kSoftHyphen><kSpace>Two.
1014//
1015// Sentence break at start (0) and then on calling next() it breaks at
1016// 'T' of "Two". Now, at this point if I do next() and
1017// then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
1018//
1019void RBBITest::TestBug5775() {
1020 UErrorCode status = U_ZERO_ERROR;
1021 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1022 TEST_ASSERT_SUCCESS(status);
729e4ab9
A
1023 if (U_FAILURE(status)) {
1024 return;
1025 }
1026// Check for status first for better handling of no data errors.
46f4442e 1027 TEST_ASSERT(bi != NULL);
729e4ab9 1028 if (bi == NULL) {
46f4442e
A
1029 return;
1030 }
729e4ab9 1031
46f4442e
A
1032 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
1033 // 01234 56789
1034 s = s.unescape();
1035 bi->setText(s);
1036 int pos = bi->next();
1037 TEST_ASSERT(pos == 6);
1038 pos = bi->next();
1039 TEST_ASSERT(pos == 10);
1040 pos = bi->previous();
1041 TEST_ASSERT(pos == 6);
1042 delete bi;
1043}
1044
1045
1046
73c04bcf
A
1047//------------------------------------------------------------------------------
1048//
1049// RBBITest::Extended Run RBBI Tests from an external test data file
1050//
1051//------------------------------------------------------------------------------
1052
// One test case for RBBITest::executeTest(), filled in by RBBITest::TestExtended()
// while it parses the test data file. The three vectors are indexed by offset
// into dataToBreak.
struct TestParams {
    BreakIterator   *bi;              // Iterator under test; executeTest() does nothing when this is NULL.
    UnicodeString    dataToBreak;     // Text passed to bi->setText().
    UVector32       *expectedBreaks;  // Per offset: 0 = no break expected, -1 = break expected with
                                      //   rule status 0, otherwise the expected rule status value.
    UVector32       *srcLine;         // Per offset: line in the test data file (error messages only).
    UVector32       *srcCol;          // Per offset: column in the test data file (error messages only).
};
1060
1061void RBBITest::executeTest(TestParams *t) {
1062 int32_t bp;
1063 int32_t prevBP;
1064 int32_t i;
1065
1066 if (t->bi == NULL) {
1067 return;
1068 }
1069
1070 t->bi->setText(t->dataToBreak);
1071 //
1072 // Run the iterator forward
1073 //
1074 prevBP = -1;
1075 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1076 if (prevBP == bp) {
1077 // Fail for lack of forward progress.
1078 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1079 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1080 break;
1081 }
1082
1083 // Check that there were we didn't miss an expected break between the last one
1084 // and this one.
1085 for (i=prevBP+1; i<bp; i++) {
1086 if (t->expectedBreaks->elementAti(i) != 0) {
1087 int expected[] = {0, i};
1088 printStringBreaks(t->dataToBreak, expected, 2);
1089 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1090 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1091 }
1092 }
1093
1094 // Check that the break we did find was expected
1095 if (t->expectedBreaks->elementAti(bp) == 0) {
1096 int expected[] = {0, bp};
1097 printStringBreaks(t->dataToBreak, expected, 2);
1098 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1099 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1100 } else {
1101 // The break was expected.
1102 // Check that the {nnn} tag value is correct.
1103 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1104 if (expectedTagVal == -1) {
1105 expectedTagVal = 0;
1106 }
1107 int32_t line = t->srcLine->elementAti(bp);
1108 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1109 if (rs != expectedTagVal) {
1110 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1111 " Actual, Expected status = %4d, %4d",
1112 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1113 }
1114 }
1115
1116
1117 prevBP = bp;
1118 }
1119
1120 // Verify that there were no missed expected breaks after the last one found
1121 for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
1122 if (t->expectedBreaks->elementAti(i) != 0) {
1123 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1124 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1125 }
1126 }
1127
1128 //
1129 // Run the iterator backwards, verify that the same breaks are found.
1130 //
1131 prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen.
1132 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1133 if (prevBP == bp) {
1134 // Fail for lack of progress.
1135 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1136 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1137 break;
1138 }
1139
1140 // Check that there were we didn't miss an expected break between the last one
1141 // and this one. (UVector returns zeros for index out of bounds.)
1142 for (i=prevBP-1; i>bp; i--) {
1143 if (t->expectedBreaks->elementAti(i) != 0) {
1144 errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1145 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1146 }
1147 }
1148
1149 // Check that the break we did find was expected
1150 if (t->expectedBreaks->elementAti(bp) == 0) {
1151 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1152 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1153 } else {
1154 // The break was expected.
1155 // Check that the {nnn} tag value is correct.
1156 int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1157 if (expectedTagVal == -1) {
1158 expectedTagVal = 0;
1159 }
1160 int line = t->srcLine->elementAti(bp);
1161 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1162 if (rs != expectedTagVal) {
1163 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1164 " Actual, Expected status = %4d, %4d",
1165 bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1166 }
1167 }
1168
1169 prevBP = bp;
1170 }
1171
1172 // Verify that there were no missed breaks prior to the last one found
1173 for (i=prevBP-1; i>=0; i--) {
1174 if (t->expectedBreaks->elementAti(i) != 0) {
1175 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1176 i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1177 }
1178 }
1179}
1180
1181
1182void RBBITest::TestExtended() {
1183#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1184 UErrorCode status = U_ZERO_ERROR;
1185 Locale locale("");
1186
1187 UnicodeString rules;
1188 TestParams tp;
1189 tp.bi = NULL;
1190 tp.expectedBreaks = new UVector32(status);
1191 tp.srcLine = new UVector32(status);
1192 tp.srcCol = new UVector32(status);
1193
46f4442e 1194 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
729e4ab9
A
1195 if (U_FAILURE(status)) {
1196 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1197 }
73c04bcf
A
1198
1199
1200 //
1201 // Open and read the test data file.
1202 //
1203 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1204 char testFileName[1000];
1205 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1206 errln("Can't open test data. Path too long.");
1207 return;
1208 }
1209 strcpy(testFileName, testDataDirectory);
1210 strcat(testFileName, "rbbitst.txt");
1211
1212 int len;
46f4442e 1213 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
73c04bcf
A
1214 if (U_FAILURE(status)) {
1215 return; /* something went wrong, error already output */
1216 }
1217
1218
1219
46f4442e 1220
73c04bcf
A
1221 //
1222 // Put the test data into a UnicodeString
1223 //
1224 UnicodeString testString(FALSE, testFile, len);
1225
1226 enum EParseState{
1227 PARSE_COMMENT,
1228 PARSE_TAG,
1229 PARSE_DATA,
1230 PARSE_NUM
1231 }
1232 parseState = PARSE_TAG;
1233
1234 EParseState savedState = PARSE_TAG;
1235
1236 static const UChar CH_LF = 0x0a;
1237 static const UChar CH_CR = 0x0d;
1238 static const UChar CH_HASH = 0x23;
1239 /*static const UChar CH_PERIOD = 0x2e;*/
1240 static const UChar CH_LT = 0x3c;
1241 static const UChar CH_GT = 0x3e;
1242 static const UChar CH_BACKSLASH = 0x5c;
1243 static const UChar CH_BULLET = 0x2022;
1244
1245 int32_t lineNum = 1;
1246 int32_t colStart = 0;
1247 int32_t column = 0;
1248 int32_t charIdx = 0;
1249
1250 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
1251
1252 for (charIdx = 0; charIdx < len; ) {
1253 status = U_ZERO_ERROR;
1254 UChar c = testString.charAt(charIdx);
1255 charIdx++;
1256 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1257 // treat CRLF as a unit
1258 c = CH_LF;
1259 charIdx++;
1260 }
1261 if (c == CH_LF || c == CH_CR) {
1262 lineNum++;
1263 colStart = charIdx;
1264 }
1265 column = charIdx - colStart + 1;
1266
1267 switch (parseState) {
1268 case PARSE_COMMENT:
1269 if (c == 0x0a || c == 0x0d) {
1270 parseState = savedState;
1271 }
1272 break;
1273
1274 case PARSE_TAG:
1275 {
1276 if (c == CH_HASH) {
1277 parseState = PARSE_COMMENT;
1278 savedState = PARSE_TAG;
1279 break;
1280 }
1281 if (u_isUWhiteSpace(c)) {
1282 break;
1283 }
1284 if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1285 delete tp.bi;
1286 tp.bi = BreakIterator::createWordInstance(locale, status);
1287 charIdx += 5;
1288 break;
1289 }
1290 if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1291 delete tp.bi;
1292 tp.bi = BreakIterator::createCharacterInstance(locale, status);
1293 charIdx += 5;
1294 break;
1295 }
1296 if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1297 delete tp.bi;
1298 tp.bi = BreakIterator::createLineInstance(locale, status);
1299 charIdx += 5;
1300 break;
1301 }
1302 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1303 delete tp.bi;
1304 tp.bi = NULL;
46f4442e 1305 tp.bi = BreakIterator::createSentenceInstance(locale, status);
73c04bcf
A
1306 charIdx += 5;
1307 break;
1308 }
1309 if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1310 delete tp.bi;
1311 tp.bi = BreakIterator::createTitleInstance(locale, status);
1312 charIdx += 6;
1313 break;
1314 }
46f4442e 1315
73c04bcf
A
1316 // <locale loc_name>
1317 localeMatcher.reset(testString);
1318 if (localeMatcher.lookingAt(charIdx-1, status)) {
1319 UnicodeString localeName = localeMatcher.group(1, status);
1320 char localeName8[100];
1321 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1322 locale = Locale::createFromName(localeName8);
1323 charIdx += localeMatcher.group(0, status).length();
1324 TEST_ASSERT_SUCCESS(status);
1325 break;
1326 }
1327 if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1328 parseState = PARSE_DATA;
1329 charIdx += 5;
1330 tp.dataToBreak = "";
1331 tp.expectedBreaks->removeAllElements();
1332 tp.srcCol ->removeAllElements();
1333 tp.srcLine->removeAllElements();
1334 break;
1335 }
1336
1337 errln("line %d: Tag expected in test file.", lineNum);
73c04bcf
A
1338 parseState = PARSE_COMMENT;
1339 savedState = PARSE_DATA;
46f4442e 1340 goto end_test; // Stop the test.
73c04bcf
A
1341 }
1342 break;
1343
1344 case PARSE_DATA:
1345 if (c == CH_BULLET) {
1346 int32_t breakIdx = tp.dataToBreak.length();
1347 tp.expectedBreaks->setSize(breakIdx+1);
1348 tp.expectedBreaks->setElementAt(-1, breakIdx);
1349 tp.srcLine->setSize(breakIdx+1);
1350 tp.srcLine->setElementAt(lineNum, breakIdx);
1351 tp.srcCol ->setSize(breakIdx+1);
1352 tp.srcCol ->setElementAt(column, breakIdx);
1353 break;
1354 }
1355
1356 if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1357 // Add final entry to mappings from break location to source file position.
1358 // Need one extra because last break position returned is after the
1359 // last char in the data, not at the last char.
1360 tp.srcLine->addElement(lineNum, status);
1361 tp.srcCol ->addElement(column, status);
1362
1363 parseState = PARSE_TAG;
1364 charIdx += 6;
1365
1366 // RUN THE TEST!
1367 executeTest(&tp);
1368 break;
1369 }
1370
46f4442e 1371 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
73c04bcf
A
1372 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1373 // Get the code point from the name and insert it into the test data.
1374 // (Damn, no API takes names in Unicode !!!
1375 // we've got to take it back to char *)
1376 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1377 int32_t nameLength = nameEndIdx - (charIdx+2);
1378 char charNameBuf[200];
1379 UChar32 theChar = -1;
1380 if (nameEndIdx != -1) {
1381 UErrorCode status = U_ZERO_ERROR;
1382 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1383 charNameBuf[sizeof(charNameBuf)-1] = 0;
1384 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1385 if (U_FAILURE(status)) {
1386 theChar = -1;
1387 }
1388 }
1389 if (theChar == -1) {
1390 errln("Error in named character in test file at line %d, col %d",
1391 lineNum, column);
1392 } else {
1393 // Named code point was recognized. Insert it
1394 // into the test data.
1395 tp.dataToBreak.append(theChar);
1396 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1397 tp.srcLine->addElement(lineNum, status);
1398 tp.srcCol ->addElement(column, status);
1399 }
1400 }
1401 if (nameEndIdx > charIdx) {
1402 charIdx = nameEndIdx+1;
1403
1404 }
1405 break;
1406 }
1407
1408
1409
1410
1411 if (testString.compare(charIdx-1, 2, "<>") == 0) {
1412 charIdx++;
1413 int32_t breakIdx = tp.dataToBreak.length();
1414 tp.expectedBreaks->setSize(breakIdx+1);
1415 tp.expectedBreaks->setElementAt(-1, breakIdx);
1416 tp.srcLine->setSize(breakIdx+1);
1417 tp.srcLine->setElementAt(lineNum, breakIdx);
1418 tp.srcCol ->setSize(breakIdx+1);
1419 tp.srcCol ->setElementAt(column, breakIdx);
1420 break;
1421 }
1422
1423 if (c == CH_LT) {
1424 tagValue = 0;
1425 parseState = PARSE_NUM;
1426 break;
1427 }
1428
1429 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
1430 parseState = PARSE_COMMENT;
1431 savedState = PARSE_DATA;
1432 break;
1433 }
1434
1435 if (c == CH_BACKSLASH) {
1436 // Check for \ at end of line, a line continuation.
1437 // Advance over (discard) the newline
1438 UChar32 cp = testString.char32At(charIdx);
1439 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1440 // We have a CR LF
1441 // Need an extra increment of the input ptr to move over both of them
1442 charIdx++;
1443 }
1444 if (cp == CH_LF || cp == CH_CR) {
1445 lineNum++;
1446 colStart = charIdx;
1447 charIdx++;
1448 break;
1449 }
1450
1451 // Let unescape handle the back slash.
1452 cp = testString.unescapeAt(charIdx);
1453 if (cp != -1) {
1454 // Escape sequence was recognized. Insert the char
1455 // into the test data.
1456 tp.dataToBreak.append(cp);
1457 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1458 tp.srcLine->addElement(lineNum, status);
1459 tp.srcCol ->addElement(column, status);
1460 }
1461 break;
1462 }
1463
1464
1465 // Not a recognized backslash escape sequence.
1466 // Take the next char as a literal.
1467 // TODO: Should this be an error?
1468 c = testString.charAt(charIdx);
1469 charIdx = testString.moveIndex32(charIdx, 1);
1470 }
1471
1472 // Normal, non-escaped data char.
1473 tp.dataToBreak.append(c);
1474
1475 // Save the mapping from offset in the data to line/column numbers in
1476 // the original input file. Will be used for better error messages only.
1477 // If there's an expected break before this char, the slot in the mapping
1478 // vector will already be set for this char; don't overwrite it.
1479 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1480 tp.srcLine->addElement(lineNum, status);
1481 tp.srcCol ->addElement(column, status);
1482 }
1483 break;
1484
1485
1486 case PARSE_NUM:
1487 // We are parsing an expected numeric tag value, like <1234>,
1488 // within a chunk of data.
1489 if (u_isUWhiteSpace(c)) {
1490 break;
1491 }
1492
1493 if (c == CH_GT) {
1494 // Finished the number. Add the info to the expected break data,
1495 // and switch parse state back to doing plain data.
1496 parseState = PARSE_DATA;
1497 if (tagValue == 0) {
1498 tagValue = -1;
1499 }
1500 int32_t breakIdx = tp.dataToBreak.length();
1501 tp.expectedBreaks->setSize(breakIdx+1);
1502 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1503 tp.srcLine->setSize(breakIdx+1);
1504 tp.srcLine->setElementAt(lineNum, breakIdx);
1505 tp.srcCol ->setSize(breakIdx+1);
1506 tp.srcCol ->setElementAt(column, breakIdx);
1507 break;
1508 }
1509
1510 if (u_isdigit(c)) {
1511 tagValue = tagValue*10 + u_charDigitValue(c);
1512 break;
1513 }
1514
1515 errln("Syntax Error in test file at line %d, col %d",
1516 lineNum, column);
73c04bcf 1517 parseState = PARSE_COMMENT;
46f4442e 1518 goto end_test; // Stop the test
73c04bcf
A
1519 break;
1520 }
1521
1522
1523 if (U_FAILURE(status)) {
4388f060 1524 dataerrln("ICU Error %s while parsing test file at line %d.",
73c04bcf 1525 u_errorName(status), lineNum);
73c04bcf 1526 status = U_ZERO_ERROR;
46f4442e 1527 goto end_test; // Stop the test
73c04bcf
A
1528 }
1529
1530 }
1531
1532end_test:
1533 delete tp.bi;
1534 delete tp.expectedBreaks;
1535 delete tp.srcLine;
1536 delete tp.srcCol;
1537 delete [] testFile;
1538#endif
1539}
1540
729e4ab9
A
1541
1542//-------------------------------------------------------------------------------
1543//
1544// TestDictRules create a break iterator from source rules that includes a
1545// dictionary range. Regression for bug #7130. Source rules
1546// do not declare a break iterator type (word, line, sentence, etc.
1547// but the dictionary code, without a type, would loop.
1548//
1549//-------------------------------------------------------------------------------
1550void RBBITest::TestDictRules() {
1551 const char *rules = "$dictionary = [a-z]; \n"
1552 "!!forward; \n"
1553 "$dictionary $dictionary; \n"
1554 "!!reverse; \n"
1555 "$dictionary $dictionary; \n";
1556 const char *text = "aa";
1557 UErrorCode status = U_ZERO_ERROR;
1558 UParseError parseError;
1559
1560 RuleBasedBreakIterator bi(rules, parseError, status);
1561 if (U_SUCCESS(status)) {
1562 UnicodeString utext = text;
1563 bi.setText(utext);
1564 int32_t position;
1565 int32_t loops;
1566 for (loops = 0; loops<10; loops++) {
1567 position = bi.next();
1568 if (position == RuleBasedBreakIterator::DONE) {
1569 break;
1570 }
1571 }
1572 TEST_ASSERT(loops == 1);
1573 } else {
1574 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1575 }
1576}
1577
1578
73c04bcf
A
1579
1580//-------------------------------------------------------------------------------
1581//
1582// ReadAndConvertFile Read a text data file, convert it to UChars, and
1583// return the data in one big UChar * buffer, which the caller must delete.
1584//
46f4442e
A
1585// parameters:
1586// fileName: the name of the file, with no directory part. The test data directory
1587// is assumed.
1588// ulen an out parameter, receives the actual length (in UChars) of the file data.
1589// encoding The file encoding. If the file contains a BOM, that will override the encoding
1590// specified here. The BOM, if it exists, will be stripped from the returned data.
1591// Pass NULL for the system default encoding.
1592// status
1593// returns:
1594// The file data, converted to UChar.
1595// The caller must delete this when done with
1596// delete [] theBuffer;
1597//
73c04bcf
A
1598// TODO: This is a clone of RegexTest::ReadAndConvertFile.
1599// Move this function to some common place.
1600//
1601//--------------------------------------------------------------------------------
46f4442e 1602UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
73c04bcf
A
1603 UChar *retPtr = NULL;
1604 char *fileBuf = NULL;
1605 UConverter* conv = NULL;
1606 FILE *f = NULL;
1607
1608 ulen = 0;
1609 if (U_FAILURE(status)) {
1610 return retPtr;
1611 }
1612
1613 //
1614 // Open the file.
1615 //
1616 f = fopen(fileName, "rb");
1617 if (f == 0) {
729e4ab9 1618 dataerrln("Error opening test data file %s\n", fileName);
73c04bcf
A
1619 status = U_FILE_ACCESS_ERROR;
1620 return NULL;
1621 }
1622 //
1623 // Read it in
1624 //
1625 int fileSize;
1626 int amt_read;
1627
1628 fseek( f, 0, SEEK_END);
1629 fileSize = ftell(f);
1630 fileBuf = new char[fileSize];
1631 fseek(f, 0, SEEK_SET);
1632 amt_read = fread(fileBuf, 1, fileSize, f);
1633 if (amt_read != fileSize || fileSize <= 0) {
1634 errln("Error reading test data file.");
1635 goto cleanUpAndReturn;
1636 }
1637
1638 //
1639 // Look for a Unicode Signature (BOM) on the data just read
1640 //
1641 int32_t signatureLength;
1642 const char * fileBufC;
46f4442e 1643 const char* bomEncoding;
73c04bcf
A
1644
1645 fileBufC = fileBuf;
46f4442e 1646 bomEncoding = ucnv_detectUnicodeSignature(
73c04bcf 1647 fileBuf, fileSize, &signatureLength, &status);
46f4442e 1648 if(bomEncoding!=NULL ){
73c04bcf
A
1649 fileBufC += signatureLength;
1650 fileSize -= signatureLength;
46f4442e 1651 encoding = bomEncoding;
73c04bcf
A
1652 }
1653
1654 //
1655 // Open a converter to take the rule file to UTF-16
1656 //
1657 conv = ucnv_open(encoding, &status);
1658 if (U_FAILURE(status)) {
1659 goto cleanUpAndReturn;
1660 }
1661
1662 //
1663 // Convert the rules to UChar.
1664 // Preflight first to determine required buffer size.
1665 //
1666 ulen = ucnv_toUChars(conv,
1667 NULL, // dest,
1668 0, // destCapacity,
1669 fileBufC,
1670 fileSize,
1671 &status);
1672 if (status == U_BUFFER_OVERFLOW_ERROR) {
1673 // Buffer Overflow is expected from the preflight operation.
1674 status = U_ZERO_ERROR;
1675
1676 retPtr = new UChar[ulen+1];
1677 ucnv_toUChars(conv,
1678 retPtr, // dest,
1679 ulen+1,
1680 fileBufC,
1681 fileSize,
1682 &status);
1683 }
1684
1685cleanUpAndReturn:
1686 fclose(f);
1687 delete []fileBuf;
1688 ucnv_close(conv);
1689 if (U_FAILURE(status)) {
1690 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4388f060 1691 delete []retPtr;
73c04bcf
A
1692 retPtr = 0;
1693 ulen = 0;
1694 };
1695 return retPtr;
1696}
1697
1698
73c04bcf 1699
46f4442e 1700//--------------------------------------------------------------------------------------------
73c04bcf 1701//
46f4442e 1702// Run tests from each of the boundary test data files distributed by the Unicode Consortium
73c04bcf 1703//
46f4442e
A
1704//-------------------------------------------------------------------------------------------
1705void RBBITest::TestUnicodeFiles() {
1706 RuleBasedBreakIterator *bi;
1707 UErrorCode status = U_ZERO_ERROR;
73c04bcf 1708
729e4ab9 1709 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
46f4442e
A
1710 TEST_ASSERT_SUCCESS(status);
1711 if (U_SUCCESS(status)) {
1712 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1713 }
1714 delete bi;
73c04bcf 1715
729e4ab9 1716 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
46f4442e
A
1717 TEST_ASSERT_SUCCESS(status);
1718 if (U_SUCCESS(status)) {
1719 runUnicodeTestData("WordBreakTest.txt", bi);
1720 }
1721 delete bi;
73c04bcf 1722
729e4ab9 1723 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
46f4442e
A
1724 TEST_ASSERT_SUCCESS(status);
1725 if (U_SUCCESS(status)) {
1726 runUnicodeTestData("SentenceBreakTest.txt", bi);
1727 }
1728 delete bi;
73c04bcf 1729
729e4ab9 1730 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
46f4442e
A
1731 TEST_ASSERT_SUCCESS(status);
1732 if (U_SUCCESS(status)) {
1733 runUnicodeTestData("LineBreakTest.txt", bi);
73c04bcf 1734 }
46f4442e 1735 delete bi;
73c04bcf
A
1736}
1737
1738
46f4442e
A
1739//--------------------------------------------------------------------------------------------
1740//
1741// Run tests from one of the boundary test data files distributed by the Unicode Consortium
1742//
1743//-------------------------------------------------------------------------------------------
1744void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1745#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4388f060
A
1746 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
1747 UBool isTicket7270Fixed = isICUVersionAtLeast(50, 0);
1748 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
46f4442e 1749 UErrorCode status = U_ZERO_ERROR;
73c04bcf 1750
46f4442e
A
1751 //
1752 // Open and read the test data file, put it into a UnicodeString.
1753 //
1754 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1755 char testFileName[1000];
1756 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
729e4ab9 1757 dataerrln("Can't open test data. Path too long.");
73c04bcf
A
1758 return;
1759 }
46f4442e
A
1760 strcpy(testFileName, testDataDirectory);
1761 strcat(testFileName, fileName);
1762
1763 logln("Opening data file %s\n", fileName);
73c04bcf 1764
46f4442e
A
1765 int len;
1766 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1767 if (status != U_FILE_ACCESS_ERROR) {
1768 TEST_ASSERT_SUCCESS(status);
1769 TEST_ASSERT(testFile != NULL);
1770 }
1771 if (U_FAILURE(status) || testFile == NULL) {
1772 return; /* something went wrong, error already output */
1773 }
1774 UnicodeString testFileAsString(TRUE, testFile, len);
73c04bcf 1775
46f4442e
A
1776 //
1777 // Parse the test data file using a regular expression.
1778 // Each kind of token is recognized in its own capture group; what type of item was scanned
1779 // is identified by which group had a match.
1780 //
1781 // Caputure Group # 1 2 3 4 5
1782 // Parses this item: divide x hex digits comment \n unrecognized \n
1783 //
1784 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1785 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1786 UnicodeString testString;
1787 UVector32 breakPositions(status);
1788 int lineNumber = 1;
1789 TEST_ASSERT_SUCCESS(status);
1790 if (U_FAILURE(status)) {
73c04bcf
A
1791 return;
1792 }
1793
46f4442e
A
1794 //
1795 // Scan through each test case, building up the string to be broken in testString,
1796 // and the positions that should be boundaries in the breakPositions vector.
1797 //
729e4ab9 1798 int spin = 0;
46f4442e 1799 while (tokenMatcher.find()) {
729e4ab9
A
1800 if(tokenMatcher.hitEnd()) {
1801 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1802 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1803 and caused an infinite loop here on EBCDIC systems!
1804 */
1805 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1806 // return;
1807 }
46f4442e
A
1808 if (tokenMatcher.start(1, status) >= 0) {
1809 // Scanned a divide sign, indicating a break position in the test data.
1810 if (testString.length()>0) {
1811 breakPositions.addElement(testString.length(), status);
73c04bcf 1812 }
46f4442e
A
1813 }
1814 else if (tokenMatcher.start(2, status) >= 0) {
1815 // Scanned an 'x', meaning no break at this position in the test data
1816 // Nothing to be done here.
1817 }
1818 else if (tokenMatcher.start(3, status) >= 0) {
1819 // Scanned Hex digits. Convert them to binary, append to the character data string.
1820 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1821 int length = hexNumber.length();
1822 if (length<=8) {
1823 char buf[10];
1824 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1825 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1826 if (c<=0x10ffff) {
1827 testString.append(c);
1828 } else {
1829 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1830 fileName, lineNumber);
1831 }
1832 } else {
1833 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1834 fileName, lineNumber);
1835 }
1836 }
1837 else if (tokenMatcher.start(4, status) >= 0) {
1838 // Scanned to end of a line, possibly skipping over a comment in the process.
1839 // If the line from the file contained test data, run the test now.
1840 //
1841 if (testString.length() > 0) {
4388f060
A
1842// TODO(andy): Remove this time bomb code. Note: Line range updated for Unicode 6.1 LineBreakTest.txt.
1843if (!isLineBreak || isTicket7270Fixed || !(5066 <= lineNumber && lineNumber <= 5170)) {
46f4442e 1844 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
729e4ab9 1845}
73c04bcf
A
1846 }
1847
46f4442e
A
1848 // Clear out this test case.
1849 // The string and breakPositions vector will be refilled as the next
1850 // test case is parsed.
1851 testString.remove();
1852 breakPositions.removeAllElements();
1853 lineNumber++;
1854 } else {
1855 // Scanner catchall. Something unrecognized appeared on the line.
1856 char token[16];
1857 UnicodeString uToken = tokenMatcher.group(0, status);
1858 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1859 token[sizeof(token)-1] = 0;
1860 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1861
1862 // Clean up, in preparation for continuing with the next line.
1863 testString.remove();
1864 breakPositions.removeAllElements();
1865 lineNumber++;
1866 }
1867 TEST_ASSERT_SUCCESS(status);
1868 if (U_FAILURE(status)) {
73c04bcf
A
1869 break;
1870 }
46f4442e 1871 }
73c04bcf 1872
46f4442e
A
1873 delete [] testFile;
1874 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1875}
73c04bcf 1876
46f4442e
A
1877//--------------------------------------------------------------------------------------------
1878//
1879// checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1880// test data files. Do only a simple, forward-only check -
1881// this test is mostly to check that ICU and the Unicode
1882// data agree with each other.
1883//
1884//--------------------------------------------------------------------------------------------
1885void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1886 const UnicodeString &testString, // Text data to be broken
1887 UVector32 *breakPositions, // Positions where breaks should be found.
1888 RuleBasedBreakIterator *bi) {
1889 int32_t pos; // Break Position in the test string
1890 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1891 int32_t expectedPos; // Expected break position (index into test string)
1892
1893 bi->setText(testString);
1894 pos = bi->first();
1895 pos = bi->next();
1896
1897 while (pos != BreakIterator::DONE) {
1898 if (expectedI >= breakPositions->size()) {
1899 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1900 testFileName, lineNumber, pos);
1901 break;
73c04bcf 1902 }
46f4442e
A
1903 expectedPos = breakPositions->elementAti(expectedI);
1904 if (pos < expectedPos) {
1905 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1906 testFileName, lineNumber, pos);
1907 break;
1908 }
1909 if (pos > expectedPos) {
1910 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1911 testFileName, lineNumber, expectedPos);
73c04bcf
A
1912 break;
1913 }
46f4442e
A
1914 pos = bi->next();
1915 expectedI++;
1916 }
73c04bcf 1917
46f4442e
A
1918 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1919 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1920 testFileName, lineNumber, breakPositions->elementAti(expectedI));
73c04bcf 1921 }
46f4442e 1922}
73c04bcf 1923
73c04bcf 1924
73c04bcf
A
1925
1926#if !UCONFIG_NO_REGULAR_EXPRESSIONS
73c04bcf
A
1927//---------------------------------------------------------------------------------------
1928//
1929// classs RBBIMonkeyKind
1930//
1931// Monkey Test for Break Iteration
1932// Abstract interface class. Concrete derived classes independently
1933// implement the break rules for different iterator types.
1934//
1935// The Monkey Test itself uses doesn't know which type of break iterator it is
1936// testing, but works purely in terms of the interface defined here.
1937//
1938//---------------------------------------------------------------------------------------
1939class RBBIMonkeyKind {
1940public:
1941 // Return a UVector of UnicodeSets, representing the character classes used
1942 // for this type of iterator.
1943 virtual UVector *charClasses() = 0;
1944
1945 // Set the test text on which subsequent calls to next() will operate
1946 virtual void setText(const UnicodeString &s) = 0;
1947
1948 // Find the next break postion, starting from the prev break position, or from zero.
1949 // Return -1 after reaching end of string.
1950 virtual int32_t next(int32_t i) = 0;
1951
1952 virtual ~RBBIMonkeyKind();
1953 UErrorCode deferredStatus;
1954
1955
1956protected:
1957 RBBIMonkeyKind();
1958
1959private:
1960};
1961
1962RBBIMonkeyKind::RBBIMonkeyKind() {
1963 deferredStatus = U_ZERO_ERROR;
1964}
1965
1966RBBIMonkeyKind::~RBBIMonkeyKind() {
1967}
1968
1969
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand()
//                    The library functions are deliberately not used, in order to
//                      1.  Get the same results on all platforms.
//                      2.  Have access to the current seed, so that failures are
//                          easier to reproduce.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Advance the generator by one step and return a value in the range 0..32767,
//   taken from bits 16..30 of the updated seed.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;

    const uint32_t upperBits = m_seed >> 16;      // same as m_seed / 65536
    return upperBits & 0x7fff;                    // same as % 32768
}
1985
1986
1987//------------------------------------------------------------------------------------------
1988//
1989// class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1990// of RBBIMonkeyKind.
1991//
1992//------------------------------------------------------------------------------------------
1993class RBBICharMonkey: public RBBIMonkeyKind {
1994public:
1995 RBBICharMonkey();
1996 virtual ~RBBICharMonkey();
1997 virtual UVector *charClasses();
1998 virtual void setText(const UnicodeString &s);
1999 virtual int32_t next(int32_t i);
2000private:
2001 UVector *fSets;
2002
2003 UnicodeSet *fCRLFSet;
2004 UnicodeSet *fControlSet;
2005 UnicodeSet *fExtendSet;
46f4442e
A
2006 UnicodeSet *fPrependSet;
2007 UnicodeSet *fSpacingSet;
2008 UnicodeSet *fLSet;
2009 UnicodeSet *fVSet;
2010 UnicodeSet *fTSet;
2011 UnicodeSet *fLVSet;
2012 UnicodeSet *fLVTSet;
73c04bcf
A
2013 UnicodeSet *fHangulSet;
2014 UnicodeSet *fAnySet;
2015
73c04bcf
A
2016 const UnicodeString *fText;
2017};
2018
2019
2020RBBICharMonkey::RBBICharMonkey() {
2021 UErrorCode status = U_ZERO_ERROR;
2022
2023 fText = NULL;
73c04bcf 2024
46f4442e
A
2025 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2026 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
2027 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
2028 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2029 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2030 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2031 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2032 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2033 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2034 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2035 fHangulSet = new UnicodeSet();
2036 fHangulSet->addAll(*fLSet);
2037 fHangulSet->addAll(*fVSet);
2038 fHangulSet->addAll(*fTSet);
2039 fHangulSet->addAll(*fLVSet);
2040 fHangulSet->addAll(*fLVTSet);
4388f060
A
2041 fAnySet = new UnicodeSet(0, 0x10ffff);
2042
73c04bcf
A
2043 fSets = new UVector(status);
2044 fSets->addElement(fCRLFSet, status);
2045 fSets->addElement(fControlSet, status);
2046 fSets->addElement(fExtendSet, status);
4388f060
A
2047 if (!fPrependSet->isEmpty()) {
2048 fSets->addElement(fPrependSet, status);
2049 }
46f4442e 2050 fSets->addElement(fSpacingSet, status);
73c04bcf
A
2051 fSets->addElement(fHangulSet, status);
2052 fSets->addElement(fAnySet, status);
2053 if (U_FAILURE(status)) {
2054 deferredStatus = status;
2055 }
2056}
2057
2058
2059void RBBICharMonkey::setText(const UnicodeString &s) {
2060 fText = &s;
73c04bcf
A
2061}
2062
2063
73c04bcf 2064
46f4442e
A
2065int32_t RBBICharMonkey::next(int32_t prevPos) {
2066 int p0, p1, p2, p3; // Indices of the significant code points around the
2067 // break position being tested. The candidate break
2068 // location is before p2.
2069
2070 int breakPos = -1;
2071
2072 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2073
2074 if (U_FAILURE(deferredStatus)) {
2075 return -1;
73c04bcf 2076 }
46f4442e
A
2077
2078 // Previous break at end of string. return DONE.
2079 if (prevPos >= fText->length()) {
2080 return -1;
73c04bcf 2081 }
46f4442e
A
2082 p0 = p1 = p2 = p3 = prevPos;
2083 c3 = fText->char32At(prevPos);
2084 c0 = c1 = c2 = 0;
2085
2086 // Loop runs once per "significant" character position in the input text.
2087 for (;;) {
2088 // Move all of the positions forward in the input string.
2089 p0 = p1; c0 = c1;
2090 p1 = p2; c1 = c2;
2091 p2 = p3; c2 = c3;
2092
2093 // Advancd p3 by one codepoint
2094 p3 = fText->moveIndex32(p3, 1);
2095 c3 = fText->char32At(p3);
2096
2097 if (p1 == p2) {
2098 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2099 continue;
2100 }
2101 if (p2 == fText->length()) {
2102 // Reached end of string. Always a break position.
2103 break;
2104 }
2105
2106 // Rule GB3 CR x LF
2107 // No Extend or Format characters may appear between the CR and LF,
2108 // which requires the additional check for p2 immediately following p1.
2109 //
2110 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2111 continue;
2112 }
2113
2114 // Rule (GB4). ( Control | CR | LF ) <break>
2115 if (fControlSet->contains(c1) ||
2116 c1 == 0x0D ||
2117 c1 == 0x0A) {
2118 break;
2119 }
2120
2121 // Rule (GB5) <break> ( Control | CR | LF )
2122 //
2123 if (fControlSet->contains(c2) ||
2124 c2 == 0x0D ||
2125 c2 == 0x0A) {
2126 break;
2127 }
2128
2129
2130 // Rule (GB6) L x ( L | V | LV | LVT )
2131 if (fLSet->contains(c1) &&
2132 (fLSet->contains(c2) ||
2133 fVSet->contains(c2) ||
2134 fLVSet->contains(c2) ||
2135 fLVTSet->contains(c2))) {
2136 continue;
2137 }
2138
2139 // Rule (GB7) ( LV | V ) x ( V | T )
2140 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2141 (fVSet->contains(c2) || fTSet->contains(c2))) {
2142 continue;
2143 }
2144
2145 // Rule (GB8) ( LVT | T) x T
2146 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2147 fTSet->contains(c2)) {
2148 continue;
2149 }
2150
2151 // Rule (GB9) Numeric x ALetter
2152 if (fExtendSet->contains(c2)) {
2153 continue;
2154 }
2155
2156 // Rule (GB9a) x SpacingMark
2157 if (fSpacingSet->contains(c2)) {
2158 continue;
2159 }
2160
2161 // Rule (GB9b) Prepend x
2162 if (fPrependSet->contains(c1)) {
2163 continue;
2164 }
2165
2166 // Rule (GB10) Any <break> Any
2167 break;
2168 }
2169
2170 breakPos = p2;
2171 return breakPos;
73c04bcf
A
2172}
2173
2174
46f4442e 2175
73c04bcf
A
2176UVector *RBBICharMonkey::charClasses() {
2177 return fSets;
2178}
2179
2180
2181RBBICharMonkey::~RBBICharMonkey() {
2182 delete fSets;
2183 delete fCRLFSet;
2184 delete fControlSet;
2185 delete fExtendSet;
46f4442e
A
2186 delete fPrependSet;
2187 delete fSpacingSet;
2188 delete fLSet;
2189 delete fVSet;
2190 delete fTSet;
2191 delete fLVSet;
2192 delete fLVTSet;
73c04bcf
A
2193 delete fHangulSet;
2194 delete fAnySet;
73c04bcf
A
2195}
2196
2197//------------------------------------------------------------------------------------------
2198//
2199// class RBBIWordMonkey Word Break specific implementation
2200// of RBBIMonkeyKind.
2201//
2202//------------------------------------------------------------------------------------------
2203class RBBIWordMonkey: public RBBIMonkeyKind {
2204public:
2205 RBBIWordMonkey();
2206 virtual ~RBBIWordMonkey();
2207 virtual UVector *charClasses();
2208 virtual void setText(const UnicodeString &s);
2209 virtual int32_t next(int32_t i);
2210private:
2211 UVector *fSets;
2212
46f4442e
A
2213 UnicodeSet *fCRSet;
2214 UnicodeSet *fLFSet;
2215 UnicodeSet *fNewlineSet;
73c04bcf
A
2216 UnicodeSet *fKatakanaSet;
2217 UnicodeSet *fALetterSet;
46f4442e 2218 UnicodeSet *fMidNumLetSet;
73c04bcf
A
2219 UnicodeSet *fMidLetterSet;
2220 UnicodeSet *fMidNumSet;
2221 UnicodeSet *fNumericSet;
2222 UnicodeSet *fFormatSet;
2223 UnicodeSet *fOtherSet;
2224 UnicodeSet *fExtendSet;
2225 UnicodeSet *fExtendNumLetSet;
2226
2227 RegexMatcher *fMatcher;
2228
2229 const UnicodeString *fText;
2230};
2231
2232
46f4442e 2233RBBIWordMonkey::RBBIWordMonkey()
73c04bcf
A
2234{
2235 UErrorCode status = U_ZERO_ERROR;
2236
73c04bcf
A
2237 fSets = new UVector(status);
2238
46f4442e
A
2239 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
2240 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
2241 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
2242 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2243 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
2244 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
2245 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
2246 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
2247 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
2248 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
2249 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2250 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
2251
73c04bcf
A
2252 fOtherSet = new UnicodeSet();
2253 if(U_FAILURE(status)) {
2254 deferredStatus = status;
2255 return;
2256 }
2257
2258 fOtherSet->complement();
46f4442e
A
2259 fOtherSet->removeAll(*fCRSet);
2260 fOtherSet->removeAll(*fLFSet);
2261 fOtherSet->removeAll(*fNewlineSet);
73c04bcf
A
2262 fOtherSet->removeAll(*fKatakanaSet);
2263 fOtherSet->removeAll(*fALetterSet);
2264 fOtherSet->removeAll(*fMidLetterSet);
2265 fOtherSet->removeAll(*fMidNumSet);
2266 fOtherSet->removeAll(*fNumericSet);
2267 fOtherSet->removeAll(*fExtendNumLetSet);
2268 fOtherSet->removeAll(*fFormatSet);
2269 fOtherSet->removeAll(*fExtendSet);
46f4442e
A
2270 // Inhibit dictionary characters from being tested at all.
2271 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
73c04bcf 2272
46f4442e
A
2273 fSets->addElement(fCRSet, status);
2274 fSets->addElement(fLFSet, status);
2275 fSets->addElement(fNewlineSet, status);
73c04bcf
A
2276 fSets->addElement(fALetterSet, status);
2277 fSets->addElement(fKatakanaSet, status);
2278 fSets->addElement(fMidLetterSet, status);
46f4442e 2279 fSets->addElement(fMidNumLetSet, status);
73c04bcf
A
2280 fSets->addElement(fMidNumSet, status);
2281 fSets->addElement(fNumericSet, status);
2282 fSets->addElement(fFormatSet, status);
2283 fSets->addElement(fExtendSet, status);
2284 fSets->addElement(fOtherSet, status);
2285 fSets->addElement(fExtendNumLetSet, status);
2286
73c04bcf
A
2287 if (U_FAILURE(status)) {
2288 deferredStatus = status;
2289 }
2290}
2291
2292void RBBIWordMonkey::setText(const UnicodeString &s) {
2293 fText = &s;
2294}
2295
2296
2297int32_t RBBIWordMonkey::next(int32_t prevPos) {
2298 int p0, p1, p2, p3; // Indices of the significant code points around the
2299 // break position being tested. The candidate break
2300 // location is before p2.
2301
2302 int breakPos = -1;
2303
2304 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
46f4442e
A
2305
2306 if (U_FAILURE(deferredStatus)) {
2307 return -1;
2308 }
73c04bcf
A
2309
2310 // Prev break at end of string. return DONE.
2311 if (prevPos >= fText->length()) {
2312 return -1;
2313 }
2314 p0 = p1 = p2 = p3 = prevPos;
2315 c3 = fText->char32At(prevPos);
2316 c0 = c1 = c2 = 0;
2317
2318 // Loop runs once per "significant" character position in the input text.
2319 for (;;) {
2320 // Move all of the positions forward in the input string.
2321 p0 = p1; c0 = c1;
2322 p1 = p2; c1 = c2;
2323 p2 = p3; c2 = c3;
2324
2325 // Advancd p3 by X(Extend | Format)* Rule 4
46f4442e 2326 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
73c04bcf
A
2327 do {
2328 p3 = fText->moveIndex32(p3, 1);
2329 c3 = fText->char32At(p3);
46f4442e
A
2330 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2331 break;
2332 };
73c04bcf
A
2333 }
2334 while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2335
2336
2337 if (p1 == p2) {
2338 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2339 continue;
2340 }
2341 if (p2 == fText->length()) {
2342 // Reached end of string. Always a break position.
2343 break;
2344 }
46f4442e 2345
73c04bcf
A
2346 // Rule (3) CR x LF
2347 // No Extend or Format characters may appear between the CR and LF,
2348 // which requires the additional check for p2 immediately following p1.
2349 //
46f4442e 2350 if (c1==0x0D && c2==0x0A) {
73c04bcf
A
2351 continue;
2352 }
46f4442e
A
2353
2354 // Rule (3a) Break before and after newlines (including CR and LF)
2355 //
2356 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2357 break;
2358 };
2359 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2360 break;
2361 };
73c04bcf
A
2362
2363 // Rule (5). ALetter x ALetter
2364 if (fALetterSet->contains(c1) &&
2365 fALetterSet->contains(c2)) {
2366 continue;
2367 }
2368
2369 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
2370 //
73c04bcf 2371 if ( fALetterSet->contains(c1) &&
46f4442e 2372 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
73c04bcf
A
2373 fALetterSet->contains(c3)) {
2374 continue;
2375 }
2376
2377
2378 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
2379 if (fALetterSet->contains(c0) &&
46f4442e 2380 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1)) &&
73c04bcf
A
2381 fALetterSet->contains(c2)) {
2382 continue;
2383 }
2384
2385 // Rule (8) Numeric x Numeric
2386 if (fNumericSet->contains(c1) &&
2387 fNumericSet->contains(c2)) {
2388 continue;
2389 }
2390
2391 // Rule (9) ALetter x Numeric
2392 if (fALetterSet->contains(c1) &&
2393 fNumericSet->contains(c2)) {
2394 continue;
2395 }
2396
2397 // Rule (10) Numeric x ALetter
2398 if (fNumericSet->contains(c1) &&
2399 fALetterSet->contains(c2)) {
2400 continue;
2401 }
2402
2403 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
46f4442e
A
2404 if (fNumericSet->contains(c0) &&
2405 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1)) &&
73c04bcf
A
2406 fNumericSet->contains(c2)) {
2407 continue;
2408 }
2409
2410 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
2411 if (fNumericSet->contains(c1) &&
46f4442e 2412 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
73c04bcf
A
2413 fNumericSet->contains(c3)) {
2414 continue;
2415 }
2416
2417 // Rule (13) Katakana x Katakana
2418 if (fKatakanaSet->contains(c1) &&
2419 fKatakanaSet->contains(c2)) {
2420 continue;
2421 }
2422
2423 // Rule 13a
2424 if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2425 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2426 fExtendNumLetSet->contains(c2)) {
2427 continue;
2428 }
2429
2430 // Rule 13b
2431 if (fExtendNumLetSet->contains(c1) &&
2432 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2433 fKatakanaSet->contains(c2))) {
2434 continue;
2435 }
2436
2437 // Rule 14. Break found here.
2438 break;
2439 }
2440
2441 breakPos = p2;
2442 return breakPos;
2443}
2444
2445
2446UVector *RBBIWordMonkey::charClasses() {
2447 return fSets;
2448}
2449
2450
2451RBBIWordMonkey::~RBBIWordMonkey() {
2452 delete fSets;
46f4442e
A
2453 delete fCRSet;
2454 delete fLFSet;
2455 delete fNewlineSet;
73c04bcf
A
2456 delete fKatakanaSet;
2457 delete fALetterSet;
46f4442e 2458 delete fMidNumLetSet;
73c04bcf
A
2459 delete fMidLetterSet;
2460 delete fMidNumSet;
2461 delete fNumericSet;
2462 delete fFormatSet;
2463 delete fExtendSet;
2464 delete fExtendNumLetSet;
2465 delete fOtherSet;
2466}
2467
2468
2469
2470
2471//------------------------------------------------------------------------------------------
2472//
2473// class RBBISentMonkey Sentence Break specific implementation
2474// of RBBIMonkeyKind.
2475//
2476//------------------------------------------------------------------------------------------
2477class RBBISentMonkey: public RBBIMonkeyKind {
2478public:
2479 RBBISentMonkey();
2480 virtual ~RBBISentMonkey();
2481 virtual UVector *charClasses();
2482 virtual void setText(const UnicodeString &s);
2483 virtual int32_t next(int32_t i);
2484private:
2485 int moveBack(int posFrom);
2486 int moveForward(int posFrom);
2487 UChar32 cAt(int pos);
2488
2489 UVector *fSets;
2490
2491 UnicodeSet *fSepSet;
2492 UnicodeSet *fFormatSet;
2493 UnicodeSet *fSpSet;
2494 UnicodeSet *fLowerSet;
2495 UnicodeSet *fUpperSet;
2496 UnicodeSet *fOLetterSet;
2497 UnicodeSet *fNumericSet;
2498 UnicodeSet *fATermSet;
46f4442e 2499 UnicodeSet *fSContinueSet;
73c04bcf
A
2500 UnicodeSet *fSTermSet;
2501 UnicodeSet *fCloseSet;
2502 UnicodeSet *fOtherSet;
2503 UnicodeSet *fExtendSet;
2504
2505 const UnicodeString *fText;
2506
2507};
2508
2509RBBISentMonkey::RBBISentMonkey()
2510{
2511 UErrorCode status = U_ZERO_ERROR;
2512
2513 fSets = new UVector(status);
2514
46f4442e
A
2515 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2516 // set and made into character classes of their own. For the monkey impl,
2517 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2518 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2519 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2520 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2521 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2522 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2523 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2524 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2525 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2526 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2527 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2528 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2529 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
73c04bcf
A
2530 fOtherSet = new UnicodeSet();
2531
2532 if(U_FAILURE(status)) {
2533 deferredStatus = status;
2534 return;
2535 }
2536
2537 fOtherSet->complement();
2538 fOtherSet->removeAll(*fSepSet);
2539 fOtherSet->removeAll(*fFormatSet);
2540 fOtherSet->removeAll(*fSpSet);
2541 fOtherSet->removeAll(*fLowerSet);
2542 fOtherSet->removeAll(*fUpperSet);
2543 fOtherSet->removeAll(*fOLetterSet);
2544 fOtherSet->removeAll(*fNumericSet);
2545 fOtherSet->removeAll(*fATermSet);
46f4442e 2546 fOtherSet->removeAll(*fSContinueSet);
73c04bcf
A
2547 fOtherSet->removeAll(*fSTermSet);
2548 fOtherSet->removeAll(*fCloseSet);
2549 fOtherSet->removeAll(*fExtendSet);
2550
46f4442e
A
2551 fSets->addElement(fSepSet, status);
2552 fSets->addElement(fFormatSet, status);
2553 fSets->addElement(fSpSet, status);
2554 fSets->addElement(fLowerSet, status);
2555 fSets->addElement(fUpperSet, status);
2556 fSets->addElement(fOLetterSet, status);
2557 fSets->addElement(fNumericSet, status);
2558 fSets->addElement(fATermSet, status);
2559 fSets->addElement(fSContinueSet, status);
2560 fSets->addElement(fSTermSet, status);
2561 fSets->addElement(fCloseSet, status);
2562 fSets->addElement(fOtherSet, status);
2563 fSets->addElement(fExtendSet, status);
73c04bcf
A
2564
2565 if (U_FAILURE(status)) {
2566 deferredStatus = status;
2567 }
2568}
2569
2570
2571
2572void RBBISentMonkey::setText(const UnicodeString &s) {
2573 fText = &s;
2574}
2575
2576UVector *RBBISentMonkey::charClasses() {
2577 return fSets;
2578}
2579
2580
2581// moveBack() Find the "significant" code point preceding the index i.
2582// Skips over ($Extend | $Format)* .
46f4442e 2583//
73c04bcf
A
2584int RBBISentMonkey::moveBack(int i) {
2585 if (i <= 0) {
2586 return -1;
2587 }
2588 UChar32 c;
2589 int32_t j = i;
2590 do {
2591 j = fText->moveIndex32(j, -1);
2592 c = fText->char32At(j);
2593 }
2594 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2595 return j;
2596
2597 }
2598
2599
2600int RBBISentMonkey::moveForward(int i) {
2601 if (i>=fText->length()) {
2602 return fText->length();
2603 }
2604 UChar32 c;
2605 int32_t j = i;
2606 do {
2607 j = fText->moveIndex32(j, 1);
2608 c = cAt(j);
2609 }
2610 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2611 return j;
2612}
2613
2614UChar32 RBBISentMonkey::cAt(int pos) {
2615 if (pos<0 || pos>=fText->length()) {
2616 return -1;
2617 } else {
2618 return fText->char32At(pos);
2619 }
2620}
2621
2622int32_t RBBISentMonkey::next(int32_t prevPos) {
2623 int p0, p1, p2, p3; // Indices of the significant code points around the
2624 // break position being tested. The candidate break
2625 // location is before p2.
2626
2627 int breakPos = -1;
2628
2629 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2630 UChar32 c;
2631
46f4442e
A
2632 if (U_FAILURE(deferredStatus)) {
2633 return -1;
2634 }
2635
73c04bcf
A
2636 // Prev break at end of string. return DONE.
2637 if (prevPos >= fText->length()) {
2638 return -1;
2639 }
2640 p0 = p1 = p2 = p3 = prevPos;
2641 c3 = fText->char32At(prevPos);
2642 c0 = c1 = c2 = 0;
2643
2644 // Loop runs once per "significant" character position in the input text.
2645 for (;;) {
2646 // Move all of the positions forward in the input string.
2647 p0 = p1; c0 = c1;
2648 p1 = p2; c1 = c2;
2649 p2 = p3; c2 = c3;
46f4442e 2650
73c04bcf
A
2651 // Advancd p3 by X(Extend | Format)* Rule 4
2652 p3 = moveForward(p3);
2653 c3 = cAt(p3);
2654
2655 // Rule (3) CR x LF
2656 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2657 continue;
2658 }
46f4442e 2659
73c04bcf
A
2660 // Rule (4). Sep <break>
2661 if (fSepSet->contains(c1)) {
2662 p2 = p1+1; // Separators don't combine with Extend or Format.
2663 break;
2664 }
2665
2666 if (p2 >= fText->length()) {
2667 // Reached end of string. Always a break position.
2668 break;
2669 }
2670
2671 if (p2 == prevPos) {
2672 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2673 continue;
2674 }
46f4442e 2675
73c04bcf
A
2676 // Rule (6). ATerm x Numeric
2677 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2678 continue;
2679 }
2680
2681 // Rule (7). Upper ATerm x Uppper
2682 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2683 continue;
2684 }
2685
2686 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2687 // Note: STerm | ATerm are added to the negated part of the expression by a
2688 // note to the Unicode 5.0 documents.
2689 int p8 = p1;
2690 while (fSpSet->contains(cAt(p8))) {
2691 p8 = moveBack(p8);
2692 }
2693 while (fCloseSet->contains(cAt(p8))) {
2694 p8 = moveBack(p8);
2695 }
2696 if (fATermSet->contains(cAt(p8))) {
2697 p8=p2;
2698 for (;;) {
2699 c = cAt(p8);
2700 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2701 fLowerSet->contains(c) || fSepSet->contains(c) ||
2702 fATermSet->contains(c) || fSTermSet->contains(c)) {
2703 break;
2704 }
2705 p8 = moveForward(p8);
2706 }
2707 if (fLowerSet->contains(cAt(p8))) {
2708 continue;
2709 }
2710 }
46f4442e
A
2711
2712 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2713 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
73c04bcf
A
2714 p8 = p1;
2715 while (fSpSet->contains(cAt(p8))) {
2716 p8 = moveBack(p8);
2717 }
2718 while (fCloseSet->contains(cAt(p8))) {
2719 p8 = moveBack(p8);
2720 }
2721 c = cAt(p8);
2722 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2723 continue;
2724 }
2725 }
2726
46f4442e 2727 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
73c04bcf
A
2728 int p9 = p1;
2729 while (fCloseSet->contains(cAt(p9))) {
2730 p9 = moveBack(p9);
2731 }
2732 c = cAt(p9);
2733 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2734 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2735 continue;
2736 }
2737 }
2738
46f4442e 2739 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
73c04bcf
A
2740 int p10 = p1;
2741 while (fSpSet->contains(cAt(p10))) {
2742 p10 = moveBack(p10);
2743 }
2744 while (fCloseSet->contains(cAt(p10))) {
2745 p10 = moveBack(p10);
2746 }
2747 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2748 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2749 continue;
2750 }
2751 }
2752
46f4442e 2753 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
73c04bcf 2754 int p11 = p1;
46f4442e
A
2755 if (fSepSet->contains(cAt(p11))) {
2756 p11 = moveBack(p11);
2757 }
73c04bcf
A
2758 while (fSpSet->contains(cAt(p11))) {
2759 p11 = moveBack(p11);
2760 }
2761 while (fCloseSet->contains(cAt(p11))) {
2762 p11 = moveBack(p11);
2763 }
2764 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2765 break;
2766 }
2767
2768 // Rule (12) Any x Any
2769 continue;
2770 }
2771 breakPos = p2;
2772 return breakPos;
2773}
2774
2775RBBISentMonkey::~RBBISentMonkey() {
2776 delete fSets;
2777 delete fSepSet;
2778 delete fFormatSet;
2779 delete fSpSet;
2780 delete fLowerSet;
2781 delete fUpperSet;
2782 delete fOLetterSet;
2783 delete fNumericSet;
2784 delete fATermSet;
46f4442e 2785 delete fSContinueSet;
73c04bcf
A
2786 delete fSTermSet;
2787 delete fCloseSet;
2788 delete fOtherSet;
2789 delete fExtendSet;
2790}
2791
2792
2793
2794//-------------------------------------------------------------------------------------------
2795//
2796// RBBILineMonkey
2797//
2798//-------------------------------------------------------------------------------------------
2799
2800class RBBILineMonkey: public RBBIMonkeyKind {
2801public:
2802 RBBILineMonkey();
2803 virtual ~RBBILineMonkey();
2804 virtual UVector *charClasses();
2805 virtual void setText(const UnicodeString &s);
2806 virtual int32_t next(int32_t i);
2807 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2808private:
2809 UVector *fSets;
2810
2811 UnicodeSet *fBK;
2812 UnicodeSet *fCR;
2813 UnicodeSet *fLF;
2814 UnicodeSet *fCM;
2815 UnicodeSet *fNL;
2816 UnicodeSet *fSG;
2817 UnicodeSet *fWJ;
2818 UnicodeSet *fZW;
2819 UnicodeSet *fGL;
2820 UnicodeSet *fCB;
2821 UnicodeSet *fSP;
2822 UnicodeSet *fB2;
2823 UnicodeSet *fBA;
2824 UnicodeSet *fBB;
2825 UnicodeSet *fHY;
2826 UnicodeSet *fH2;
2827 UnicodeSet *fH3;
2828 UnicodeSet *fCL;
729e4ab9 2829 UnicodeSet *fCP;
73c04bcf
A
2830 UnicodeSet *fEX;
2831 UnicodeSet *fIN;
2832 UnicodeSet *fJL;
2833 UnicodeSet *fJV;
2834 UnicodeSet *fJT;
2835 UnicodeSet *fNS;
2836 UnicodeSet *fOP;
2837 UnicodeSet *fQU;
2838 UnicodeSet *fIS;
2839 UnicodeSet *fNU;
2840 UnicodeSet *fPO;
2841 UnicodeSet *fPR;
2842 UnicodeSet *fSY;
2843 UnicodeSet *fAI;
2844 UnicodeSet *fAL;
4388f060
A
2845 UnicodeSet *fCJ;
2846 UnicodeSet *fHL;
73c04bcf
A
2847 UnicodeSet *fID;
2848 UnicodeSet *fSA;
2849 UnicodeSet *fXX;
2850
2851 BreakIterator *fCharBI;
2852
2853 const UnicodeString *fText;
2854 int32_t *fOrigPositions;
2855
2856 RegexMatcher *fNumberMatcher;
2857 RegexMatcher *fLB11Matcher;
2858};
2859
2860
2861RBBILineMonkey::RBBILineMonkey()
2862{
2863 UErrorCode status = U_ZERO_ERROR;
2864
2865 fSets = new UVector(status);
2866
46f4442e
A
2867 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2868 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2869 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2870 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2871 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2872 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2873 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2874 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2875 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2876 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2877 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2878 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2879 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2880 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2881 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2882 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2883 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
729e4ab9 2884 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
46f4442e
A
2885 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2886 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2887 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2888 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2889 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2890 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2891 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2892 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2893 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2894 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2895 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2896 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2897 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2898 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2899 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
4388f060
A
2900 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2901 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
46f4442e
A
2902 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2903 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2904 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2905 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
73c04bcf
A
2906
2907 if (U_FAILURE(status)) {
2908 deferredStatus = status;
2909 fCharBI = NULL;
2910 fNumberMatcher = NULL;
2911 return;
2912 }
2913
2914 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2915 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2916 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL
2917 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2918
4388f060
A
2919 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
2920
73c04bcf
A
2921 fSets->addElement(fBK, status);
2922 fSets->addElement(fCR, status);
2923 fSets->addElement(fLF, status);
2924 fSets->addElement(fCM, status);
2925 fSets->addElement(fNL, status);
2926 fSets->addElement(fWJ, status);
2927 fSets->addElement(fZW, status);
2928 fSets->addElement(fGL, status);
2929 fSets->addElement(fCB, status);
2930 fSets->addElement(fSP, status);
2931 fSets->addElement(fB2, status);
2932 fSets->addElement(fBA, status);
2933 fSets->addElement(fBB, status);
2934 fSets->addElement(fHY, status);
2935 fSets->addElement(fH2, status);
2936 fSets->addElement(fH3, status);
2937 fSets->addElement(fCL, status);
729e4ab9 2938 fSets->addElement(fCP, status);
73c04bcf
A
2939 fSets->addElement(fEX, status);
2940 fSets->addElement(fIN, status);
2941 fSets->addElement(fJL, status);
2942 fSets->addElement(fJT, status);
2943 fSets->addElement(fJV, status);
2944 fSets->addElement(fNS, status);
2945 fSets->addElement(fOP, status);
2946 fSets->addElement(fQU, status);
2947 fSets->addElement(fIS, status);
2948 fSets->addElement(fNU, status);
2949 fSets->addElement(fPO, status);
2950 fSets->addElement(fPR, status);
2951 fSets->addElement(fSY, status);
2952 fSets->addElement(fAI, status);
2953 fSets->addElement(fAL, status);
4388f060 2954 fSets->addElement(fHL, status);
73c04bcf
A
2955 fSets->addElement(fID, status);
2956 fSets->addElement(fWJ, status);
2957 fSets->addElement(fSA, status);
2958 fSets->addElement(fSG, status);
2959
46f4442e
A
2960 const char *rules =
2961 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2962 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2963 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2964 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
729e4ab9 2965 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
46f4442e
A
2966 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
2967
73c04bcf 2968 fNumberMatcher = new RegexMatcher(
46f4442e 2969 UnicodeString(rules, -1, US_INV), 0, status);
73c04bcf
A
2970
2971 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2972
2973 if (U_FAILURE(status)) {
2974 deferredStatus = status;
2975 }
2976}
2977
2978
2979void RBBILineMonkey::setText(const UnicodeString &s) {
2980 fText = &s;
2981 fCharBI->setText(s);
2982 fNumberMatcher->reset(s);
2983}
2984
2985//
2986// rule9Adjust
2987// Line Break TR rules 9 and 10 implementation.
2988// This deals with combining marks and other sequences that
2989// that must be treated as if they were something other than what they actually are.
2990//
2991// This is factored out into a separate function because it must be applied twice for
2992// each potential break, once to the chars before the position being checked, then
2993// again to the text following the possible break.
2994//
2995void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2996 if (pos == -1) {
2997 // Invalid initial position. Happens during the warmup iteration of the
2998 // main loop in next().
2999 return;
3000 }
3001
3002 int32_t nPos = *nextPos;
3003
3004 // LB 9 Keep combining sequences together.
3005 // advance over any CM class chars. Note that Line Break CM is different
3006 // from the normal Grapheme Extend property.
3007 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3008 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3009 for (;;) {
3010 *nextChar = fText->char32At(nPos);
3011 if (!fCM->contains(*nextChar)) {
3012 break;
3013 }
3014 nPos = fText->moveIndex32(nPos, 1);
3015 }
3016 }
3017
3018
3019 // LB 9 Treat X CM* as if it were x.
3020 // No explicit action required.
3021
3022 // LB 10 Treat any remaining combining mark as AL
3023 if (fCM->contains(*posChar)) {
3024 *posChar = 0x41; // thisChar = 'A';
3025 }
3026
3027 // Push the updated nextPos and nextChar back to our caller.
3028 // This only makes a difference if posChar got bigger by consuming a
3029 // combining sequence.
3030 *nextPos = nPos;
3031 *nextChar = fText->char32At(nPos);
3032}
3033
3034
3035
3036int32_t RBBILineMonkey::next(int32_t startPos) {
3037 UErrorCode status = U_ZERO_ERROR;
3038 int32_t pos; // Index of the char following a potential break position
3039 UChar32 thisChar; // Character at above position "pos"
3040
3041 int32_t prevPos; // Index of the char preceding a potential break position
3042 UChar32 prevChar; // Character at above position. Note that prevChar
3043 // and thisChar may not be adjacent because combining
3044 // characters between them will be ignored.
3045
4388f060
A
3046 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
3047 UChar32 prevCharX2;
3048
73c04bcf
A
3049 int32_t nextPos; // Index of the next character following pos.
3050 // Usually skips over combining marks.
3051 int32_t nextCPPos; // Index of the code point following "pos."
3052 // May point to a combining mark.
3053 int32_t tPos; // temp value.
3054 UChar32 c;
3055
46f4442e
A
3056 if (U_FAILURE(deferredStatus)) {
3057 return -1;
3058 }
3059
73c04bcf
A
3060 if (startPos >= fText->length()) {
3061 return -1;
3062 }
3063
3064
3065 // Initial values for loop. Loop will run the first time without finding breaks,
3066 // while the invalid values shift out and the "this" and
3067 // "prev" positions are filled in with good values.
4388f060
A
3068 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
3069 thisChar = prevChar = prevCharX2 = 0;
73c04bcf
A
3070 nextPos = nextCPPos = startPos;
3071
3072
3073 // Loop runs once per position in the test text, until a break position
3074 // is found.
3075 for (;;) {
4388f060
A
3076 prevPosX2 = prevPos;
3077 prevCharX2 = prevChar;
3078
73c04bcf
A
3079 prevPos = pos;
3080 prevChar = thisChar;
3081
3082 pos = nextPos;
3083 thisChar = fText->char32At(pos);
3084
3085 nextCPPos = fText->moveIndex32(pos, 1);
3086 nextPos = nextCPPos;
3087
3088 // Rule LB2 - Break at end of text.
3089 if (pos >= fText->length()) {
3090 break;
3091 }
3092
3093 // Rule LB 9 - adjust for combining sequences.
3094 // We do this one out-of-order because the adjustment does not change anything
3095 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3096 // be applied.
3097 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
3098 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3099 c = fText->char32At(nextPos);
3100 rule9Adjust(pos, &thisChar, &nextPos, &c);
3101
3102 // If the loop is still warming up - if we haven't shifted the initial
3103 // -1 positions out of prevPos yet - loop back to advance the
3104 // position in the input without any further looking for breaks.
3105 if (prevPos == -1) {
3106 continue;
3107 }
46f4442e 3108
73c04bcf
A
3109 // LB 4 Always break after hard line breaks,
3110 if (fBK->contains(prevChar)) {
3111 break;
3112 }
3113
3114 // LB 5 Break after CR, LF, NL, but not inside CR LF
3115 if (prevChar == 0x0d && thisChar == 0x0a) {
3116 continue;
3117 }
3118 if (prevChar == 0x0d ||
3119 prevChar == 0x0a ||
3120 prevChar == 0x85) {
3121 break;
3122 }
3123
3124 // LB 6 Don't break before hard line breaks
3125 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3126 fBK->contains(thisChar)) {
3127 continue;
3128 }
3129
3130
3131 // LB 7 Don't break before spaces or zero-width space.
3132 if (fSP->contains(thisChar)) {
3133 continue;
3134 }
3135
3136 if (fZW->contains(thisChar)) {
3137 continue;
3138 }
3139
3140 // LB 8 Break after zero width space
3141 if (fZW->contains(prevChar)) {
3142 break;
3143 }
3144
3145 // LB 9, 10 Already done, at top of loop.
3146 //
3147
3148
3149 // LB 11 Do not break before or after WORD JOINER and related characters.
3150 // x WJ
3151 // WJ x
3152 //
3153 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3154 continue;
3155 }
3156
3157 // LB 12
73c04bcf 3158 // GL x
46f4442e 3159 if (fGL->contains(prevChar)) {
73c04bcf
A
3160 continue;
3161 }
3162
46f4442e
A
3163 // LB 12a
3164 // [^SP BA HY] x GL
3165 if (!(fSP->contains(prevChar) ||
3166 fBA->contains(prevChar) ||
3167 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3168 continue;
3169 }
3170
3171
73c04bcf
A
3172
3173 // LB 13 Don't break before closings.
729e4ab9 3174 // NU x CL, NU x CP and NU x IS are not matched here so that they will
73c04bcf
A
3175 // fall into LB 17 and the more general number regular expression.
3176 //
729e4ab9
A
3177 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3178 (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3179 fEX->contains(thisChar) ||
3180 (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3181 (!fNU->contains(prevChar) && fSY->contains(thisChar))) {
73c04bcf
A
3182 continue;
3183 }
3184
3185 // LB 14 Don't break after OP SP*
3186 // Scan backwards, checking for this sequence.
3187 // The OP char could include combining marks, so we actually check for
3188 // OP CM* SP*
3189 // Another Twist: The Rule 67 fixes may have changed a SP CM
3190 // sequence into a ID char, so before scanning back through spaces,
3191 // verify that prevChar is indeed a space. The prevChar variable
3192 // may differ from fText[prevPos]
3193 tPos = prevPos;
3194 if (fSP->contains(prevChar)) {
3195 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3196 tPos=fText->moveIndex32(tPos, -1);
3197 }
3198 }
3199 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3200 tPos=fText->moveIndex32(tPos, -1);
3201 }
3202 if (fOP->contains(fText->char32At(tPos))) {
3203 continue;
3204 }
3205
3206
3207 // LB 15 QU SP* x OP
3208 if (fOP->contains(thisChar)) {
3209 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3210 int tPos = prevPos;
3211 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3212 tPos = fText->moveIndex32(tPos, -1);
3213 }
3214 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3215 tPos = fText->moveIndex32(tPos, -1);
3216 }
3217 if (fQU->contains(fText->char32At(tPos))) {
3218 continue;
3219 }
3220 }
3221
3222
3223
729e4ab9
A
3224 // LB 16 (CL | CP) SP* x NS
3225 // Scan backwards for SP* CM* (CL | CP)
73c04bcf
A
3226 if (fNS->contains(thisChar)) {
3227 int tPos = prevPos;
3228 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3229 tPos = fText->moveIndex32(tPos, -1);
3230 }
3231 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3232 tPos = fText->moveIndex32(tPos, -1);
3233 }
729e4ab9 3234 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
73c04bcf
A
3235 continue;
3236 }
3237 }
3238
3239
3240 // LB 17 B2 SP* x B2
3241 if (fB2->contains(thisChar)) {
3242 // Scan backwards, checking for the B2 CM* SP* sequence.
3243 tPos = prevPos;
3244 if (fSP->contains(prevChar)) {
3245 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3246 tPos=fText->moveIndex32(tPos, -1);
3247 }
3248 }
3249 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3250 tPos=fText->moveIndex32(tPos, -1);
3251 }
3252 if (fB2->contains(fText->char32At(tPos))) {
3253 continue;
3254 }
3255 }
3256
46f4442e 3257
73c04bcf
A
3258 // LB 18 break after space
3259 if (fSP->contains(prevChar)) {
3260 break;
3261 }
3262
3263 // LB 19
3264 // x QU
3265 // QU x
3266 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3267 continue;
3268 }
3269
3270 // LB 20 Break around a CB
3271 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3272 break;
3273 }
3274
3275 // LB 21
3276 if (fBA->contains(thisChar) ||
3277 fHY->contains(thisChar) ||
3278 fNS->contains(thisChar) ||
3279 fBB->contains(prevChar) ) {
3280 continue;
3281 }
3282
4388f060
A
3283 // LB 21a
3284 // HL (HY | BA) x
3285 if (fHL->contains(prevCharX2) &&
3286 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3287 continue;
3288 }
3289
73c04bcf 3290 // LB 22
729e4ab9 3291 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
4388f060 3292 (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
729e4ab9
A
3293 (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3294 (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3295 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
73c04bcf
A
3296 continue;
3297 }
3298
3299
3300 // LB 23 ID x PO
3301 // AL x NU
4388f060 3302 // HL x NU
73c04bcf 3303 // NU x AL
729e4ab9
A
3304 if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3305 (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
4388f060
A
3306 (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3307 (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3308 (fNU->contains(prevChar) && fHL->contains(thisChar)) ) {
73c04bcf
A
3309 continue;
3310 }
3311
3312 // LB 24 Do not break between prefix and letters or ideographs.
3313 // PR x ID
4388f060
A
3314 // PR x (AL | HL)
3315 // PO x (AL | HL)
729e4ab9 3316 if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
4388f060
A
3317 (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3318 (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar)))) {
73c04bcf
A
3319 continue;
3320 }
46f4442e
A
3321
3322
3323
73c04bcf
A
3324 // LB 25 Numbers
3325 if (fNumberMatcher->lookingAt(prevPos, status)) {
3326 if (U_FAILURE(status)) {
3327 break;
3328 }
3329 // Matched a number. But could have been just a single digit, which would
3330 // not represent a "no break here" between prevChar and thisChar
3331 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3332 if (numEndIdx > pos) {
3333 // Number match includes at least our two chars being checked
3334 if (numEndIdx > nextPos) {
3335 // Number match includes additional chars. Update pos and nextPos
3336 // so that next loop iteration will continue at the end of the number,
3337 // checking for breaks between last char in number & whatever follows.
3338 pos = nextPos = numEndIdx;
3339 do {
3340 pos = fText->moveIndex32(pos, -1);
3341 thisChar = fText->char32At(pos);
3342 } while (fCM->contains(thisChar));
3343 }
3344 continue;
3345 }
3346 }
3347
3348
3349 // LB 26 Do not break a Korean syllable.
3350 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3351 fJV->contains(thisChar) ||
3352 fH2->contains(thisChar) ||
3353 fH3->contains(thisChar))) {
3354 continue;
3355 }
3356
3357 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3358 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3359 continue;
3360 }
3361
3362 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3363 fJT->contains(thisChar)) {
3364 continue;
3365 }
3366
3367 // LB 27 Treat a Korean Syllable Block the same as ID.
3368 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3369 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3370 fIN->contains(thisChar)) {
3371 continue;
3372 }
3373 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3374 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3375 fPO->contains(thisChar)) {
3376 continue;
3377 }
3378 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3379 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3380 continue;
3381 }
3382
3383
3384
46f4442e 3385 // LB 28 Do not break between alphabetics ("at").
4388f060 3386 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
73c04bcf
A
3387 continue;
3388 }
3389
3390 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
4388f060 3391 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
73c04bcf
A
3392 continue;
3393 }
3394
729e4ab9
A
3395 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3396 // (AL | NU) x OP
3397 // CP x (AL | NU)
4388f060 3398 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
729e4ab9
A
3399 continue;
3400 }
4388f060 3401 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
729e4ab9
A
3402 continue;
3403 }
3404
73c04bcf
A
3405 // LB 31 Break everywhere else
3406 break;
3407
3408 }
3409
3410 return pos;
3411}
3412
3413
3414UVector *RBBILineMonkey::charClasses() {
3415 return fSets;
3416}
3417
3418
3419RBBILineMonkey::~RBBILineMonkey() {
3420 delete fSets;
3421
3422 delete fBK;
3423 delete fCR;
3424 delete fLF;
3425 delete fCM;
3426 delete fNL;
3427 delete fWJ;
3428 delete fZW;
3429 delete fGL;
3430 delete fCB;
3431 delete fSP;
3432 delete fB2;
3433 delete fBA;
3434 delete fBB;
3435 delete fHY;
3436 delete fH2;
3437 delete fH3;
3438 delete fCL;
729e4ab9 3439 delete fCP;
73c04bcf
A
3440 delete fEX;
3441 delete fIN;
3442 delete fJL;
3443 delete fJV;
3444 delete fJT;
3445 delete fNS;
3446 delete fOP;
3447 delete fQU;
3448 delete fIS;
3449 delete fNU;
3450 delete fPO;
3451 delete fPR;
3452 delete fSY;
3453 delete fAI;
3454 delete fAL;
4388f060
A
3455 delete fCJ;
3456 delete fHL;
73c04bcf
A
3457 delete fID;
3458 delete fSA;
3459 delete fSG;
3460 delete fXX;
3461
3462 delete fCharBI;
3463 delete fNumberMatcher;
3464}
3465
3466
3467//-------------------------------------------------------------------------------------------
3468//
3469// TestMonkey
3470//
3471// params
3472// seed=nnnnn Random number starting seed.
3473// Setting the seed allows errors to be reproduced.
3474// loop=nnn Looping count. Controls running time.
3475// -1: run forever.
3476// 0 or greater: run length.
3477//
3478// type = char | word | line | sent | title
3479//
3480//-------------------------------------------------------------------------------------------
3481
3482static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3483 int32_t val = defaultVal;
3484 name.append(" *= *(-?\\d+)");
3485 UErrorCode status = U_ZERO_ERROR;
3486 RegexMatcher m(name, params, 0, status);
3487 if (m.find()) {
3488 // The param exists. Convert the string to an int.
3489 char valString[100];
3490 int32_t paramLength = m.end(1, status) - m.start(1, status);
3491 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3492 paramLength = (int32_t)(sizeof(valString)-2);
3493 }
3494 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3495 val = strtol(valString, NULL, 10);
3496
3497 // Delete this parameter from the params string.
3498 m.reset();
3499 params = m.replaceFirst("", status);
3500 }
3501 U_ASSERT(U_SUCCESS(status));
3502 return val;
3503}
3504#endif
3505
3506static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3507 BreakIterator *bi,
3508 int expected[],
3509 int expectedcount)
3510{
3511 int count = 0;
3512 int i = 0;
3513 int forward[50];
3514 bi->setText(ustr);
3515 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3516 forward[count] = i;
3517 if (count < expectedcount && expected[count] != i) {
3518 test->errln("break forward test failed: expected %d but got %d",
3519 expected[count], i);
3520 break;
3521 }
3522 count ++;
3523 }
3524 if (count != expectedcount) {
3525 printStringBreaks(ustr, expected, expectedcount);
3526 test->errln("break forward test failed: missed %d match",
3527 expectedcount - count);
3528 return;
3529 }
3530 // testing boundaries
3531 for (i = 1; i < expectedcount; i ++) {
3532 int j = expected[i - 1];
3533 if (!bi->isBoundary(j)) {
3534 printStringBreaks(ustr, expected, expectedcount);
3535 test->errln("isBoundary() failed. Expected boundary at position %d", j);
3536 return;
3537 }
3538 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3539 if (bi->isBoundary(j)) {
3540 printStringBreaks(ustr, expected, expectedcount);
3541 test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
3542 return;
3543 }
3544 }
3545 }
3546
3547 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3548 count --;
3549 if (forward[count] != i) {
3550 test->errln("happy break test previous() failed: expected %d but got %d",
3551 forward[count], i);
3552 break;
3553 }
3554 }
3555 if (count != 0) {
3556 printStringBreaks(ustr, expected, expectedcount);
3557 test->errln("break test previous() failed: missed a match");
3558 return;
3559 }
3560
3561 // testing preceding
3562 for (i = 0; i < expectedcount - 1; i ++) {
3563 // int j = expected[i] + 1;
3564 int j = ustr.moveIndex32(expected[i], 1);
3565 for (; j <= expected[i + 1]; j ++) {
3566 if (bi->preceding(j) != expected[i]) {
3567 printStringBreaks(ustr, expected, expectedcount);
3568 test->errln("preceding(): Not expecting boundary at position %d", j);
3569 return;
3570 }
3571 }
3572 }
3573}
3574
3575void RBBITest::TestWordBreaks(void)
3576{
3577#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3578
73c04bcf
A
3579 Locale locale("en");
3580 UErrorCode status = U_ZERO_ERROR;
3581 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3582 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
73c04bcf
A
3583 static const char *strlist[] =
3584 {
3585 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3586 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
46f4442e 3587 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
73c04bcf
A
3588 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3589 "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3590 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3591 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3592 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3593 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3594 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3595 "\\u2027\\U000e0067\\u0a47\\u00b7",
3596 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3597 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3598 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3599 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3600 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3601 "\\u0027\\u11af\\U000e0057\\u0602",
3602 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3603 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3604 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3605 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
46f4442e 3606 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
73c04bcf
A
3607 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3608 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3609 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3610 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3611 "\\u58f4\\U000e0049\\u20e7\\u2027",
3612 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3613 "\\ua183\\u102d\\u0bec\\u003a",
3614 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3615 "\\u003a\\u0e57\\u0fad\\u002e",
3616 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3617 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3618 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3619 "\\u003a\\u0664\\u00b7\\u1fba",
3620 "\\u003b\\u0027\\u00b7\\u47a3",
3621 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3622 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3623 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3624 };
3625 int loop;
3626 if (U_FAILURE(status)) {
729e4ab9 3627 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
73c04bcf
A
3628 return;
3629 }
3630 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3631 // printf("looping %d\n", loop);
46f4442e 3632 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
73c04bcf
A
3633 // RBBICharMonkey monkey;
3634 RBBIWordMonkey monkey;
3635
3636 int expected[50];
3637 int expectedcount = 0;
3638
3639 monkey.setText(ustr);
3640 int i;
3641 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3642 expected[expectedcount ++] = i;
3643 }
3644
3645 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3646 }
3647 delete bi;
3648#endif
3649}
3650
3651void RBBITest::TestWordBoundary(void)
3652{
3653 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3654 Locale locale("en");
3655 UErrorCode status = U_ZERO_ERROR;
3656 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3657 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3658 UChar str[50];
3659 static const char *strlist[] =
3660 {
3661 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3662 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3663 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3664 "\\u2027\\U000e0067\\u0a47\\u00b7",
3665 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3666 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3667 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3668 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3669 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3670 "\\u0027\\u11af\\U000e0057\\u0602",
3671 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3672 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3673 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3674 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3675 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3676 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3677 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3678 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3679 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3680 "\\u58f4\\U000e0049\\u20e7\\u2027",
3681 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3682 "\\ua183\\u102d\\u0bec\\u003a",
3683 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3684 "\\u003a\\u0e57\\u0fad\\u002e",
3685 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3686 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3687 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3688 "\\u003a\\u0664\\u00b7\\u1fba",
3689 "\\u003b\\u0027\\u00b7\\u47a3",
3690 };
3691 int loop;
3692 if (U_FAILURE(status)) {
729e4ab9 3693 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
73c04bcf
A
3694 return;
3695 }
3696 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3697 // printf("looping %d\n", loop);
3698 u_unescape(strlist[loop], str, 20);
3699 UnicodeString ustr(str);
3700 int forward[50];
3701 int count = 0;
3702
3703 bi->setText(ustr);
3704 int prev = 0;
3705 int i;
3706 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3707 forward[count ++] = i;
3708 if (i > prev) {
3709 int j;
3710 for (j = prev + 1; j < i; j ++) {
3711 if (bi->isBoundary(j)) {
3712 printStringBreaks(ustr, forward, count);
3713 errln("happy boundary test failed: expected %d not a boundary",
3714 j);
3715 return;
3716 }
3717 }
3718 }
3719 if (!bi->isBoundary(i)) {
3720 printStringBreaks(ustr, forward, count);
3721 errln("happy boundary test failed: expected %d a boundary",
3722 i);
3723 return;
3724 }
3725 prev = i;
3726 }
3727 }
3728 delete bi;
3729}
3730
3731void RBBITest::TestLineBreaks(void)
3732{
3733#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3734 Locale locale("en");
3735 UErrorCode status = U_ZERO_ERROR;
3736 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3737 const int32_t STRSIZE = 50;
3738 UChar str[STRSIZE];
3739 static const char *strlist[] =
3740 {
3741 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3742 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3743 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3744 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3745 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3746 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3747 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3748 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3749 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3750 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3751 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3752 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3753 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3754 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3755 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3756 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3757 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3758 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3759 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3760 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3761 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3762 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3763 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3764 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3765 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3766 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3767 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3768 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3769 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3770 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3771 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3772 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3773 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3774 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3775 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3776 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3777 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3778 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3779 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3780 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3781 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3782 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3783 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3784 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3785 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3786 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3787 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3788 };
3789 int loop;
3790 TEST_ASSERT_SUCCESS(status);
3791 if (U_FAILURE(status)) {
3792 return;
3793 }
3794 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3795 // printf("looping %d\n", loop);
3796 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3797 if (t >= STRSIZE) {
3798 TEST_ASSERT(FALSE);
3799 continue;
3800 }
3801
46f4442e 3802
73c04bcf
A
3803 UnicodeString ustr(str);
3804 RBBILineMonkey monkey;
3805 if (U_FAILURE(monkey.deferredStatus)) {
3806 continue;
3807 }
3808
3809 const int EXPECTEDSIZE = 50;
3810 int expected[EXPECTEDSIZE];
3811 int expectedcount = 0;
3812
3813 monkey.setText(ustr);
3814 int i;
3815 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3816 if (expectedcount >= EXPECTEDSIZE) {
3817 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3818 return;
3819 }
3820 expected[expectedcount ++] = i;
3821 }
3822
3823 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3824 }
3825 delete bi;
3826#endif
3827}
3828
3829void RBBITest::TestSentBreaks(void)
3830{
3831#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3832 Locale locale("en");
3833 UErrorCode status = U_ZERO_ERROR;
3834 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3835 UChar str[200];
3836 static const char *strlist[] =
3837 {
3838 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3839 "This\n",
3840 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3841 "\"Sentence ending with a quote.\" Bye.",
3842 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3843 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3844 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3845 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3846 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3847 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3848 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3849 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3850 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3851 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3852 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3853 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3854 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3855 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3856 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3857 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3858 };
3859 int loop;
3860 if (U_FAILURE(status)) {
729e4ab9 3861 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
73c04bcf
A
3862 return;
3863 }
3864 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3865 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3866 UnicodeString ustr(str);
3867
3868 RBBISentMonkey monkey;
3869 if (U_FAILURE(monkey.deferredStatus)) {
3870 continue;
3871 }
3872
3873 const int EXPECTEDSIZE = 50;
3874 int expected[EXPECTEDSIZE];
3875 int expectedcount = 0;
3876
3877 monkey.setText(ustr);
3878 int i;
3879 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3880 if (expectedcount >= EXPECTEDSIZE) {
3881 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3882 return;
3883 }
3884 expected[expectedcount ++] = i;
3885 }
3886
3887 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3888 }
3889 delete bi;
3890#endif
3891}
3892
3893void RBBITest::TestMonkey(char *params) {
3894#if !UCONFIG_NO_REGULAR_EXPRESSIONS
3895
3896 UErrorCode status = U_ZERO_ERROR;
3897 int32_t loopCount = 500;
3898 int32_t seed = 1;
3899 UnicodeString breakType = "all";
3900 Locale locale("en");
3901 UBool useUText = FALSE;
3902
3903 if (quick == FALSE) {
3904 loopCount = 10000;
3905 }
3906
3907 if (params) {
3908 UnicodeString p(params);
3909 loopCount = getIntParam("loop", p, loopCount);
3910 seed = getIntParam("seed", p, seed);
3911
3912 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3913 if (m.find()) {
3914 breakType = m.group(1, status);
3915 m.reset();
3916 p = m.replaceFirst("", status);
3917 }
3918
3919 RegexMatcher u(" *utext", p, 0, status);
3920 if (u.find()) {
3921 useUText = TRUE;
3922 u.reset();
3923 p = u.replaceFirst("", status);
3924 }
3925
3926
3927 // m.reset(p);
46f4442e 3928 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
73c04bcf
A
3929 // Each option is stripped out of the option string as it is processed.
3930 // All options have been checked. The option string should have been completely emptied..
3931 char buf[100];
3932 p.extract(buf, sizeof(buf), NULL, status);
3933 buf[sizeof(buf)-1] = 0;
3934 errln("Unrecognized or extra parameter: %s\n", buf);
3935 return;
3936 }
3937
3938 }
3939
3940 if (breakType == "char" || breakType == "all") {
3941 RBBICharMonkey m;
3942 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3943 if (U_SUCCESS(status)) {
3944 RunMonkey(bi, m, "char", seed, loopCount, useUText);
3945 if (breakType == "all" && useUText==FALSE) {
3946 // Also run a quick test with UText when "all" is specified
3947 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3948 }
3949 }
3950 else {
729e4ab9 3951 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
73c04bcf
A
3952 }
3953 delete bi;
3954 }
3955
3956 if (breakType == "word" || breakType == "all") {
3957 logln("Word Break Monkey Test");
3958 RBBIWordMonkey m;
3959 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3960 if (U_SUCCESS(status)) {
3961 RunMonkey(bi, m, "word", seed, loopCount, useUText);
3962 }
3963 else {
729e4ab9 3964 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
73c04bcf
A
3965 }
3966 delete bi;
3967 }
3968
3969 if (breakType == "line" || breakType == "all") {
3970 logln("Line Break Monkey Test");
3971 RBBILineMonkey m;
3972 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3973 if (loopCount >= 10) {
3974 loopCount = loopCount / 5; // Line break runs slower than the others.
3975 }
3976 if (U_SUCCESS(status)) {
3977 RunMonkey(bi, m, "line", seed, loopCount, useUText);
3978 }
3979 else {
729e4ab9 3980 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
73c04bcf
A
3981 }
3982 delete bi;
3983 }
3984
46f4442e 3985 if (breakType == "sent" || breakType == "all" ) {
73c04bcf
A
3986 logln("Sentence Break Monkey Test");
3987 RBBISentMonkey m;
3988 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3989 if (loopCount >= 10) {
3990 loopCount = loopCount / 10; // Sentence runs slower than the other break types
3991 }
3992 if (U_SUCCESS(status)) {
3993 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3994 }
3995 else {
729e4ab9 3996 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
73c04bcf
A
3997 }
3998 delete bi;
3999 }
4000
4001#endif
4002}
4003
4004//
4005// Run a RBBI monkey test. Common routine, for all break iterator types.
4006// Parameters:
4007// bi - the break iterator to use
4008// mk - MonkeyKind, abstraction for obtaining expected results
4009// name - Name of test (char, word, etc.) for use in error messages
4010// seed - Seed for starting random number generator (parameter from user)
4011// numIterations
4012//
46f4442e 4013void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
73c04bcf
A
4014 int32_t numIterations, UBool useUText) {
4015
4016#if !UCONFIG_NO_REGULAR_EXPRESSIONS
4017
4018 const int32_t TESTSTRINGLEN = 500;
4019 UnicodeString testText;
4020 int32_t numCharClasses;
4021 UVector *chClasses;
4022 int expected[TESTSTRINGLEN*2 + 1];
4023 int expectedCount = 0;
4024 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4025 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4026 char reverseBreaks[TESTSTRINGLEN*2+1];
4027 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4028 char followingBreaks[TESTSTRINGLEN*2+1];
4029 char precedingBreaks[TESTSTRINGLEN*2+1];
4030 int i;
4031 int loopCount = 0;
4032
4033 m_seed = seed;
4034
4035 numCharClasses = mk.charClasses()->size();
4036 chClasses = mk.charClasses();
4037
4038 // Check for errors that occured during the construction of the MonkeyKind object.
4039 // Can't report them where they occured because errln() is a method coming from intlTest,
4040 // and is not visible outside of RBBITest :-(
4041 if (U_FAILURE(mk.deferredStatus)) {
4042 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4043 return;
4044 }
4045
4046 // Verify that the character classes all have at least one member.
4047 for (i=0; i<numCharClasses; i++) {
4048 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4049 if (s == NULL || s->size() == 0) {
4050 errln("Character Class #%d is null or of zero size.", i);
4051 return;
4052 }
4053 }
4054
4055 while (loopCount < numIterations || numIterations == -1) {
4056 if (numIterations == -1 && loopCount % 10 == 0) {
4057 // If test is running in an infinite loop, display a periodic tic so
4058 // we can tell that it is making progress.
4059 fprintf(stderr, ".");
4060 }
4061 // Save current random number seed, so that we can recreate the random numbers
4062 // for this loop iteration in event of an error.
4063 seed = m_seed;
4064
4065 // Populate a test string with data.
4066 testText.truncate(0);
4067 for (i=0; i<TESTSTRINGLEN; i++) {
4068 int32_t aClassNum = m_rand() % numCharClasses;
4069 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4070 int32_t charIdx = m_rand() % classSet->size();
4071 UChar32 c = classSet->charAt(charIdx);
4072 if (c < 0) { // TODO: deal with sets containing strings.
4073 errln("c < 0");
4074 break;
4075 }
4076 testText.append(c);
4077 }
4078
4079 // Calculate the expected results for this test string.
4080 mk.setText(testText);
4081 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4082 expectedBreaks[0] = 1;
4083 int32_t breakPos = 0;
4084 expectedCount = 0;
4085 for (;;) {
4086 breakPos = mk.next(breakPos);
4087 if (breakPos == -1) {
4088 break;
4089 }
4090 if (breakPos > testText.length()) {
4091 errln("breakPos > testText.length()");
4092 }
4093 expectedBreaks[breakPos] = 1;
4094 U_ASSERT(expectedCount<testText.length());
4095 expected[expectedCount ++] = breakPos;
4096 }
4097
4098 // Find the break positions using forward iteration
4099 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4100 if (useUText) {
4101 UErrorCode status = U_ZERO_ERROR;
4102 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4103 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4104 bi->setText(testUText, status);
4105 TEST_ASSERT_SUCCESS(status);
4106 utext_close(testUText); // The break iterator does a shallow clone of the UText
4107 // This UText can be closed immediately, so long as the
4108 // testText string continues to exist.
4109 } else {
4110 bi->setText(testText);
4111 }
4112
4113 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4114 if (i < 0 || i > testText.length()) {
4115 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4116 break;
4117 }
4118 forwardBreaks[i] = 1;
4119 }
4120
4121 // Find the break positions using reverse iteration
4122 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4123 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4124 if (i < 0 || i > testText.length()) {
4125 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4126 break;
4127 }
4128 reverseBreaks[i] = 1;
4129 }
4130
4131 // Find the break positions using isBoundary() tests.
4132 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4133 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4134 for (i=0; i<=testText.length(); i++) {
4135 isBoundaryBreaks[i] = bi->isBoundary(i);
4136 }
4137
4138
4139 // Find the break positions using the following() function.
4140 // printf(".");
4141 memset(followingBreaks, 0, sizeof(followingBreaks));
4142 int32_t lastBreakPos = 0;
4143 followingBreaks[0] = 1;
4144 for (i=0; i<testText.length(); i++) {
4145 breakPos = bi->following(i);
4146 if (breakPos <= i ||
4147 breakPos < lastBreakPos ||
4148 breakPos > testText.length() ||
729e4ab9 4149 (breakPos > lastBreakPos && lastBreakPos > i)) {
73c04bcf
A
4150 errln("%s break monkey test: "
4151 "Out of range value returned by BreakIterator::following().\n"
4152 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4153 name, seed, i, breakPos, lastBreakPos);
4154 break;
4155 }
4156 followingBreaks[breakPos] = 1;
4157 lastBreakPos = breakPos;
4158 }
4159
4160 // Find the break positions using the preceding() function.
46f4442e 4161 memset(precedingBreaks, 0, sizeof(precedingBreaks));
73c04bcf
A
4162 lastBreakPos = testText.length();
4163 precedingBreaks[testText.length()] = 1;
4164 for (i=testText.length(); i>0; i--) {
4165 breakPos = bi->preceding(i);
4166 if (breakPos >= i ||
4167 breakPos > lastBreakPos ||
729e4ab9
A
4168 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4169 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
73c04bcf
A
4170 errln("%s break monkey test: "
4171 "Out of range value returned by BreakIterator::preceding().\n"
4172 "index=%d; prev returned %d; lastBreak=%d" ,
4173 name, i, breakPos, lastBreakPos);
46f4442e
A
4174 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4175 precedingBreaks[i] = 2; // Forces an error.
4176 }
73c04bcf 4177 } else {
46f4442e
A
4178 if (breakPos >= 0) {
4179 precedingBreaks[breakPos] = 1;
4180 }
73c04bcf
A
4181 lastBreakPos = breakPos;
4182 }
4183 }
4184
4185 // Compare the expected and actual results.
4186 for (i=0; i<=testText.length(); i++) {
4187 const char *errorType = NULL;
4188 if (forwardBreaks[i] != expectedBreaks[i]) {
4189 errorType = "next()";
4190 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4191 errorType = "previous()";
4192 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4193 errorType = "isBoundary()";
4194 } else if (followingBreaks[i] != expectedBreaks[i]) {
4195 errorType = "following()";
4196 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4197 errorType = "preceding()";
4198 }
4199
4200
4201 if (errorType != NULL) {
4202 // Format a range of the test text that includes the failure as
4203 // a data item that can be included in the rbbi test data file.
4204
4205 // Start of the range is the last point where expected and actual results
4206 // both agreed that there was a break position.
4207 int startContext = i;
4208 int32_t count = 0;
4209 for (;;) {
4210 if (startContext==0) { break; }
4211 startContext --;
4212 if (expectedBreaks[startContext] != 0) {
4213 if (count == 2) break;
4214 count ++;
4215 }
4216 }
4217
4218 // End of range is two expected breaks past the start position.
4219 int endContext = i + 1;
4220 int ci;
4221 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4222 for (;;) {
4223 if (endContext >= testText.length()) {break;}
4224 if (expectedBreaks[endContext-1] != 0) {
4225 if (count == 0) break;
4226 count --;
4227 }
4228 endContext ++;
4229 }
4230 }
4231
4232 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4233 UnicodeString errorText = "<data>";
4234 /***if (strcmp(errorType, "next()") == 0) {
4235 startContext = 0;
4236 endContext = testText.length();
4237
4238 printStringBreaks(testText, expected, expectedCount);
4239 }***/
4240
4241 for (ci=startContext; ci<endContext;) {
4242 UnicodeString hexChars("0123456789abcdef");
4243 UChar32 c;
4244 int bn;
4245 c = testText.char32At(ci);
4246 if (ci == i) {
4247 // This is the location of the error.
4248 errorText.append("<?>");
4249 } else if (expectedBreaks[ci] != 0) {
4250 // This a non-error expected break position.
4251 errorText.append("\\");
4252 }
4253 if (c < 0x10000) {
4254 errorText.append("\\u");
4255 for (bn=12; bn>=0; bn-=4) {
4256 errorText.append(hexChars.charAt((c>>bn)&0xf));
4257 }
4258 } else {
4259 errorText.append("\\U");
4260 for (bn=28; bn>=0; bn-=4) {
4261 errorText.append(hexChars.charAt((c>>bn)&0xf));
4262 }
4263 }
4264 ci = testText.moveIndex32(ci, 1);
4265 }
4266 errorText.append("\\");
4267 errorText.append("</data>\n");
4268
4269 // Output the error
4270 char charErrorTxt[500];
4271 UErrorCode status = U_ZERO_ERROR;
4272 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4273 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4388f060
A
4274 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4275
4276 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4277 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
73c04bcf
A
4278 errorType, seed, i, charErrorTxt);
4279 break;
4280 }
4281 }
4282
4283 loopCount++;
4284 }
4285#endif
4286}
4287
729e4ab9
A
4288
4289// Bug 5532. UTF-8 based UText fails in dictionary code.
4290// This test checks the initial patch,
4291// which is to just keep it from crashing. Correct word boundaries
4292// await a proper fix to the dictionary code.
4293//
4294void RBBITest::TestBug5532(void) {
4295 // Text includes a mixture of Thai and Latin.
4296 const unsigned char utf8Data[] = {
4297 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4298 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4299 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4300 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4301 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4302 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4303 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4304 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4305 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4306 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4307 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4308
4309 UErrorCode status = U_ZERO_ERROR;
4310 UText utext=UTEXT_INITIALIZER;
4311 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4312 TEST_ASSERT_SUCCESS(status);
4313
4314 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4315 TEST_ASSERT_SUCCESS(status);
4316 if (U_SUCCESS(status)) {
4317 bi->setText(&utext, status);
4318 TEST_ASSERT_SUCCESS(status);
4319
4320 int32_t breakCount = 0;
4321 int32_t previousBreak = -1;
4322 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4323 // For now, just make sure that the break iterator doesn't hang.
4324 TEST_ASSERT(previousBreak < bi->current());
4325 previousBreak = bi->current();
4326 }
4327 TEST_ASSERT(breakCount > 0);
4328 }
4329 delete bi;
4330 utext_close(&utext);
4331}
4332
4333
73c04bcf
A
4334//
4335// TestDebug - A place-holder test for debugging purposes.
4336// For putting in fragments of other tests that can be invoked
4337// for tracing without a lot of unwanted extra stuff happening.
4338//
4339void RBBITest::TestDebug(void) {
4340#if 0
4341 UErrorCode status = U_ZERO_ERROR;
4342 int pos = 0;
4343 int ruleStatus = 0;
4344
4345 RuleBasedBreakIterator* bi =
4346 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4347 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4348 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4349 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4350 // UnicodeString s("Aaa. Bcd");
4351 s = s.unescape();
4352 bi->setText(s);
4353 UBool r = bi->isBoundary(8);
4354 printf("%s", r?"true":"false");
4355 return;
4356 pos = bi->last();
4357 do {
4358 // ruleStatus = bi->getRuleStatus();
4359 printf("%d\t%d\n", pos, ruleStatus);
4360 pos = bi->previous();
4361 } while (pos != BreakIterator::DONE);
4362#endif
4363}
4364
4388f060
A
4365void RBBITest::TestProperties() {
4366 UErrorCode errorCode = U_ZERO_ERROR;
4367 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4368 if (!prependSet.isEmpty()) {
4369 errln(
4370 "[:GCB=Prepend:] is not empty any more. "
4371 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4372 "change this test to the opposite condition.");
4373 }
4374}
4375
73c04bcf 4376#endif /* #if !UCONFIG_NO_BREAK_ITERATION */