]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/rbbitst.cpp
ICU-551.24.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2015, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
11
12 #include "utypeinfo.h" // for 'typeid' to work
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_BREAK_ITERATION
17
18 #include "unicode/utypes.h"
19 #include "unicode/brkiter.h"
20 #include "unicode/rbbi.h"
21 #include "unicode/uchar.h"
22 #include "unicode/utf16.h"
23 #include "unicode/ucnv.h"
24 #include "unicode/schriter.h"
25 #include "unicode/uniset.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27 #include "unicode/regex.h"
28 #endif
29 #include "unicode/ustring.h"
30 #include "unicode/utext.h"
31 #include "intltest.h"
32 #include "rbbitst.h"
33 #include <string.h>
34 #include "charstr.h"
35 #include "uvector.h"
36 #include "uvectr32.h"
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include "unicode/numfmt.h"
40 #include "unicode/uscript.h"
41 #include "cmemory.h"
42
// TEST_ASSERT(x): report a test failure, tagged with the source file and line
// of the macro invocation, when the condition x evaluates to false.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// TEST_ASSERT_SUCCESS(errcode): report a test failure, tagged with the source
// file, line and ICU error name, when errcode holds a failing UErrorCode.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
48
49
50 //---------------------------------------------
51 // runIndexedTest
52 //---------------------------------------------
53
54
55 // Note: Before adding new tests to this file, check whether the desired test data can
56 // simply be added to the file testdata/rbbitest.txt. In most cases it can,
57 // it's much less work than writing a new test, diagnostic output in the event of failures
58 // is good, and the test data file is shared with ICU4J, so eventually the test
59 // will run there as well, without additional effort.
60
61 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
62 {
63 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
64
65 switch (index) {
66 #if !UCONFIG_NO_FILE_IO
67 case 0: name = "TestBug4153072";
68 if(exec) TestBug4153072(); break;
69 #else
70 case 0: name = "skip";
71 break;
72 #endif
73
74 case 1: name = "skip";
75 break;
76 case 2: name = "TestStatusReturn";
77 if(exec) TestStatusReturn(); break;
78
79 #if !UCONFIG_NO_FILE_IO
80 case 3: name = "TestUnicodeFiles";
81 if(exec) TestUnicodeFiles(); break;
82 case 4: name = "TestEmptyString";
83 if(exec) TestEmptyString(); break;
84 #else
85 case 3: case 4: name = "skip";
86 break;
87 #endif
88
89 case 5: name = "TestGetAvailableLocales";
90 if(exec) TestGetAvailableLocales(); break;
91
92 case 6: name = "TestGetDisplayName";
93 if(exec) TestGetDisplayName(); break;
94
95 #if !UCONFIG_NO_FILE_IO
96 case 7: name = "TestEndBehaviour";
97 if(exec) TestEndBehaviour(); break;
98 case 8: case 9: case 10: name = "skip";
99 break;
100 case 11: name = "TestWordBreaks";
101 if(exec) TestWordBreaks(); break;
102 case 12: name = "TestWordBoundary";
103 if(exec) TestWordBoundary(); break;
104 case 13: name = "TestLineBreaks";
105 if(exec) TestLineBreaks(); break;
106 case 14: name = "TestSentBreaks";
107 if(exec) TestSentBreaks(); break;
108 case 15: name = "TestExtended";
109 if(exec) TestExtended(); break;
110 #else
111 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
112 break;
113 #endif
114
115 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
116 case 16:
117 name = "TestMonkey"; if(exec) TestMonkey(params); break;
118 #else
119 case 16:
120 name = "skip"; break;
121 #endif
122
123 #if !UCONFIG_NO_FILE_IO
124 case 17: name = "TestBug3818";
125 if(exec) TestBug3818(); break;
126 #else
127 case 17: name = "skip";
128 break;
129 #endif
130
131 case 18: name = "skip";
132 break;
133 case 19: name = "TestDebug";
134 if(exec) TestDebug(); break;
135 case 20: name = "skip";
136 break;
137
138 #if !UCONFIG_NO_FILE_IO
139 case 21: name = "TestBug5775";
140 if (exec) TestBug5775(); break;
141 #else
142 case 21: name = "skip";
143 break;
144 #endif
145
146 case 22: name = "TestBug9983";
147 if (exec) TestBug9983(); break;
148 case 23: name = "TestDictRules";
149 if (exec) TestDictRules(); break;
150 case 24: name = "TestBug5532";
151 if (exec) TestBug5532(); break;
152 default: name = ""; break; //needed to end loop
153 }
154 }
155
156
157 //---------------------------------------------------------------------------
158 //
159 // class BITestData Holds a set of Break iterator test data and results
160 // Includes
161 // - the string data to be broken
162 // - a vector of the expected break positions.
163 // - a vector of source line numbers for the data,
164 // (to help see where errors occured.)
165 // - The expected break tag values.
166 // - Vectors of actual break positions and tag values.
167 // - Functions for comparing actual with expected and
168 // reporting errors.
169 //
170 //----------------------------------------------------------------------------
171 class BITestData {
172 public:
173 UnicodeString fDataToBreak;
174 UVector fExpectedBreakPositions;
175 UVector fExpectedTags;
176 UVector fLineNum;
177 UVector fActualBreakPositions; // Test Results.
178 UVector fActualTags;
179
180 BITestData(UErrorCode &status);
181 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
182 void checkResults(const char *heading, RBBITest *test);
183 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
184 void clearResults();
185 };
186
187 //
188 // Constructor.
189 //
190 BITestData::BITestData(UErrorCode &status)
191 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
192 fActualTags(status)
193 {
194 }
195
196 //
197 // addDataChunk. Add a section (non-breaking) piece if data to the test data.
198 // The macro form collects the line number, which is helpful
199 // when tracking down failures.
200 //
201 // A null data item is inserted at the start of each test's data
202 // to put the starting zero into the data list. The position saved for
203 // each non-null item is its ending position.
204 //
205 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
206 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
207 if (U_FAILURE(status)) {return;}
208 if (data != NULL) {
209 fDataToBreak.append(CharsToUnicodeString(data));
210 }
211 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
212 fExpectedTags.addElement(tag, status);
213 fLineNum.addElement(lineNum, status);
214 }
215
216
217 //
218 // checkResults. Compare the actual and expected break positions, report any differences.
219 //
220 void BITestData::checkResults(const char *heading, RBBITest *test) {
221 int32_t expectedIndex = 0;
222 int32_t actualIndex = 0;
223
224 for (;;) {
225 // If we've run through both the expected and actual results vectors, we're done.
226 // break out of the loop.
227 if (expectedIndex >= fExpectedBreakPositions.size() &&
228 actualIndex >= fActualBreakPositions.size()) {
229 break;
230 }
231
232
233 if (expectedIndex >= fExpectedBreakPositions.size()) {
234 err(heading, test, expectedIndex-1, actualIndex);
235 actualIndex++;
236 continue;
237 }
238
239 if (actualIndex >= fActualBreakPositions.size()) {
240 err(heading, test, expectedIndex, actualIndex-1);
241 expectedIndex++;
242 continue;
243 }
244
245 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
246 err(heading, test, expectedIndex, actualIndex);
247 // Try to resync the positions of the indices, to avoid a rash of spurious erros.
248 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
249 actualIndex++;
250 } else {
251 expectedIndex++;
252 }
253 continue;
254 }
255
256 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
257 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
258 heading, fLineNum.elementAt(expectedIndex),
259 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
260 }
261
262 actualIndex++;
263 expectedIndex++;
264 }
265 }
266
267 //
268 // err - An error was found. Report it, along with information about where the
269 // incorrectly broken test data appeared in the source file.
270 //
271 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
272 {
273 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
274 int32_t actual = fActualBreakPositions.elementAti(actualIdx);
275 int32_t o = 0;
276 int32_t line = fLineNum.elementAti(expectedIdx);
277 if (expectedIdx > 0) {
278 // The line numbers are off by one because a premature break occurs somewhere
279 // within the previous item, rather than at the start of the current (expected) item.
280 // We want to report the offset of the unexpected break from the start of
281 // this previous item.
282 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
283 }
284 if (actual < expected) {
285 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);
286 } else {
287 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);
288 }
289 }
290
291
292 void BITestData::clearResults() {
293 fActualBreakPositions.removeAllElements();
294 fActualTags.removeAllElements();
295 }
296
297
298 //--------------------------------------------------------------------------------------
299 //
300 // RBBITest constructor and destructor
301 //
302 //--------------------------------------------------------------------------------------
303
304 RBBITest::RBBITest() {
305 }
306
307
308 RBBITest::~RBBITest() {
309 }
310
311 //-----------------------------------------------------------------------------------
312 //
313 // Test for status {tag} return value from break rules.
314 // TODO: a more thorough test.
315 //
316 //-----------------------------------------------------------------------------------
317 void RBBITest::TestStatusReturn() {
318 UnicodeString rulesString1("$Letters = [:L:];\n"
319 "$Numbers = [:N:];\n"
320 "$Letters+{1};\n"
321 "$Numbers+{2};\n"
322 "Help\\ {4}/me\\!;\n"
323 "[^$Letters $Numbers];\n"
324 "!.*;\n", -1, US_INV);
325 UnicodeString testString1 = "abc123..abc Help me Help me!";
326 // 01234567890123456789012345678
327 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
328 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
329
330 UErrorCode status=U_ZERO_ERROR;
331 UParseError parseError;
332
333 BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
334 if(U_FAILURE(status)) {
335 dataerrln("FAIL : in construction - %s", u_errorName(status));
336 } else {
337 int32_t pos;
338 int32_t i = 0;
339 bi->setText(testString1);
340 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
341 if (pos != bounds1[i]) {
342 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);
343 break;
344 }
345
346 int tag = bi->getRuleStatus();
347 if (tag != brkStatus[i]) {
348 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
349 break;
350 }
351 i++;
352 }
353 }
354 delete bi;
355 }
356
357
358 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
359 UErrorCode status = U_ZERO_ERROR;
360 char name[100];
361 printf("code alpha extend alphanum type word sent line name\n");
362 int nextExpectedIndex = 0;
363 utext_setNativeIndex(tstr, 0);
364 for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
365 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
366 printf("------------------------------------------------ %d\n", j);
367 ++nextExpectedIndex;
368 }
369
370 UChar32 c = utext_next32(tstr);
371 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
372 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
373 u_isUAlphabetic(c),
374 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
375 u_isalnum(c),
376 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
377 u_charType(c),
378 U_SHORT_PROPERTY_NAME),
379 u_getPropertyValueName(UCHAR_WORD_BREAK,
380 u_getIntPropertyValue(c,
381 UCHAR_WORD_BREAK),
382 U_SHORT_PROPERTY_NAME),
383 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
384 u_getIntPropertyValue(c,
385 UCHAR_SENTENCE_BREAK),
386 U_SHORT_PROPERTY_NAME),
387 u_getPropertyValueName(UCHAR_LINE_BREAK,
388 u_getIntPropertyValue(c,
389 UCHAR_LINE_BREAK),
390 U_SHORT_PROPERTY_NAME),
391 name);
392 }
393 }
394
395
396 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
397 UErrorCode status = U_ZERO_ERROR;
398 UText *tstr = NULL;
399 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
400 if (U_FAILURE(status)) {
401 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
402 return;
403 }
404 printStringBreaks(tstr, expected, expectedCount);
405 utext_close(tstr);
406 }
407
408
409 void RBBITest::TestBug3818() {
410 UErrorCode status = U_ZERO_ERROR;
411
412 // Four Thai words...
413 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
414 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
415 UnicodeString thaiStr(thaiWordData);
416
417 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
418 if (U_FAILURE(status) || bi == NULL) {
419 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
420 return;
421 }
422 bi->setText(thaiStr);
423
424 int32_t startOfSecondWord = bi->following(1);
425 if (startOfSecondWord != 4) {
426 errln("Fail at file %s, line %d expected start of word at 4, got %d",
427 __FILE__, __LINE__, startOfSecondWord);
428 }
429 startOfSecondWord = bi->following(0);
430 if (startOfSecondWord != 4) {
431 errln("Fail at file %s, line %d expected start of word at 4, got %d",
432 __FILE__, __LINE__, startOfSecondWord);
433 }
434 delete bi;
435 }
436
437 //----------------------------------------------------------------------------
438 //
439 // generalIteratorTest Given a break iterator and a set of test data,
440 // Run the tests and report the results.
441 //
442 //----------------------------------------------------------------------------
443 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
444 {
445
446 bi.setText(td.fDataToBreak);
447
448 testFirstAndNext(bi, td);
449
450 testLastAndPrevious(bi, td);
451
452 testFollowing(bi, td);
453 testPreceding(bi, td);
454 testIsBoundary(bi, td);
455 doMultipleSelectionTest(bi, td);
456 }
457
458
459 //
460 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
461 // kind of loop.
462 //
463 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
464 {
465 UErrorCode status = U_ZERO_ERROR;
466 int32_t p;
467 int32_t lastP = -1;
468 int32_t tag;
469
470 logln("Test first and next");
471 bi.setText(td.fDataToBreak);
472 td.clearResults();
473
474 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
475 td.fActualBreakPositions.addElement(p, status); // Save result.
476 tag = bi.getRuleStatus();
477 td.fActualTags.addElement(tag, status);
478 if (p <= lastP) {
479 // If the iterator is not making forward progress, stop.
480 // No need to raise an error here, it'll be detected in the normal check of results.
481 break;
482 }
483 lastP = p;
484 }
485 td.checkResults("testFirstAndNext", this);
486 }
487
488
489 //
490 // TestLastAndPrevious. Run the iterator backwards, starting with last().
491 //
492 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
493 {
494 UErrorCode status = U_ZERO_ERROR;
495 int32_t p;
496 int32_t lastP = 0x7ffffffe;
497 int32_t tag;
498
499 logln("Test last and previous");
500 bi.setText(td.fDataToBreak);
501 td.clearResults();
502
503 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
504 // Save break position. Insert it at start of vector of results, shoving
505 // already-saved results further towards the end.
506 td.fActualBreakPositions.insertElementAt(p, 0, status);
507 // bi.previous(); // TODO: Why does this fix things up????
508 // bi.next();
509 tag = bi.getRuleStatus();
510 td.fActualTags.insertElementAt(tag, 0, status);
511 if (p >= lastP) {
512 // If the iterator is not making progress, stop.
513 // No need to raise an error here, it'll be detected in the normal check of results.
514 break;
515 }
516 lastP = p;
517 }
518 td.checkResults("testLastAndPrevious", this);
519 }
520
521
522 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
523 {
524 UErrorCode status = U_ZERO_ERROR;
525 int32_t p;
526 int32_t tag;
527 int32_t lastP = -2; // A value that will never be returned as a break position.
528 // cannot be -1; that is returned for DONE.
529 int i;
530
531 logln("testFollowing():");
532 bi.setText(td.fDataToBreak);
533 td.clearResults();
534
535 // Save the starting point, since we won't get that out of following.
536 p = bi.first();
537 td.fActualBreakPositions.addElement(p, status); // Save result.
538 tag = bi.getRuleStatus();
539 td.fActualTags.addElement(tag, status);
540
541 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
542 p = bi.following(i);
543 if (p != lastP) {
544 if (p == RuleBasedBreakIterator::DONE) {
545 break;
546 }
547 // We've reached a new break position. Save it.
548 td.fActualBreakPositions.addElement(p, status); // Save result.
549 tag = bi.getRuleStatus();
550 td.fActualTags.addElement(tag, status);
551 lastP = p;
552 }
553 }
554 // The loop normally exits by means of the break in the middle.
555 // Make sure that the index was at the correct position for the break iterator to have
556 // returned DONE.
557 if (i != td.fDataToBreak.length()) {
558 errln("testFollowing(): iterator returned DONE prematurely.");
559 }
560
561 // Full check of all results.
562 td.checkResults("testFollowing", this);
563 }
564
565
566
567 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
568 UErrorCode status = U_ZERO_ERROR;
569 int32_t p;
570 int32_t tag;
571 int32_t lastP = 0x7ffffffe;
572 int i;
573
574 logln("testPreceding():");
575 bi.setText(td.fDataToBreak);
576 td.clearResults();
577
578 p = bi.last();
579 td.fActualBreakPositions.addElement(p, status);
580 tag = bi.getRuleStatus();
581 td.fActualTags.addElement(tag, status);
582
583 for (i = td.fDataToBreak.length(); i>=-1; i--) {
584 p = bi.preceding(i);
585 if (p != lastP) {
586 if (p == RuleBasedBreakIterator::DONE) {
587 break;
588 }
589 // We've reached a new break position. Save it.
590 td.fActualBreakPositions.insertElementAt(p, 0, status);
591 lastP = p;
592 tag = bi.getRuleStatus();
593 td.fActualTags.insertElementAt(tag, 0, status);
594 }
595 }
596 // The loop normally exits by means of the break in the middle.
597 // Make sure that the index was at the correct position for the break iterator to have
598 // returned DONE.
599 if (i != 0) {
600 errln("testPreceding(): iterator returned DONE prematurely.");
601 }
602
603 // Full check of all results.
604 td.checkResults("testPreceding", this);
605 }
606
607
608
609 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
610 UErrorCode status = U_ZERO_ERROR;
611 int i;
612 int32_t tag;
613
614 logln("testIsBoundary():");
615 bi.setText(td.fDataToBreak);
616 td.clearResults();
617
618 for (i = 0; i <= td.fDataToBreak.length(); i++) {
619 if (bi.isBoundary(i)) {
620 td.fActualBreakPositions.addElement(i, status); // Save result.
621 tag = bi.getRuleStatus();
622 td.fActualTags.addElement(tag, status);
623 }
624 }
625 td.checkResults("testIsBoundary: ", this);
626 }
627
628
629
630 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
631 {
632 iterator.setText(td.fDataToBreak);
633
634 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
635 int32_t offset = iterator.first();
636 int32_t testOffset;
637 int32_t count = 0;
638
639 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
640
641 if (*testIterator != iterator)
642 errln("clone() or operator!= failed: two clones compared unequal");
643
644 do {
645 testOffset = testIterator->first();
646 testOffset = testIterator->next(count);
647 if (offset != testOffset)
648 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
649
650 if (offset != RuleBasedBreakIterator::DONE) {
651 count++;
652 offset = iterator.next();
653
654 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
655 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
656 if (count > 10000 || offset == -1) {
657 errln("operator== failed too many times. Stopping test.");
658 if (offset == -1) {
659 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
660 }
661 return;
662 }
663 }
664 }
665 } while (offset != RuleBasedBreakIterator::DONE);
666
667 // now do it backwards...
668 offset = iterator.last();
669 count = 0;
670
671 do {
672 testOffset = testIterator->last();
673 testOffset = testIterator->next(count); // next() with a negative arg is same as previous
674 if (offset != testOffset)
675 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
676
677 if (offset != RuleBasedBreakIterator::DONE) {
678 count--;
679 offset = iterator.previous();
680 }
681 } while (offset != RuleBasedBreakIterator::DONE);
682
683 delete testIterator;
684 }
685
686
687 //---------------------------------------------
688 //
689 // other tests
690 //
691 //---------------------------------------------
692 void RBBITest::TestEmptyString()
693 {
694 UnicodeString text = "";
695 UErrorCode status = U_ZERO_ERROR;
696
697 BITestData x(status);
698 ADD_DATACHUNK(x, "", 0, status); // Break at start of data
699 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
700 if (U_FAILURE(status))
701 {
702 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
703 return;
704 }
705 generalIteratorTest(*bi, x);
706 delete bi;
707 }
708
709 void RBBITest::TestGetAvailableLocales()
710 {
711 int32_t locCount = 0;
712 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
713
714 if (locCount == 0)
715 dataerrln("getAvailableLocales() returned an empty list!");
716 // Just make sure that it's returning good memory.
717 int32_t i;
718 for (i = 0; i < locCount; ++i) {
719 logln(locList[i].getName());
720 }
721 }
722
723 //Testing the BreakIterator::getDisplayName() function
724 void RBBITest::TestGetDisplayName()
725 {
726 UnicodeString result;
727
728 BreakIterator::getDisplayName(Locale::getUS(), result);
729 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
730 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
731 + result);
732
733 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
734 if (result != "French (France)")
735 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
736 + result);
737 }
738 /**
739 * Test End Behaviour
740 * @bug 4068137
741 */
742 void RBBITest::TestEndBehaviour()
743 {
744 UErrorCode status = U_ZERO_ERROR;
745 UnicodeString testString("boo.");
746 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
747 if (U_FAILURE(status))
748 {
749 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
750 return;
751 }
752 wb->setText(testString);
753
754 if (wb->first() != 0)
755 errln("Didn't get break at beginning of string.");
756 if (wb->next() != 3)
757 errln("Didn't get break before period in \"boo.\"");
758 if (wb->current() != 4 && wb->next() != 4)
759 errln("Didn't get break at end of string.");
760 delete wb;
761 }
762 /*
763 * @bug 4153072
764 */
765 void RBBITest::TestBug4153072() {
766 UErrorCode status = U_ZERO_ERROR;
767 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
768 if (U_FAILURE(status))
769 {
770 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
771 return;
772 }
773 UnicodeString str("...Hello, World!...");
774 int32_t begin = 3;
775 int32_t end = str.length() - 3;
776 UBool onBoundary;
777
778 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
779 iter->adoptText(textIterator);
780 int index;
781 // Note: with the switch to UText, there is no way to restrict the
782 // iteration range to begin at an index other than zero.
783 // String character iterators created with a non-zero bound are
784 // treated by RBBI as being empty.
785 for (index = -1; index < begin + 1; ++index) {
786 onBoundary = iter->isBoundary(index);
787 if (index == 0? !onBoundary : onBoundary) {
788 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
789 " and begin index = " + begin);
790 }
791 }
792 delete iter;
793 }
794
795
796 //
797 // Test for problem reported by Ashok Matoria on 9 July 2007
798 // One.<kSoftHyphen><kSpace>Two.
799 //
800 // Sentence break at start (0) and then on calling next() it breaks at
801 // 'T' of "Two". Now, at this point if I do next() and
802 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
803 //
804 void RBBITest::TestBug5775() {
805 UErrorCode status = U_ZERO_ERROR;
806 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
807 TEST_ASSERT_SUCCESS(status);
808 if (U_FAILURE(status)) {
809 return;
810 }
811 // Check for status first for better handling of no data errors.
812 TEST_ASSERT(bi != NULL);
813 if (bi == NULL) {
814 return;
815 }
816
817 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
818 // 01234 56789
819 s = s.unescape();
820 bi->setText(s);
821 int pos = bi->next();
822 TEST_ASSERT(pos == 6);
823 pos = bi->next();
824 TEST_ASSERT(pos == 10);
825 pos = bi->previous();
826 TEST_ASSERT(pos == 6);
827 delete bi;
828 }
829
830
831
832 //------------------------------------------------------------------------------
833 //
834 // RBBITest::Extended Run RBBI Tests from an external test data file
835 //
836 //------------------------------------------------------------------------------
837
838 struct TestParams {
839 BreakIterator *bi; // Break iterator is set while parsing test source.
840 // Changed out whenever test data changes break type.
841
842 UnicodeString dataToBreak; // Data that is built up while parsing the test.
843 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
844 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
845 UVector32 *srcCol;
846
847 UText *textToBreak; // UText, could be UTF8 or UTF16.
848 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
849 CharString utf8String; // UTF-8 form of text to break.
850
851 TestParams(UErrorCode &status) : dataToBreak() {
852 bi = NULL;
853 expectedBreaks = new UVector32(status);
854 srcLine = new UVector32(status);
855 srcCol = new UVector32(status);
856 textToBreak = NULL;
857 textMap = new UVector32(status);
858 }
859
860 ~TestParams() {
861 delete bi;
862 delete expectedBreaks;
863 delete srcLine;
864 delete srcCol;
865 utext_close(textToBreak);
866 delete textMap;
867 }
868
869 int32_t getSrcLine(int32_t bp);
870 int32_t getExpectedBreak(int32_t bp);
871 int32_t getSrcCol(int32_t bp);
872
873 void setUTF16(UErrorCode &status);
874 void setUTF8(UErrorCode &status);
875 };
876
877 // Append a UnicodeString to a CharString with UTF-8 encoding.
878 // Substitute any invalid chars.
879 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
880 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
881 if (U_FAILURE(status)) {
882 return;
883 }
884 int32_t utf8Length;
885 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
886 src.getBuffer(), src.length(), // UTF-16 data
887 0xfffd, NULL, // Substitution char, number of subs.
888 &status);
889 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
890 return;
891 }
892 status = U_ZERO_ERROR;
893 int32_t capacity;
894 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
895 u_strToUTF8WithSub(buffer, utf8Length, NULL,
896 src.getBuffer(), src.length(),
897 0xfffd, NULL, &status);
898 dest.append(buffer, utf8Length, status);
899 }
900
901
902 void TestParams::setUTF16(UErrorCode &status) {
903 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
904 textMap->removeAllElements();
905 for (int32_t i=0; i<dataToBreak.length(); i++) {
906 if (i == dataToBreak.getChar32Start(i)) {
907 textMap->addElement(i, status);
908 } else {
909 textMap->addElement(-1, status);
910 }
911 }
912 textMap->addElement(dataToBreak.length(), status);
913 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
914 }
915
916
917 void TestParams::setUTF8(UErrorCode &status) {
918 if (U_FAILURE(status)) {
919 return;
920 }
921 utf8String.clear();
922 CharStringAppend(utf8String, dataToBreak, status);
923 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
924 if (U_FAILURE(status)) {
925 return;
926 }
927
928 textMap->removeAllElements();
929 int32_t utf16Index = 0;
930 for (;;) {
931 textMap->addElement(utf16Index, status);
932 UChar32 c32 = utext_current32(textToBreak);
933 if (c32 < 0) {
934 break;
935 }
936 utf16Index += U16_LENGTH(c32);
937 utext_next32(textToBreak);
938 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
939 textMap->addElement(-1, status);
940 }
941 }
942 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
943 }
944
945
946 int32_t TestParams::getSrcLine(int bp) {
947 if (bp >= textMap->size()) {
948 bp = textMap->size() - 1;
949 }
950 int32_t i = 0;
951 for(; bp >= 0 ; --bp) {
952 // Move to a character boundary if we are not on one already.
953 i = textMap->elementAti(bp);
954 if (i >= 0) {
955 break;
956 }
957 }
958 return srcLine->elementAti(i);
959 }
960
961
962 int32_t TestParams::getExpectedBreak(int bp) {
963 if (bp >= textMap->size()) {
964 return 0;
965 }
966 int32_t i = textMap->elementAti(bp);
967 int32_t retVal = 0;
968 if (i >= 0) {
969 retVal = expectedBreaks->elementAti(i);
970 }
971 return retVal;
972 }
973
974
975 int32_t TestParams::getSrcCol(int bp) {
976 if (bp >= textMap->size()) {
977 bp = textMap->size() - 1;
978 }
979 int32_t i = 0;
980 for(; bp >= 0; --bp) {
981 // Move bp to a character boundary if we are not on one already.
982 i = textMap->elementAti(bp);
983 if (i >= 0) {
984 break;
985 }
986 }
987 return srcCol->elementAti(i);
988 }
989
990
991 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
992 int32_t bp;
993 int32_t prevBP;
994 int32_t i;
995
996 TEST_ASSERT_SUCCESS(status);
997 if (U_FAILURE(status)) {
998 return;
999 }
1000
1001 if (t->bi == NULL) {
1002 return;
1003 }
1004
1005 t->bi->setText(t->textToBreak, status);
1006 //
1007 // Run the iterator forward
1008 //
1009 prevBP = -1;
1010 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1011 if (prevBP == bp) {
1012 // Fail for lack of forward progress.
1013 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1014 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1015 break;
1016 }
1017
1018 // Check that there we didn't miss an expected break between the last one
1019 // and this one.
1020 for (i=prevBP+1; i<bp; i++) {
1021 if (t->getExpectedBreak(i) != 0) {
1022 int expected[] = {0, i};
1023 printStringBreaks(t->dataToBreak, expected, 2);
1024 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1025 i, t->getSrcLine(i), t->getSrcCol(i));
1026 }
1027 }
1028
1029 // Check that the break we did find was expected
1030 if (t->getExpectedBreak(bp) == 0) {
1031 int expected[] = {0, bp};
1032 printStringBreaks(t->textToBreak, expected, 2);
1033 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1034 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1035 } else {
1036 // The break was expected.
1037 // Check that the {nnn} tag value is correct.
1038 int32_t expectedTagVal = t->getExpectedBreak(bp);
1039 if (expectedTagVal == -1) {
1040 expectedTagVal = 0;
1041 }
1042 int32_t line = t->getSrcLine(bp);
1043 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1044 if (rs != expectedTagVal) {
1045 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1046 " Actual, Expected status = %4d, %4d",
1047 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1048 }
1049 }
1050
1051 prevBP = bp;
1052 }
1053
1054 // Verify that there were no missed expected breaks after the last one found
1055 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1056 if (t->getExpectedBreak(i) != 0) {
1057 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1058 i, t->getSrcLine(i), t->getSrcCol(i));
1059 }
1060 }
1061
1062 //
1063 // Run the iterator backwards, verify that the same breaks are found.
1064 //
1065 prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.
1066 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1067 if (prevBP == bp) {
1068 // Fail for lack of progress.
1069 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1070 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1071 break;
1072 }
1073
1074 // Check that we didn't miss an expected break between the last one
1075 // and this one. (UVector returns zeros for index out of bounds.)
1076 for (i=prevBP-1; i>bp; i--) {
1077 if (t->getExpectedBreak(i) != 0) {
1078 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1079 i, t->getSrcLine(i), t->getSrcCol(i));
1080 }
1081 }
1082
1083 // Check that the break we did find was expected
1084 if (t->getExpectedBreak(bp) == 0) {
1085 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1086 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1087 } else {
1088 // The break was expected.
1089 // Check that the {nnn} tag value is correct.
1090 int32_t expectedTagVal = t->getExpectedBreak(bp);
1091 if (expectedTagVal == -1) {
1092 expectedTagVal = 0;
1093 }
1094 int line = t->getSrcLine(bp);
1095 int32_t rs = t->bi->getRuleStatus();
1096 if (rs != expectedTagVal) {
1097 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1098 " Actual, Expected status = %4d, %4d",
1099 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1100 }
1101 }
1102
1103 prevBP = bp;
1104 }
1105
1106 // Verify that there were no missed breaks prior to the last one found
1107 for (i=prevBP-1; i>=0; i--) {
1108 if (t->getExpectedBreak(i) != 0) {
1109 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1110 i, t->getSrcLine(i), t->getSrcCol(i));
1111 }
1112 }
1113
1114 // Check isBoundary()
1115 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1116 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
1117 UBool boundaryFound = t->bi->isBoundary(i);
1118 if (boundaryExpected != boundaryFound) {
1119 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1120 " Expected, Actual= %s, %s",
1121 i, t->getSrcLine(i), t->getSrcCol(i),
1122 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1123 }
1124 }
1125
1126 // Check following()
1127 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1128 int32_t actualBreak = t->bi->following(i);
1129 int32_t expectedBreak = BreakIterator::DONE;
1130 for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1131 if (t->getExpectedBreak(j) != 0) {
1132 expectedBreak = j;
1133 break;
1134 }
1135 }
1136 if (expectedBreak != actualBreak) {
1137 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1138 " Expected, Actual= %d, %d",
1139 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1140 }
1141 }
1142
1143 // Check preceding()
1144 for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
1145 int32_t actualBreak = t->bi->preceding(i);
1146 int32_t expectedBreak = BreakIterator::DONE;
1147
1148 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1149 // preceding(trailing byte) will return the index of some preceding code point,
1150 // not the lead byte of the current code point, even though that has a smaller index.
1151 // Therefore, start looking at the expected break data not at i-1, but at
1152 // the start of code point index - 1.
1153 utext_setNativeIndex(t->textToBreak, i);
1154 int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1155 for (; j >= 0; j--) {
1156 if (t->getExpectedBreak(j) != 0) {
1157 expectedBreak = j;
1158 break;
1159 }
1160 }
1161 if (expectedBreak != actualBreak) {
1162 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1163 " Expected, Actual= %d, %d",
1164 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1165 }
1166 }
1167 }
1168
1169
1170 void RBBITest::TestExtended() {
1171 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1172 UErrorCode status = U_ZERO_ERROR;
1173 Locale locale("");
1174
1175 UnicodeString rules;
1176 TestParams tp(status);
1177
1178 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@=-]*) *>"), 0, status);
1179 if (U_FAILURE(status)) {
1180 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1181 }
1182
1183
1184 //
1185 // Open and read the test data file.
1186 //
1187 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1188 char testFileName[1000];
1189 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1190 errln("Can't open test data. Path too long.");
1191 return;
1192 }
1193 strcpy(testFileName, testDataDirectory);
1194 strcat(testFileName, "rbbitst.txt");
1195
1196 int len;
1197 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1198 if (U_FAILURE(status)) {
1199 return; /* something went wrong, error already output */
1200 }
1201
1202
1203
1204
1205 //
1206 // Put the test data into a UnicodeString
1207 //
1208 UnicodeString testString(FALSE, testFile, len);
1209
1210 enum EParseState{
1211 PARSE_COMMENT,
1212 PARSE_TAG,
1213 PARSE_DATA,
1214 PARSE_NUM
1215 }
1216 parseState = PARSE_TAG;
1217
1218 EParseState savedState = PARSE_TAG;
1219
1220 static const UChar CH_LF = 0x0a;
1221 static const UChar CH_CR = 0x0d;
1222 static const UChar CH_HASH = 0x23;
1223 /*static const UChar CH_PERIOD = 0x2e;*/
1224 static const UChar CH_LT = 0x3c;
1225 static const UChar CH_GT = 0x3e;
1226 static const UChar CH_BACKSLASH = 0x5c;
1227 static const UChar CH_BULLET = 0x2022;
1228
1229 int32_t lineNum = 1;
1230 int32_t colStart = 0;
1231 int32_t column = 0;
1232 int32_t charIdx = 0;
1233
1234 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
1235
1236 for (charIdx = 0; charIdx < len; ) {
1237 status = U_ZERO_ERROR;
1238 UChar c = testString.charAt(charIdx);
1239 charIdx++;
1240 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1241 // treat CRLF as a unit
1242 c = CH_LF;
1243 charIdx++;
1244 }
1245 if (c == CH_LF || c == CH_CR) {
1246 lineNum++;
1247 colStart = charIdx;
1248 }
1249 column = charIdx - colStart + 1;
1250
1251 switch (parseState) {
1252 case PARSE_COMMENT:
1253 if (c == 0x0a || c == 0x0d) {
1254 parseState = savedState;
1255 }
1256 break;
1257
1258 case PARSE_TAG:
1259 {
1260 if (c == CH_HASH) {
1261 parseState = PARSE_COMMENT;
1262 savedState = PARSE_TAG;
1263 break;
1264 }
1265 if (u_isUWhiteSpace(c)) {
1266 break;
1267 }
1268 if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1269 delete tp.bi;
1270 tp.bi = BreakIterator::createWordInstance(locale, status);
1271 charIdx += 5;
1272 break;
1273 }
1274 if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1275 delete tp.bi;
1276 tp.bi = BreakIterator::createCharacterInstance(locale, status);
1277 charIdx += 5;
1278 break;
1279 }
1280 if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1281 delete tp.bi;
1282 tp.bi = BreakIterator::createLineInstance(locale, status);
1283 charIdx += 5;
1284 break;
1285 }
1286 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1287 delete tp.bi;
1288 tp.bi = NULL;
1289 tp.bi = BreakIterator::createSentenceInstance(locale, status);
1290 charIdx += 5;
1291 break;
1292 }
1293 if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1294 delete tp.bi;
1295 tp.bi = BreakIterator::createTitleInstance(locale, status);
1296 charIdx += 6;
1297 break;
1298 }
1299
1300 // <locale loc_name>
1301 localeMatcher.reset(testString);
1302 if (localeMatcher.lookingAt(charIdx-1, status)) {
1303 UnicodeString localeName = localeMatcher.group(1, status);
1304 char localeName8[100];
1305 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1306 locale = Locale::createFromName(localeName8);
1307 charIdx += localeMatcher.group(0, status).length() - 1;
1308 TEST_ASSERT_SUCCESS(status);
1309 break;
1310 }
1311 if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1312 parseState = PARSE_DATA;
1313 charIdx += 5;
1314 tp.dataToBreak = "";
1315 tp.expectedBreaks->removeAllElements();
1316 tp.srcCol ->removeAllElements();
1317 tp.srcLine->removeAllElements();
1318 break;
1319 }
1320
1321 errln("line %d: Tag expected in test file.", lineNum);
1322 parseState = PARSE_COMMENT;
1323 savedState = PARSE_DATA;
1324 goto end_test; // Stop the test.
1325 }
1326 break;
1327
1328 case PARSE_DATA:
1329 if (c == CH_BULLET) {
1330 int32_t breakIdx = tp.dataToBreak.length();
1331 tp.expectedBreaks->setSize(breakIdx+1);
1332 tp.expectedBreaks->setElementAt(-1, breakIdx);
1333 tp.srcLine->setSize(breakIdx+1);
1334 tp.srcLine->setElementAt(lineNum, breakIdx);
1335 tp.srcCol ->setSize(breakIdx+1);
1336 tp.srcCol ->setElementAt(column, breakIdx);
1337 break;
1338 }
1339
1340 if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1341 // Add final entry to mappings from break location to source file position.
1342 // Need one extra because last break position returned is after the
1343 // last char in the data, not at the last char.
1344 tp.srcLine->addElement(lineNum, status);
1345 tp.srcCol ->addElement(column, status);
1346
1347 parseState = PARSE_TAG;
1348 charIdx += 6;
1349
1350 // RUN THE TEST!
1351 status = U_ZERO_ERROR;
1352 tp.setUTF16(status);
1353 executeTest(&tp, status);
1354 TEST_ASSERT_SUCCESS(status);
1355
1356 // Run again, this time with UTF-8 text wrapped in a UText.
1357 status = U_ZERO_ERROR;
1358 tp.setUTF8(status);
1359 TEST_ASSERT_SUCCESS(status);
1360 executeTest(&tp, status);
1361 break;
1362 }
1363
1364 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1365 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1366 // Get the code point from the name and insert it into the test data.
1367 // (Damn, no API takes names in Unicode !!!
1368 // we've got to take it back to char *)
1369 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1370 int32_t nameLength = nameEndIdx - (charIdx+2);
1371 char charNameBuf[200];
1372 UChar32 theChar = -1;
1373 if (nameEndIdx != -1) {
1374 UErrorCode status = U_ZERO_ERROR;
1375 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1376 charNameBuf[sizeof(charNameBuf)-1] = 0;
1377 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1378 if (U_FAILURE(status)) {
1379 theChar = -1;
1380 }
1381 }
1382 if (theChar == -1) {
1383 errln("Error in named character in test file at line %d, col %d",
1384 lineNum, column);
1385 } else {
1386 // Named code point was recognized. Insert it
1387 // into the test data.
1388 tp.dataToBreak.append(theChar);
1389 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1390 tp.srcLine->addElement(lineNum, status);
1391 tp.srcCol ->addElement(column, status);
1392 }
1393 }
1394 if (nameEndIdx > charIdx) {
1395 charIdx = nameEndIdx+1;
1396
1397 }
1398 break;
1399 }
1400
1401
1402
1403
1404 if (testString.compare(charIdx-1, 2, "<>") == 0) {
1405 charIdx++;
1406 int32_t breakIdx = tp.dataToBreak.length();
1407 tp.expectedBreaks->setSize(breakIdx+1);
1408 tp.expectedBreaks->setElementAt(-1, breakIdx);
1409 tp.srcLine->setSize(breakIdx+1);
1410 tp.srcLine->setElementAt(lineNum, breakIdx);
1411 tp.srcCol ->setSize(breakIdx+1);
1412 tp.srcCol ->setElementAt(column, breakIdx);
1413 break;
1414 }
1415
1416 if (c == CH_LT) {
1417 tagValue = 0;
1418 parseState = PARSE_NUM;
1419 break;
1420 }
1421
1422 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
1423 parseState = PARSE_COMMENT;
1424 savedState = PARSE_DATA;
1425 break;
1426 }
1427
1428 if (c == CH_BACKSLASH) {
1429 // Check for \ at end of line, a line continuation.
1430 // Advance over (discard) the newline
1431 UChar32 cp = testString.char32At(charIdx);
1432 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1433 // We have a CR LF
1434 // Need an extra increment of the input ptr to move over both of them
1435 charIdx++;
1436 }
1437 if (cp == CH_LF || cp == CH_CR) {
1438 lineNum++;
1439 colStart = charIdx;
1440 charIdx++;
1441 break;
1442 }
1443
1444 // Let unescape handle the back slash.
1445 cp = testString.unescapeAt(charIdx);
1446 if (cp != -1) {
1447 // Escape sequence was recognized. Insert the char
1448 // into the test data.
1449 tp.dataToBreak.append(cp);
1450 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1451 tp.srcLine->addElement(lineNum, status);
1452 tp.srcCol ->addElement(column, status);
1453 }
1454 break;
1455 }
1456
1457
1458 // Not a recognized backslash escape sequence.
1459 // Take the next char as a literal.
1460 // TODO: Should this be an error?
1461 c = testString.charAt(charIdx);
1462 charIdx = testString.moveIndex32(charIdx, 1);
1463 }
1464
1465 // Normal, non-escaped data char.
1466 tp.dataToBreak.append(c);
1467
1468 // Save the mapping from offset in the data to line/column numbers in
1469 // the original input file. Will be used for better error messages only.
1470 // If there's an expected break before this char, the slot in the mapping
1471 // vector will already be set for this char; don't overwrite it.
1472 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1473 tp.srcLine->addElement(lineNum, status);
1474 tp.srcCol ->addElement(column, status);
1475 }
1476 break;
1477
1478
1479 case PARSE_NUM:
1480 // We are parsing an expected numeric tag value, like <1234>,
1481 // within a chunk of data.
1482 if (u_isUWhiteSpace(c)) {
1483 break;
1484 }
1485
1486 if (c == CH_GT) {
1487 // Finished the number. Add the info to the expected break data,
1488 // and switch parse state back to doing plain data.
1489 parseState = PARSE_DATA;
1490 if (tagValue == 0) {
1491 tagValue = -1;
1492 }
1493 int32_t breakIdx = tp.dataToBreak.length();
1494 tp.expectedBreaks->setSize(breakIdx+1);
1495 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1496 tp.srcLine->setSize(breakIdx+1);
1497 tp.srcLine->setElementAt(lineNum, breakIdx);
1498 tp.srcCol ->setSize(breakIdx+1);
1499 tp.srcCol ->setElementAt(column, breakIdx);
1500 break;
1501 }
1502
1503 if (u_isdigit(c)) {
1504 tagValue = tagValue*10 + u_charDigitValue(c);
1505 break;
1506 }
1507
1508 errln("Syntax Error in test file at line %d, col %d",
1509 lineNum, column);
1510 parseState = PARSE_COMMENT;
1511 goto end_test; // Stop the test
1512 break;
1513 }
1514
1515
1516 if (U_FAILURE(status)) {
1517 dataerrln("ICU Error %s while parsing test file at line %d.",
1518 u_errorName(status), lineNum);
1519 status = U_ZERO_ERROR;
1520 goto end_test; // Stop the test
1521 }
1522
1523 }
1524
1525 end_test:
1526 delete [] testFile;
1527 #endif
1528 }
1529
1530
1531 //-------------------------------------------------------------------------------
1532 //
1533 // TestDictRules create a break iterator from source rules that includes a
1534 // dictionary range. Regression for bug #7130. Source rules
1535 // do not declare a break iterator type (word, line, sentence, etc.
1536 // but the dictionary code, without a type, would loop.
1537 //
1538 //-------------------------------------------------------------------------------
1539 void RBBITest::TestDictRules() {
1540 const char *rules = "$dictionary = [a-z]; \n"
1541 "!!forward; \n"
1542 "$dictionary $dictionary; \n"
1543 "!!reverse; \n"
1544 "$dictionary $dictionary; \n";
1545 const char *text = "aa";
1546 UErrorCode status = U_ZERO_ERROR;
1547 UParseError parseError;
1548
1549 RuleBasedBreakIterator bi(rules, parseError, status);
1550 if (U_SUCCESS(status)) {
1551 UnicodeString utext = text;
1552 bi.setText(utext);
1553 int32_t position;
1554 int32_t loops;
1555 for (loops = 0; loops<10; loops++) {
1556 position = bi.next();
1557 if (position == RuleBasedBreakIterator::DONE) {
1558 break;
1559 }
1560 }
1561 TEST_ASSERT(loops == 1);
1562 } else {
1563 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1564 }
1565 }
1566
1567
1568
1569 //-------------------------------------------------------------------------------
1570 //
1571 // ReadAndConvertFile Read a text data file, convert it to UChars, and
// return the data in one big UChar * buffer, which the caller must delete.
1573 //
1574 // parameters:
1575 // fileName: the name of the file, with no directory part. The test data directory
1576 // is assumed.
1577 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1578 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1579 // specified here. The BOM, if it exists, will be stripped from the returned data.
1580 // Pass NULL for the system default encoding.
1581 // status
1582 // returns:
1583 // The file data, converted to UChar.
1584 // The caller must delete this when done with
1585 // delete [] theBuffer;
1586 //
1587 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1588 // Move this function to some common place.
1589 //
1590 //--------------------------------------------------------------------------------
1591 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1592 UChar *retPtr = NULL;
1593 char *fileBuf = NULL;
1594 UConverter* conv = NULL;
1595 FILE *f = NULL;
1596
1597 ulen = 0;
1598 if (U_FAILURE(status)) {
1599 return retPtr;
1600 }
1601
1602 //
1603 // Open the file.
1604 //
1605 f = fopen(fileName, "rb");
1606 if (f == 0) {
1607 dataerrln("Error opening test data file %s\n", fileName);
1608 status = U_FILE_ACCESS_ERROR;
1609 return NULL;
1610 }
1611 //
1612 // Read it in
1613 //
1614 int fileSize;
1615 int amt_read;
1616
1617 fseek( f, 0, SEEK_END);
1618 fileSize = ftell(f);
1619 fileBuf = new char[fileSize];
1620 fseek(f, 0, SEEK_SET);
1621 amt_read = fread(fileBuf, 1, fileSize, f);
1622 if (amt_read != fileSize || fileSize <= 0) {
1623 errln("Error reading test data file.");
1624 goto cleanUpAndReturn;
1625 }
1626
1627 //
1628 // Look for a Unicode Signature (BOM) on the data just read
1629 //
1630 int32_t signatureLength;
1631 const char * fileBufC;
1632 const char* bomEncoding;
1633
1634 fileBufC = fileBuf;
1635 bomEncoding = ucnv_detectUnicodeSignature(
1636 fileBuf, fileSize, &signatureLength, &status);
1637 if(bomEncoding!=NULL ){
1638 fileBufC += signatureLength;
1639 fileSize -= signatureLength;
1640 encoding = bomEncoding;
1641 }
1642
1643 //
1644 // Open a converter to take the rule file to UTF-16
1645 //
1646 conv = ucnv_open(encoding, &status);
1647 if (U_FAILURE(status)) {
1648 goto cleanUpAndReturn;
1649 }
1650
1651 //
1652 // Convert the rules to UChar.
1653 // Preflight first to determine required buffer size.
1654 //
1655 ulen = ucnv_toUChars(conv,
1656 NULL, // dest,
1657 0, // destCapacity,
1658 fileBufC,
1659 fileSize,
1660 &status);
1661 if (status == U_BUFFER_OVERFLOW_ERROR) {
1662 // Buffer Overflow is expected from the preflight operation.
1663 status = U_ZERO_ERROR;
1664
1665 retPtr = new UChar[ulen+1];
1666 ucnv_toUChars(conv,
1667 retPtr, // dest,
1668 ulen+1,
1669 fileBufC,
1670 fileSize,
1671 &status);
1672 }
1673
1674 cleanUpAndReturn:
1675 fclose(f);
1676 delete []fileBuf;
1677 ucnv_close(conv);
1678 if (U_FAILURE(status)) {
1679 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1680 delete []retPtr;
1681 retPtr = 0;
1682 ulen = 0;
1683 };
1684 return retPtr;
1685 }
1686
1687
1688
1689 //--------------------------------------------------------------------------------------------
1690 //
1691 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1692 //
1693 //-------------------------------------------------------------------------------------------
1694 void RBBITest::TestUnicodeFiles() {
1695 RuleBasedBreakIterator *bi;
1696 UErrorCode status = U_ZERO_ERROR;
1697
1698 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1699 TEST_ASSERT_SUCCESS(status);
1700 if (U_SUCCESS(status)) {
1701 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1702 }
1703 delete bi;
1704
1705 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1706 TEST_ASSERT_SUCCESS(status);
1707 if (U_SUCCESS(status)) {
1708 runUnicodeTestData("WordBreakTest.txt", bi);
1709 }
1710 delete bi;
1711
1712 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1713 TEST_ASSERT_SUCCESS(status);
1714 if (U_SUCCESS(status)) {
1715 runUnicodeTestData("SentenceBreakTest.txt", bi);
1716 }
1717 delete bi;
1718
1719 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1720 TEST_ASSERT_SUCCESS(status);
1721 if (U_SUCCESS(status)) {
1722 runUnicodeTestData("LineBreakTest.txt", bi);
1723 }
1724 delete bi;
1725 }
1726
1727
1728 // Check for test cases from the Unicode test data files that are known to fail
1729 // and should be skipped because ICU is not yet able to fully implement the spec.
1730 // See ticket #7270.
1731
1732 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1733 static const UChar badTestCases[][4] = { // Line Numbers from Unicode 7.0.0 file.
1734 {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000}, // Line 5198
1735 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000}, // Line 5202
1736 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000}, // Line 5214
1737 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000}, // Line 5246
1738 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000}, // Line 5298
1739 {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000} // Line 5302
1740 };
1741 if (strcmp(fileName, "LineBreakTest.txt") != 0) {
1742 return FALSE;
1743 }
1744
1745 for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
1746 if (testCase == UnicodeString(badTestCases[i])) {
1747 return logKnownIssue("7270");
1748 }
1749 }
1750 return FALSE;
1751 }
1752
1753
1754 //--------------------------------------------------------------------------------------------
1755 //
1756 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1757 //
1758 //-------------------------------------------------------------------------------------------
1759 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1760 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1761 UErrorCode status = U_ZERO_ERROR;
1762
1763 //
1764 // Open and read the test data file, put it into a UnicodeString.
1765 //
1766 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1767 char testFileName[1000];
1768 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1769 dataerrln("Can't open test data. Path too long.");
1770 return;
1771 }
1772 strcpy(testFileName, testDataDirectory);
1773 strcat(testFileName, fileName);
1774
1775 logln("Opening data file %s\n", fileName);
1776
1777 int len;
1778 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1779 if (status != U_FILE_ACCESS_ERROR) {
1780 TEST_ASSERT_SUCCESS(status);
1781 TEST_ASSERT(testFile != NULL);
1782 }
1783 if (U_FAILURE(status) || testFile == NULL) {
1784 return; /* something went wrong, error already output */
1785 }
1786 UnicodeString testFileAsString(TRUE, testFile, len);
1787
1788 //
1789 // Parse the test data file using a regular expression.
1790 // Each kind of token is recognized in its own capture group; what type of item was scanned
1791 // is identified by which group had a match.
1792 //
1793 // Caputure Group # 1 2 3 4 5
1794 // Parses this item: divide x hex digits comment \n unrecognized \n
1795 //
1796 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1797 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1798 UnicodeString testString;
1799 UVector32 breakPositions(status);
1800 int lineNumber = 1;
1801 TEST_ASSERT_SUCCESS(status);
1802 if (U_FAILURE(status)) {
1803 return;
1804 }
1805
1806 //
1807 // Scan through each test case, building up the string to be broken in testString,
1808 // and the positions that should be boundaries in the breakPositions vector.
1809 //
1810 int spin = 0;
1811 while (tokenMatcher.find()) {
1812 if(tokenMatcher.hitEnd()) {
1813 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1814 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1815 and caused an infinite loop here on EBCDIC systems!
1816 */
1817 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1818 // return;
1819 }
1820 if (tokenMatcher.start(1, status) >= 0) {
1821 // Scanned a divide sign, indicating a break position in the test data.
1822 if (testString.length()>0) {
1823 breakPositions.addElement(testString.length(), status);
1824 }
1825 }
1826 else if (tokenMatcher.start(2, status) >= 0) {
1827 // Scanned an 'x', meaning no break at this position in the test data
1828 // Nothing to be done here.
1829 }
1830 else if (tokenMatcher.start(3, status) >= 0) {
1831 // Scanned Hex digits. Convert them to binary, append to the character data string.
1832 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1833 int length = hexNumber.length();
1834 if (length<=8) {
1835 char buf[10];
1836 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1837 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1838 if (c<=0x10ffff) {
1839 testString.append(c);
1840 } else {
1841 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1842 fileName, lineNumber);
1843 }
1844 } else {
1845 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1846 fileName, lineNumber);
1847 }
1848 }
1849 else if (tokenMatcher.start(4, status) >= 0) {
1850 // Scanned to end of a line, possibly skipping over a comment in the process.
1851 // If the line from the file contained test data, run the test now.
1852 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1853 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1854 }
1855
1856 // Clear out this test case.
1857 // The string and breakPositions vector will be refilled as the next
1858 // test case is parsed.
1859 testString.remove();
1860 breakPositions.removeAllElements();
1861 lineNumber++;
1862 } else {
1863 // Scanner catchall. Something unrecognized appeared on the line.
1864 char token[16];
1865 UnicodeString uToken = tokenMatcher.group(0, status);
1866 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1867 token[sizeof(token)-1] = 0;
1868 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1869
1870 // Clean up, in preparation for continuing with the next line.
1871 testString.remove();
1872 breakPositions.removeAllElements();
1873 lineNumber++;
1874 }
1875 TEST_ASSERT_SUCCESS(status);
1876 if (U_FAILURE(status)) {
1877 break;
1878 }
1879 }
1880
1881 delete [] testFile;
1882 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1883 }
1884
1885 //--------------------------------------------------------------------------------------------
1886 //
1887 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1888 // test data files. Do only a simple, forward-only check -
1889 // this test is mostly to check that ICU and the Unicode
1890 // data agree with each other.
1891 //
1892 //--------------------------------------------------------------------------------------------
1893 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1894 const UnicodeString &testString, // Text data to be broken
1895 UVector32 *breakPositions, // Positions where breaks should be found.
1896 RuleBasedBreakIterator *bi) {
1897 int32_t pos; // Break Position in the test string
1898 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1899 int32_t expectedPos; // Expected break position (index into test string)
1900
1901 bi->setText(testString);
1902 pos = bi->first();
1903 pos = bi->next();
1904
1905 while (pos != BreakIterator::DONE) {
1906 if (expectedI >= breakPositions->size()) {
1907 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1908 testFileName, lineNumber, pos);
1909 break;
1910 }
1911 expectedPos = breakPositions->elementAti(expectedI);
1912 if (pos < expectedPos) {
1913 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1914 testFileName, lineNumber, pos);
1915 break;
1916 }
1917 if (pos > expectedPos) {
1918 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1919 testFileName, lineNumber, expectedPos);
1920 break;
1921 }
1922 pos = bi->next();
1923 expectedI++;
1924 }
1925
1926 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1927 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1928 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1929 }
1930 }
1931
1932
1933
1934 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1935 //---------------------------------------------------------------------------------------
1936 //
// class RBBIMonkeyKind
1938 //
1939 // Monkey Test for Break Iteration
1940 // Abstract interface class. Concrete derived classes independently
1941 // implement the break rules for different iterator types.
1942 //
// The Monkey Test itself doesn't know which type of break iterator it is
1944 // testing, but works purely in terms of the interface defined here.
1945 //
1946 //---------------------------------------------------------------------------------------
1947 class RBBIMonkeyKind {
1948 public:
1949 // Return a UVector of UnicodeSets, representing the character classes used
1950 // for this type of iterator.
1951 virtual UVector *charClasses() = 0;
1952
1953 // Set the test text on which subsequent calls to next() will operate
1954 virtual void setText(const UnicodeString &s) = 0;
1955
1956 // Find the next break postion, starting from the prev break position, or from zero.
1957 // Return -1 after reaching end of string.
1958 virtual int32_t next(int32_t i) = 0;
1959
1960 virtual ~RBBIMonkeyKind();
1961 UErrorCode deferredStatus;
1962
1963
1964 protected:
1965 RBBIMonkeyKind();
1966
1967 private:
1968 };
1969
1970 RBBIMonkeyKind::RBBIMonkeyKind() {
1971 deferredStatus = U_ZERO_ERROR;
1972 }
1973
1974 RBBIMonkeyKind::~RBBIMonkeyKind() {
1975 }
1976
1977
1978 //----------------------------------------------------------------------------------------
1979 //
1980 // Random Numbers. Similar to standard lib rand() and srand()
1981 // Not using library to
1982 // 1. Get same results on all platforms.
1983 // 2. Get access to current seed, to more easily reproduce failures.
1984 //
1985 //---------------------------------------------------------------------------------------
// Current state of the generator.  Assign to it to (re)seed.
static uint32_t m_seed = 1;

// Advance the linear congruential generator one step and return a value
// in the range 0..32767, taken from bits 16..30 of the new state.
static uint32_t m_rand() {
    m_seed = m_seed * 1103515245u + 12345u;
    return (m_seed >> 16) & 0x7fffu;
}
1993
1994
1995 //------------------------------------------------------------------------------------------
1996 //
1997 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1998 // of RBBIMonkeyKind.
1999 //
2000 //------------------------------------------------------------------------------------------
2001 class RBBICharMonkey: public RBBIMonkeyKind {
2002 public:
2003 RBBICharMonkey();
2004 virtual ~RBBICharMonkey();
2005 virtual UVector *charClasses();
2006 virtual void setText(const UnicodeString &s);
2007 virtual int32_t next(int32_t i);
2008 private:
2009 UVector *fSets;
2010
2011 UnicodeSet *fCRLFSet;
2012 UnicodeSet *fControlSet;
2013 UnicodeSet *fExtendSet;
2014 UnicodeSet *fRegionalIndicatorSet;
2015 UnicodeSet *fPrependSet;
2016 UnicodeSet *fSpacingSet;
2017 UnicodeSet *fLSet;
2018 UnicodeSet *fVSet;
2019 UnicodeSet *fTSet;
2020 UnicodeSet *fLVSet;
2021 UnicodeSet *fLVTSet;
2022 UnicodeSet *fHangulSet;
2023 UnicodeSet *fAnySet;
2024
2025 const UnicodeString *fText;
2026 };
2027
2028
2029 RBBICharMonkey::RBBICharMonkey() {
2030 UErrorCode status = U_ZERO_ERROR;
2031
2032 fText = NULL;
2033
2034 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2035 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
2036 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
2037 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
2038 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2039 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2040 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2041 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2042 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2043 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2044 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2045 fHangulSet = new UnicodeSet();
2046 fHangulSet->addAll(*fLSet);
2047 fHangulSet->addAll(*fVSet);
2048 fHangulSet->addAll(*fTSet);
2049 fHangulSet->addAll(*fLVSet);
2050 fHangulSet->addAll(*fLVTSet);
2051 fAnySet = new UnicodeSet(0, 0x10ffff);
2052
2053 fSets = new UVector(status);
2054 fSets->addElement(fCRLFSet, status);
2055 fSets->addElement(fControlSet, status);
2056 fSets->addElement(fExtendSet, status);
2057 fSets->addElement(fRegionalIndicatorSet, status);
2058 if (!fPrependSet->isEmpty()) {
2059 fSets->addElement(fPrependSet, status);
2060 }
2061 fSets->addElement(fSpacingSet, status);
2062 fSets->addElement(fHangulSet, status);
2063 fSets->addElement(fAnySet, status);
2064 if (U_FAILURE(status)) {
2065 deferredStatus = status;
2066 }
2067 }
2068
2069
2070 void RBBICharMonkey::setText(const UnicodeString &s) {
2071 fText = &s;
2072 }
2073
2074
2075
2076 int32_t RBBICharMonkey::next(int32_t prevPos) {
2077 int p0, p1, p2, p3; // Indices of the significant code points around the
2078 // break position being tested. The candidate break
2079 // location is before p2.
2080
2081 int breakPos = -1;
2082
2083 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2084
2085 if (U_FAILURE(deferredStatus)) {
2086 return -1;
2087 }
2088
2089 // Previous break at end of string. return DONE.
2090 if (prevPos >= fText->length()) {
2091 return -1;
2092 }
2093 p0 = p1 = p2 = p3 = prevPos;
2094 c3 = fText->char32At(prevPos);
2095 c0 = c1 = c2 = 0;
2096 (void)p0; // suppress set but not used warning.
2097 (void)c0;
2098
2099 // Loop runs once per "significant" character position in the input text.
2100 for (;;) {
2101 // Move all of the positions forward in the input string.
2102 p0 = p1; c0 = c1;
2103 p1 = p2; c1 = c2;
2104 p2 = p3; c2 = c3;
2105
2106 // Advancd p3 by one codepoint
2107 p3 = fText->moveIndex32(p3, 1);
2108 c3 = fText->char32At(p3);
2109
2110 if (p1 == p2) {
2111 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2112 continue;
2113 }
2114 if (p2 == fText->length()) {
2115 // Reached end of string. Always a break position.
2116 break;
2117 }
2118
2119 // Rule GB3 CR x LF
2120 // No Extend or Format characters may appear between the CR and LF,
2121 // which requires the additional check for p2 immediately following p1.
2122 //
2123 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2124 continue;
2125 }
2126
2127 // Rule (GB4). ( Control | CR | LF ) <break>
2128 if (fControlSet->contains(c1) ||
2129 c1 == 0x0D ||
2130 c1 == 0x0A) {
2131 break;
2132 }
2133
2134 // Rule (GB5) <break> ( Control | CR | LF )
2135 //
2136 if (fControlSet->contains(c2) ||
2137 c2 == 0x0D ||
2138 c2 == 0x0A) {
2139 break;
2140 }
2141
2142
2143 // Rule (GB6) L x ( L | V | LV | LVT )
2144 if (fLSet->contains(c1) &&
2145 (fLSet->contains(c2) ||
2146 fVSet->contains(c2) ||
2147 fLVSet->contains(c2) ||
2148 fLVTSet->contains(c2))) {
2149 continue;
2150 }
2151
2152 // Rule (GB7) ( LV | V ) x ( V | T )
2153 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2154 (fVSet->contains(c2) || fTSet->contains(c2))) {
2155 continue;
2156 }
2157
2158 // Rule (GB8) ( LVT | T) x T
2159 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2160 fTSet->contains(c2)) {
2161 continue;
2162 }
2163
2164 // Just adding extra Apple rule does here not work, behavior depends on arbitrary context
2165
2166 // Rule (GB8a) Regional_Indicator x Regional_Indicator
2167 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2168 continue;
2169 }
2170
2171 // Rule (GB9) Numeric x ALetter
2172 if (fExtendSet->contains(c2)) {
2173 continue;
2174 }
2175
2176 // Rule (GB9a) x SpacingMark
2177 if (fSpacingSet->contains(c2)) {
2178 continue;
2179 }
2180
2181 // Rule (GB9b) Prepend x
2182 if (fPrependSet->contains(c1)) {
2183 continue;
2184 }
2185
2186 // Rule (GB10) Any <break> Any
2187 break;
2188 }
2189
2190 breakPos = p2;
2191 return breakPos;
2192 }
2193
2194
2195
2196 UVector *RBBICharMonkey::charClasses() {
2197 return fSets;
2198 }
2199
2200
2201 RBBICharMonkey::~RBBICharMonkey() {
2202 delete fSets;
2203 delete fCRLFSet;
2204 delete fControlSet;
2205 delete fExtendSet;
2206 delete fRegionalIndicatorSet;
2207 delete fPrependSet;
2208 delete fSpacingSet;
2209 delete fLSet;
2210 delete fVSet;
2211 delete fTSet;
2212 delete fLVSet;
2213 delete fLVTSet;
2214 delete fHangulSet;
2215 delete fAnySet;
2216 }
2217
2218 //------------------------------------------------------------------------------------------
2219 //
2220 // class RBBIWordMonkey Word Break specific implementation
2221 // of RBBIMonkeyKind.
2222 //
2223 //------------------------------------------------------------------------------------------
2224 class RBBIWordMonkey: public RBBIMonkeyKind {
2225 public:
2226 RBBIWordMonkey();
2227 virtual ~RBBIWordMonkey();
2228 virtual UVector *charClasses();
2229 virtual void setText(const UnicodeString &s);
2230 virtual int32_t next(int32_t i);
2231 private:
2232 UVector *fSets;
2233
2234 UnicodeSet *fCRSet;
2235 UnicodeSet *fLFSet;
2236 UnicodeSet *fNewlineSet;
2237 UnicodeSet *fRegionalIndicatorSet;
2238 UnicodeSet *fKatakanaSet;
2239 UnicodeSet *fHebrew_LetterSet;
2240 UnicodeSet *fALetterSet;
2241 // TODO(jungshik): Do we still need this change?
2242 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
2243 UnicodeSet *fSingle_QuoteSet;
2244 UnicodeSet *fDouble_QuoteSet;
2245 UnicodeSet *fMidNumLetSet;
2246 UnicodeSet *fMidLetterSet;
2247 UnicodeSet *fMidNumSet;
2248 UnicodeSet *fNumericSet;
2249 UnicodeSet *fFormatSet;
2250 UnicodeSet *fOtherSet;
2251 UnicodeSet *fExtendSet;
2252 UnicodeSet *fExtendNumLetSet;
2253 UnicodeSet *fDictionaryCjkSet;
2254
2255 const UnicodeString *fText;
2256 };
2257
2258
2259 RBBIWordMonkey::RBBIWordMonkey()
2260 {
2261 UErrorCode status = U_ZERO_ERROR;
2262
2263 fSets = new UVector(status);
2264
2265 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
2266 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
2267 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
2268 fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2269 // Exclude Hangul syllables from ALetterSet during testing.
2270 // Leave CJK dictionary characters out from the monkey tests!
2271 #if 0
2272 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
2273 "[\\p{Line_Break = Complex_Context}"
2274 "-\\p{Grapheme_Cluster_Break = Extend}"
2275 "-\\p{Grapheme_Cluster_Break = Control}"
2276 "]]",
2277 status);
2278 #endif
2279 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2280 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
2281 fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2282 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2283 fALetterSet->removeAll(*fDictionaryCjkSet);
2284 fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status);
2285 fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status);
2286 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
2287 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
2288 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
2289 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2290 // we should figure out why
2291 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
2292 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
2293 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2294 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
2295
2296 fOtherSet = new UnicodeSet();
2297 if(U_FAILURE(status)) {
2298 deferredStatus = status;
2299 return;
2300 }
2301
2302 fOtherSet->complement();
2303 fOtherSet->removeAll(*fCRSet);
2304 fOtherSet->removeAll(*fLFSet);
2305 fOtherSet->removeAll(*fNewlineSet);
2306 fOtherSet->removeAll(*fKatakanaSet);
2307 fOtherSet->removeAll(*fHebrew_LetterSet);
2308 fOtherSet->removeAll(*fALetterSet);
2309 fOtherSet->removeAll(*fSingle_QuoteSet);
2310 fOtherSet->removeAll(*fDouble_QuoteSet);
2311 fOtherSet->removeAll(*fMidLetterSet);
2312 fOtherSet->removeAll(*fMidNumSet);
2313 fOtherSet->removeAll(*fNumericSet);
2314 fOtherSet->removeAll(*fExtendNumLetSet);
2315 fOtherSet->removeAll(*fFormatSet);
2316 fOtherSet->removeAll(*fExtendSet);
2317 fOtherSet->removeAll(*fRegionalIndicatorSet);
2318 // Inhibit dictionary characters from being tested at all.
2319 fOtherSet->removeAll(*fDictionaryCjkSet);
2320 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2321
2322 fSets->addElement(fCRSet, status);
2323 fSets->addElement(fLFSet, status);
2324 fSets->addElement(fNewlineSet, status);
2325 fSets->addElement(fRegionalIndicatorSet, status);
2326 fSets->addElement(fHebrew_LetterSet, status);
2327 fSets->addElement(fALetterSet, status);
2328 fSets->addElement(fSingle_QuoteSet, status);
2329 fSets->addElement(fDouble_QuoteSet, status);
2330 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
2331 fSets->addElement(fMidLetterSet, status);
2332 fSets->addElement(fMidNumLetSet, status);
2333 fSets->addElement(fMidNumSet, status);
2334 fSets->addElement(fNumericSet, status);
2335 fSets->addElement(fFormatSet, status);
2336 fSets->addElement(fExtendSet, status);
2337 fSets->addElement(fOtherSet, status);
2338 fSets->addElement(fExtendNumLetSet, status);
2339
2340 if (U_FAILURE(status)) {
2341 deferredStatus = status;
2342 }
2343 }
2344
2345 void RBBIWordMonkey::setText(const UnicodeString &s) {
2346 fText = &s;
2347 }
2348
2349
2350 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2351 int p0, p1, p2, p3; // Indices of the significant code points around the
2352 // break position being tested. The candidate break
2353 // location is before p2.
2354
2355 int breakPos = -1;
2356
2357 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2358
2359 if (U_FAILURE(deferredStatus)) {
2360 return -1;
2361 }
2362
2363 // Prev break at end of string. return DONE.
2364 if (prevPos >= fText->length()) {
2365 return -1;
2366 }
2367 p0 = p1 = p2 = p3 = prevPos;
2368 c3 = fText->char32At(prevPos);
2369 c0 = c1 = c2 = 0;
2370 (void)p0; // Suppress set but not used warning.
2371
2372 // Loop runs once per "significant" character position in the input text.
2373 for (;;) {
2374 // Move all of the positions forward in the input string.
2375 p0 = p1; c0 = c1;
2376 p1 = p2; c1 = c2;
2377 p2 = p3; c2 = c3;
2378
2379 // Advancd p3 by X(Extend | Format)* Rule 4
2380 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2381 do {
2382 p3 = fText->moveIndex32(p3, 1);
2383 c3 = fText->char32At(p3);
2384 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2385 break;
2386 };
2387 }
2388 while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2389
2390
2391 if (p1 == p2) {
2392 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2393 continue;
2394 }
2395 if (p2 == fText->length()) {
2396 // Reached end of string. Always a break position.
2397 break;
2398 }
2399
2400 // Rule (3) CR x LF
2401 // No Extend or Format characters may appear between the CR and LF,
2402 // which requires the additional check for p2 immediately following p1.
2403 //
2404 if (c1==0x0D && c2==0x0A) {
2405 continue;
2406 }
2407
2408 // Rule (3a) Break before and after newlines (including CR and LF)
2409 //
2410 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2411 break;
2412 };
2413 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2414 break;
2415 };
2416
2417 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2418 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2419 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2420 continue;
2421 }
2422
2423 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2424 //
2425 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2426 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2427 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2428 continue;
2429 }
2430
2431 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2432 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2433 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2434 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2435 continue;
2436 }
2437
2438 // Rule (7a) Hebrew_Letter x Single_Quote
2439 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2440 continue;
2441 }
2442
2443 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2444 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2445 continue;
2446 }
2447
2448 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2449 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2450 continue;
2451 }
2452
2453 // Rule (8) Numeric x Numeric
2454 if (fNumericSet->contains(c1) &&
2455 fNumericSet->contains(c2)) {
2456 continue;
2457 }
2458
2459 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2460 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2461 fNumericSet->contains(c2)) {
2462 continue;
2463 }
2464
2465 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
2466 if (fNumericSet->contains(c1) &&
2467 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2468 continue;
2469 }
2470
2471 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
2472 if (fNumericSet->contains(c0) &&
2473 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2474 fNumericSet->contains(c2)) {
2475 continue;
2476 }
2477
2478 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2479 if (fNumericSet->contains(c1) &&
2480 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2481 fNumericSet->contains(c3)) {
2482 continue;
2483 }
2484
2485 // Rule (13) Katakana x Katakana
2486 if (fKatakanaSet->contains(c1) &&
2487 fKatakanaSet->contains(c2)) {
2488 continue;
2489 }
2490
2491 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2492 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2493 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2494 fExtendNumLetSet->contains(c2)) {
2495 continue;
2496 }
2497
2498 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2499 if (fExtendNumLetSet->contains(c1) &&
2500 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2501 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2502 continue;
2503 }
2504
2505 // Rule 13c
2506 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2507 continue;
2508 }
2509
2510 // Rule 14. Break found here.
2511 break;
2512 }
2513
2514 breakPos = p2;
2515 return breakPos;
2516 }
2517
2518
2519 UVector *RBBIWordMonkey::charClasses() {
2520 return fSets;
2521 }
2522
2523
2524 RBBIWordMonkey::~RBBIWordMonkey() {
2525 delete fSets;
2526 delete fCRSet;
2527 delete fLFSet;
2528 delete fNewlineSet;
2529 delete fKatakanaSet;
2530 delete fHebrew_LetterSet;
2531 delete fALetterSet;
2532 delete fSingle_QuoteSet;
2533 delete fDouble_QuoteSet;
2534 delete fMidNumLetSet;
2535 delete fMidLetterSet;
2536 delete fMidNumSet;
2537 delete fNumericSet;
2538 delete fFormatSet;
2539 delete fExtendSet;
2540 delete fExtendNumLetSet;
2541 delete fRegionalIndicatorSet;
2542 delete fDictionaryCjkSet;
2543 delete fOtherSet;
2544 }
2545
2546
2547
2548
2549 //------------------------------------------------------------------------------------------
2550 //
2551 // class RBBISentMonkey Sentence Break specific implementation
2552 // of RBBIMonkeyKind.
2553 //
2554 //------------------------------------------------------------------------------------------
2555 class RBBISentMonkey: public RBBIMonkeyKind {
2556 public:
2557 RBBISentMonkey();
2558 virtual ~RBBISentMonkey();
2559 virtual UVector *charClasses();
2560 virtual void setText(const UnicodeString &s);
2561 virtual int32_t next(int32_t i);
2562 private:
2563 int moveBack(int posFrom);
2564 int moveForward(int posFrom);
2565 UChar32 cAt(int pos);
2566
2567 UVector *fSets;
2568
2569 UnicodeSet *fSepSet;
2570 UnicodeSet *fFormatSet;
2571 UnicodeSet *fSpSet;
2572 UnicodeSet *fLowerSet;
2573 UnicodeSet *fUpperSet;
2574 UnicodeSet *fOLetterSet;
2575 UnicodeSet *fNumericSet;
2576 UnicodeSet *fATermSet;
2577 UnicodeSet *fSContinueSet;
2578 UnicodeSet *fSTermSet;
2579 UnicodeSet *fCloseSet;
2580 UnicodeSet *fOtherSet;
2581 UnicodeSet *fExtendSet;
2582
2583 const UnicodeString *fText;
2584
2585 };
2586
2587 RBBISentMonkey::RBBISentMonkey()
2588 {
2589 UErrorCode status = U_ZERO_ERROR;
2590
2591 fSets = new UVector(status);
2592
2593 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2594 // set and made into character classes of their own. For the monkey impl,
2595 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2596 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2597 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2598 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2599 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2600 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2601 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2602 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2603 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2604 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2605 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2606 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2607 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2608 fOtherSet = new UnicodeSet();
2609
2610 if(U_FAILURE(status)) {
2611 deferredStatus = status;
2612 return;
2613 }
2614
2615 fOtherSet->complement();
2616 fOtherSet->removeAll(*fSepSet);
2617 fOtherSet->removeAll(*fFormatSet);
2618 fOtherSet->removeAll(*fSpSet);
2619 fOtherSet->removeAll(*fLowerSet);
2620 fOtherSet->removeAll(*fUpperSet);
2621 fOtherSet->removeAll(*fOLetterSet);
2622 fOtherSet->removeAll(*fNumericSet);
2623 fOtherSet->removeAll(*fATermSet);
2624 fOtherSet->removeAll(*fSContinueSet);
2625 fOtherSet->removeAll(*fSTermSet);
2626 fOtherSet->removeAll(*fCloseSet);
2627 fOtherSet->removeAll(*fExtendSet);
2628
2629 fSets->addElement(fSepSet, status);
2630 fSets->addElement(fFormatSet, status);
2631 fSets->addElement(fSpSet, status);
2632 fSets->addElement(fLowerSet, status);
2633 fSets->addElement(fUpperSet, status);
2634 fSets->addElement(fOLetterSet, status);
2635 fSets->addElement(fNumericSet, status);
2636 fSets->addElement(fATermSet, status);
2637 fSets->addElement(fSContinueSet, status);
2638 fSets->addElement(fSTermSet, status);
2639 fSets->addElement(fCloseSet, status);
2640 fSets->addElement(fOtherSet, status);
2641 fSets->addElement(fExtendSet, status);
2642
2643 if (U_FAILURE(status)) {
2644 deferredStatus = status;
2645 }
2646 }
2647
2648
2649
2650 void RBBISentMonkey::setText(const UnicodeString &s) {
2651 fText = &s;
2652 }
2653
2654 UVector *RBBISentMonkey::charClasses() {
2655 return fSets;
2656 }
2657
2658
2659 // moveBack() Find the "significant" code point preceding the index i.
2660 // Skips over ($Extend | $Format)* .
2661 //
2662 int RBBISentMonkey::moveBack(int i) {
2663 if (i <= 0) {
2664 return -1;
2665 }
2666 UChar32 c;
2667 int32_t j = i;
2668 do {
2669 j = fText->moveIndex32(j, -1);
2670 c = fText->char32At(j);
2671 }
2672 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2673 return j;
2674
2675 }
2676
2677
2678 int RBBISentMonkey::moveForward(int i) {
2679 if (i>=fText->length()) {
2680 return fText->length();
2681 }
2682 UChar32 c;
2683 int32_t j = i;
2684 do {
2685 j = fText->moveIndex32(j, 1);
2686 c = cAt(j);
2687 }
2688 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2689 return j;
2690 }
2691
2692 UChar32 RBBISentMonkey::cAt(int pos) {
2693 if (pos<0 || pos>=fText->length()) {
2694 return -1;
2695 } else {
2696 return fText->char32At(pos);
2697 }
2698 }
2699
2700 int32_t RBBISentMonkey::next(int32_t prevPos) {
2701 int p0, p1, p2, p3; // Indices of the significant code points around the
2702 // break position being tested. The candidate break
2703 // location is before p2.
2704
2705 int breakPos = -1;
2706
2707 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2708 UChar32 c;
2709
2710 if (U_FAILURE(deferredStatus)) {
2711 return -1;
2712 }
2713
2714 // Prev break at end of string. return DONE.
2715 if (prevPos >= fText->length()) {
2716 return -1;
2717 }
2718 p0 = p1 = p2 = p3 = prevPos;
2719 c3 = fText->char32At(prevPos);
2720 c0 = c1 = c2 = 0;
2721 (void)p0; // Suppress set but not used warning.
2722
2723 // Loop runs once per "significant" character position in the input text.
2724 for (;;) {
2725 // Move all of the positions forward in the input string.
2726 p0 = p1; c0 = c1;
2727 p1 = p2; c1 = c2;
2728 p2 = p3; c2 = c3;
2729
2730 // Advancd p3 by X(Extend | Format)* Rule 4
2731 p3 = moveForward(p3);
2732 c3 = cAt(p3);
2733
2734 // Rule (3) CR x LF
2735 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2736 continue;
2737 }
2738
2739 // Rule (4). Sep <break>
2740 if (fSepSet->contains(c1)) {
2741 p2 = p1+1; // Separators don't combine with Extend or Format.
2742 break;
2743 }
2744
2745 if (p2 >= fText->length()) {
2746 // Reached end of string. Always a break position.
2747 break;
2748 }
2749
2750 if (p2 == prevPos) {
2751 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2752 continue;
2753 }
2754
2755 // Rule (6). ATerm x Numeric
2756 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2757 continue;
2758 }
2759
2760 // Rule (7). Upper ATerm x Uppper
2761 if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2762 continue;
2763 }
2764
2765 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2766 // Note: STerm | ATerm are added to the negated part of the expression by a
2767 // note to the Unicode 5.0 documents.
2768 int p8 = p1;
2769 while (fSpSet->contains(cAt(p8))) {
2770 p8 = moveBack(p8);
2771 }
2772 while (fCloseSet->contains(cAt(p8))) {
2773 p8 = moveBack(p8);
2774 }
2775 if (fATermSet->contains(cAt(p8))) {
2776 p8=p2;
2777 for (;;) {
2778 c = cAt(p8);
2779 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2780 fLowerSet->contains(c) || fSepSet->contains(c) ||
2781 fATermSet->contains(c) || fSTermSet->contains(c)) {
2782 break;
2783 }
2784 p8 = moveForward(p8);
2785 }
2786 if (fLowerSet->contains(cAt(p8))) {
2787 continue;
2788 }
2789 }
2790
2791 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2792 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2793 p8 = p1;
2794 while (fSpSet->contains(cAt(p8))) {
2795 p8 = moveBack(p8);
2796 }
2797 while (fCloseSet->contains(cAt(p8))) {
2798 p8 = moveBack(p8);
2799 }
2800 c = cAt(p8);
2801 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2802 continue;
2803 }
2804 }
2805
2806 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2807 int p9 = p1;
2808 while (fCloseSet->contains(cAt(p9))) {
2809 p9 = moveBack(p9);
2810 }
2811 c = cAt(p9);
2812 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2813 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2814 continue;
2815 }
2816 }
2817
2818 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2819 int p10 = p1;
2820 while (fSpSet->contains(cAt(p10))) {
2821 p10 = moveBack(p10);
2822 }
2823 while (fCloseSet->contains(cAt(p10))) {
2824 p10 = moveBack(p10);
2825 }
2826 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2827 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2828 continue;
2829 }
2830 }
2831
2832 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
2833 int p11 = p1;
2834 if (fSepSet->contains(cAt(p11))) {
2835 p11 = moveBack(p11);
2836 }
2837 while (fSpSet->contains(cAt(p11))) {
2838 p11 = moveBack(p11);
2839 }
2840 while (fCloseSet->contains(cAt(p11))) {
2841 p11 = moveBack(p11);
2842 }
2843 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2844 break;
2845 }
2846
2847 // Rule (12) Any x Any
2848 continue;
2849 }
2850 breakPos = p2;
2851 return breakPos;
2852 }
2853
2854 RBBISentMonkey::~RBBISentMonkey() {
2855 delete fSets;
2856 delete fSepSet;
2857 delete fFormatSet;
2858 delete fSpSet;
2859 delete fLowerSet;
2860 delete fUpperSet;
2861 delete fOLetterSet;
2862 delete fNumericSet;
2863 delete fATermSet;
2864 delete fSContinueSet;
2865 delete fSTermSet;
2866 delete fCloseSet;
2867 delete fOtherSet;
2868 delete fExtendSet;
2869 }
2870
2871
2872
2873 //-------------------------------------------------------------------------------------------
2874 //
2875 // RBBILineMonkey
2876 //
2877 //-------------------------------------------------------------------------------------------
2878
2879 class RBBILineMonkey: public RBBIMonkeyKind {
2880 public:
2881 RBBILineMonkey();
2882 virtual ~RBBILineMonkey();
2883 virtual UVector *charClasses();
2884 virtual void setText(const UnicodeString &s);
2885 virtual int32_t next(int32_t i);
2886 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2887 private:
2888 UVector *fSets;
2889
2890 UnicodeSet *fBK;
2891 UnicodeSet *fCR;
2892 UnicodeSet *fLF;
2893 UnicodeSet *fCM;
2894 UnicodeSet *fNL;
2895 UnicodeSet *fSG;
2896 UnicodeSet *fWJ;
2897 UnicodeSet *fZW;
2898 UnicodeSet *fGL;
2899 UnicodeSet *fCB;
2900 UnicodeSet *fSP;
2901 UnicodeSet *fB2;
2902 UnicodeSet *fBA;
2903 UnicodeSet *fBB;
2904 UnicodeSet *fHY;
2905 UnicodeSet *fH2;
2906 UnicodeSet *fH3;
2907 UnicodeSet *fCL;
2908 UnicodeSet *fCP;
2909 UnicodeSet *fEX;
2910 UnicodeSet *fIN;
2911 UnicodeSet *fJL;
2912 UnicodeSet *fJV;
2913 UnicodeSet *fJT;
2914 UnicodeSet *fNS;
2915 UnicodeSet *fOP;
2916 UnicodeSet *fQU;
2917 UnicodeSet *fIS;
2918 UnicodeSet *fNU;
2919 UnicodeSet *fPO;
2920 UnicodeSet *fPR;
2921 UnicodeSet *fSY;
2922 UnicodeSet *fAI;
2923 UnicodeSet *fAL;
2924 UnicodeSet *fCJ;
2925 UnicodeSet *fHL;
2926 UnicodeSet *fID;
2927 UnicodeSet *fRI;
2928 UnicodeSet *fSA;
2929 UnicodeSet *fXX;
2930
2931 BreakIterator *fCharBI;
2932 const UnicodeString *fText;
2933 RegexMatcher *fNumberMatcher;
2934 };
2935
2936
2937 RBBILineMonkey::RBBILineMonkey()
2938 {
2939 UErrorCode status = U_ZERO_ERROR;
2940
2941 fSets = new UVector(status);
2942
2943 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2944 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2945 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2946 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2947 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2948 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2949 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2950 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2951 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2952 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2953 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2954 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2955 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2956 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2957 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2958 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2959 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2960 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2961 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2962 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2963 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2964 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2965 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2966 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2967 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2968 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2969 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2970 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2971 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2972 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2973 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2974 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2975 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2976 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2977 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2978 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2979 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2980 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2981 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2982 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2983
2984 if (U_FAILURE(status)) {
2985 deferredStatus = status;
2986 fCharBI = NULL;
2987 fNumberMatcher = NULL;
2988 return;
2989 }
2990
2991 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2992 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2993 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL
2994 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2995
2996 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
2997
2998 fSets->addElement(fBK, status);
2999 fSets->addElement(fCR, status);
3000 fSets->addElement(fLF, status);
3001 fSets->addElement(fCM, status);
3002 fSets->addElement(fNL, status);
3003 fSets->addElement(fWJ, status);
3004 fSets->addElement(fZW, status);
3005 fSets->addElement(fGL, status);
3006 fSets->addElement(fCB, status);
3007 fSets->addElement(fSP, status);
3008 fSets->addElement(fB2, status);
3009 fSets->addElement(fBA, status);
3010 fSets->addElement(fBB, status);
3011 fSets->addElement(fHY, status);
3012 fSets->addElement(fH2, status);
3013 fSets->addElement(fH3, status);
3014 fSets->addElement(fCL, status);
3015 fSets->addElement(fCP, status);
3016 fSets->addElement(fEX, status);
3017 fSets->addElement(fIN, status);
3018 fSets->addElement(fJL, status);
3019 fSets->addElement(fJT, status);
3020 fSets->addElement(fJV, status);
3021 fSets->addElement(fNS, status);
3022 fSets->addElement(fOP, status);
3023 fSets->addElement(fQU, status);
3024 fSets->addElement(fIS, status);
3025 fSets->addElement(fNU, status);
3026 fSets->addElement(fPO, status);
3027 fSets->addElement(fPR, status);
3028 fSets->addElement(fSY, status);
3029 fSets->addElement(fAI, status);
3030 fSets->addElement(fAL, status);
3031 fSets->addElement(fHL, status);
3032 fSets->addElement(fID, status);
3033 fSets->addElement(fWJ, status);
3034 fSets->addElement(fRI, status);
3035 fSets->addElement(fSA, status);
3036 fSets->addElement(fSG, status);
3037
3038 const char *rules =
3039 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3040 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3041 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3042 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3043 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3044 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3045
3046 fNumberMatcher = new RegexMatcher(
3047 UnicodeString(rules, -1, US_INV), 0, status);
3048
3049 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3050
3051 if (U_FAILURE(status)) {
3052 deferredStatus = status;
3053 }
3054 }
3055
3056
3057 void RBBILineMonkey::setText(const UnicodeString &s) {
3058 fText = &s;
3059 fCharBI->setText(s);
3060 fNumberMatcher->reset(s);
3061 }
3062
3063 //
3064 // rule9Adjust
3065 // Line Break TR rules 9 and 10 implementation.
3066 // This deals with combining marks and other sequences that
3067 // that must be treated as if they were something other than what they actually are.
3068 //
3069 // This is factored out into a separate function because it must be applied twice for
3070 // each potential break, once to the chars before the position being checked, then
3071 // again to the text following the possible break.
3072 //
3073 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3074 if (pos == -1) {
3075 // Invalid initial position. Happens during the warmup iteration of the
3076 // main loop in next().
3077 return;
3078 }
3079
3080 int32_t nPos = *nextPos;
3081
3082 // LB 9 Keep combining sequences together.
3083 // advance over any CM class chars. Note that Line Break CM is different
3084 // from the normal Grapheme Extend property.
3085 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3086 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3087 for (;;) {
3088 *nextChar = fText->char32At(nPos);
3089 if (!fCM->contains(*nextChar)) {
3090 break;
3091 }
3092 nPos = fText->moveIndex32(nPos, 1);
3093 }
3094 }
3095
3096
3097 // LB 9 Treat X CM* as if it were x.
3098 // No explicit action required.
3099
3100 // LB 10 Treat any remaining combining mark as AL
3101 if (fCM->contains(*posChar)) {
3102 *posChar = 0x41; // thisChar = 'A';
3103 }
3104
3105 // Push the updated nextPos and nextChar back to our caller.
3106 // This only makes a difference if posChar got bigger by consuming a
3107 // combining sequence.
3108 *nextPos = nPos;
3109 *nextChar = fText->char32At(nPos);
3110 }
3111
3112
3113
3114 int32_t RBBILineMonkey::next(int32_t startPos) {
3115 UErrorCode status = U_ZERO_ERROR;
3116 int32_t pos; // Index of the char following a potential break position
3117 UChar32 thisChar; // Character at above position "pos"
3118
3119 int32_t prevPos; // Index of the char preceding a potential break position
3120 UChar32 prevChar; // Character at above position. Note that prevChar
3121 // and thisChar may not be adjacent because combining
3122 // characters between them will be ignored.
3123
3124 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
3125 UChar32 prevCharX2;
3126
3127 int32_t nextPos; // Index of the next character following pos.
3128 // Usually skips over combining marks.
3129 int32_t nextCPPos; // Index of the code point following "pos."
3130 // May point to a combining mark.
3131 int32_t tPos; // temp value.
3132 UChar32 c;
3133
3134 if (U_FAILURE(deferredStatus)) {
3135 return -1;
3136 }
3137
3138 if (startPos >= fText->length()) {
3139 return -1;
3140 }
3141
3142
3143 // Initial values for loop. Loop will run the first time without finding breaks,
3144 // while the invalid values shift out and the "this" and
3145 // "prev" positions are filled in with good values.
3146 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
3147 thisChar = prevChar = prevCharX2 = 0;
3148 nextPos = nextCPPos = startPos;
3149
3150
3151 // Loop runs once per position in the test text, until a break position
3152 // is found.
3153 for (;;) {
3154 prevPosX2 = prevPos;
3155 prevCharX2 = prevChar;
3156
3157 prevPos = pos;
3158 prevChar = thisChar;
3159
3160 pos = nextPos;
3161 thisChar = fText->char32At(pos);
3162
3163 nextCPPos = fText->moveIndex32(pos, 1);
3164 nextPos = nextCPPos;
3165
3166 // Rule LB2 - Break at end of text.
3167 if (pos >= fText->length()) {
3168 break;
3169 }
3170
3171 // Rule LB 9 - adjust for combining sequences.
3172 // We do this one out-of-order because the adjustment does not change anything
3173 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3174 // be applied.
3175 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
3176 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3177 c = fText->char32At(nextPos);
3178 rule9Adjust(pos, &thisChar, &nextPos, &c);
3179
3180 // If the loop is still warming up - if we haven't shifted the initial
3181 // -1 positions out of prevPos yet - loop back to advance the
3182 // position in the input without any further looking for breaks.
3183 if (prevPos == -1) {
3184 continue;
3185 }
3186
3187 // LB 4 Always break after hard line breaks,
3188 if (fBK->contains(prevChar)) {
3189 break;
3190 }
3191
3192 // LB 5 Break after CR, LF, NL, but not inside CR LF
3193 if (prevChar == 0x0d && thisChar == 0x0a) {
3194 continue;
3195 }
3196 if (prevChar == 0x0d ||
3197 prevChar == 0x0a ||
3198 prevChar == 0x85) {
3199 break;
3200 }
3201
3202 // LB 6 Don't break before hard line breaks
3203 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3204 fBK->contains(thisChar)) {
3205 continue;
3206 }
3207
3208
3209 // LB 7 Don't break before spaces or zero-width space.
3210 if (fSP->contains(thisChar)) {
3211 continue;
3212 }
3213
3214 if (fZW->contains(thisChar)) {
3215 continue;
3216 }
3217
3218 // LB 8 Break after zero width space
3219 if (fZW->contains(prevChar)) {
3220 break;
3221 }
3222
3223 // LB 9, 10 Already done, at top of loop.
3224 //
3225
3226
3227 // LB 11 Do not break before or after WORD JOINER and related characters.
3228 // x WJ
3229 // WJ x
3230 //
3231 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3232 continue;
3233 }
3234
3235 // LB 12
3236 // GL x
3237 if (fGL->contains(prevChar)) {
3238 continue;
3239 }
3240
3241 // LB 12a
3242 // [^SP BA HY] x GL
3243 if (!(fSP->contains(prevChar) ||
3244 fBA->contains(prevChar) ||
3245 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3246 continue;
3247 }
3248
3249
3250
3251 // LB 13 Don't break before closings.
3252 // NU x CL, NU x CP and NU x IS are not matched here so that they will
3253 // fall into LB 17 and the more general number regular expression.
3254 //
3255 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3256 (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3257 fEX->contains(thisChar) ||
3258 (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3259 (!fNU->contains(prevChar) && fSY->contains(thisChar))) {
3260 continue;
3261 }
3262
3263 // LB 14 Don't break after OP SP*
3264 // Scan backwards, checking for this sequence.
3265 // The OP char could include combining marks, so we actually check for
3266 // OP CM* SP*
3267 // Another Twist: The Rule 67 fixes may have changed a SP CM
3268 // sequence into a ID char, so before scanning back through spaces,
3269 // verify that prevChar is indeed a space. The prevChar variable
3270 // may differ from fText[prevPos]
3271 tPos = prevPos;
3272 if (fSP->contains(prevChar)) {
3273 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3274 tPos=fText->moveIndex32(tPos, -1);
3275 }
3276 }
3277 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3278 tPos=fText->moveIndex32(tPos, -1);
3279 }
3280 if (fOP->contains(fText->char32At(tPos))) {
3281 continue;
3282 }
3283
3284
3285 // LB 15 QU SP* x OP
3286 if (fOP->contains(thisChar)) {
3287 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3288 int tPos = prevPos;
3289 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3290 tPos = fText->moveIndex32(tPos, -1);
3291 }
3292 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3293 tPos = fText->moveIndex32(tPos, -1);
3294 }
3295 if (fQU->contains(fText->char32At(tPos))) {
3296 continue;
3297 }
3298 }
3299
3300
3301
3302 // LB 16 (CL | CP) SP* x NS
3303 // Scan backwards for SP* CM* (CL | CP)
3304 if (fNS->contains(thisChar)) {
3305 int tPos = prevPos;
3306 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3307 tPos = fText->moveIndex32(tPos, -1);
3308 }
3309 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3310 tPos = fText->moveIndex32(tPos, -1);
3311 }
3312 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3313 continue;
3314 }
3315 }
3316
3317
3318 // LB 17 B2 SP* x B2
3319 if (fB2->contains(thisChar)) {
3320 // Scan backwards, checking for the B2 CM* SP* sequence.
3321 tPos = prevPos;
3322 if (fSP->contains(prevChar)) {
3323 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3324 tPos=fText->moveIndex32(tPos, -1);
3325 }
3326 }
3327 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3328 tPos=fText->moveIndex32(tPos, -1);
3329 }
3330 if (fB2->contains(fText->char32At(tPos))) {
3331 continue;
3332 }
3333 }
3334
3335
3336 // LB 18 break after space
3337 if (fSP->contains(prevChar)) {
3338 break;
3339 }
3340
3341 // LB 19
3342 // x QU
3343 // QU x
3344 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3345 continue;
3346 }
3347
3348 // LB 20 Break around a CB
3349 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3350 break;
3351 }
3352
3353 // LB 21
3354 if (fBA->contains(thisChar) ||
3355 fHY->contains(thisChar) ||
3356 fNS->contains(thisChar) ||
3357 fBB->contains(prevChar) ) {
3358 continue;
3359 }
3360
3361 // LB 21a
3362 // HL (HY | BA) x
3363 if (fHL->contains(prevCharX2) &&
3364 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3365 continue;
3366 }
3367
3368 // LB 21b
3369 // SY x HL
3370 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3371 continue;
3372 }
3373
3374 // LB 22
3375 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3376 (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3377 (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3378 (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3379 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
3380 continue;
3381 }
3382
3383
3384 // LB 23 ID x PO
3385 // AL x NU
3386 // HL x NU
3387 // NU x AL
3388 if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3389 (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3390 (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3391 (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3392 (fNU->contains(prevChar) && fHL->contains(thisChar)) ) {
3393 continue;
3394 }
3395
3396 // LB 24 Do not break between prefix and letters or ideographs.
3397 // PR x ID
3398 // PR x (AL | HL)
3399 // PO x (AL | HL)
3400 if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3401 (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3402 (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar)))) {
3403 continue;
3404 }
3405
3406
3407
3408 // LB 25 Numbers
3409 if (fNumberMatcher->lookingAt(prevPos, status)) {
3410 if (U_FAILURE(status)) {
3411 break;
3412 }
3413 // Matched a number. But could have been just a single digit, which would
3414 // not represent a "no break here" between prevChar and thisChar
3415 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3416 if (numEndIdx > pos) {
3417 // Number match includes at least our two chars being checked
3418 if (numEndIdx > nextPos) {
3419 // Number match includes additional chars. Update pos and nextPos
3420 // so that next loop iteration will continue at the end of the number,
3421 // checking for breaks between last char in number & whatever follows.
3422 pos = nextPos = numEndIdx;
3423 do {
3424 pos = fText->moveIndex32(pos, -1);
3425 thisChar = fText->char32At(pos);
3426 } while (fCM->contains(thisChar));
3427 }
3428 continue;
3429 }
3430 }
3431
3432
3433 // LB 26 Do not break a Korean syllable.
3434 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3435 fJV->contains(thisChar) ||
3436 fH2->contains(thisChar) ||
3437 fH3->contains(thisChar))) {
3438 continue;
3439 }
3440
3441 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3442 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3443 continue;
3444 }
3445
3446 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3447 fJT->contains(thisChar)) {
3448 continue;
3449 }
3450
3451 // LB 27 Treat a Korean Syllable Block the same as ID.
3452 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3453 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3454 fIN->contains(thisChar)) {
3455 continue;
3456 }
3457 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3458 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3459 fPO->contains(thisChar)) {
3460 continue;
3461 }
3462 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3463 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3464 continue;
3465 }
3466
3467
3468
3469 // LB 28 Do not break between alphabetics ("at").
3470 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3471 continue;
3472 }
3473
3474 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3475 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3476 continue;
3477 }
3478
3479 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3480 // (AL | NU) x OP
3481 // CP x (AL | NU)
3482 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3483 continue;
3484 }
3485 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3486 continue;
3487 }
3488
3489 // LB30a Do not break between regional indicators.
3490 // RI x RI
3491 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3492 continue;
3493 }
3494
3495 // LB 31 Break everywhere else
3496 break;
3497
3498 }
3499
3500 return pos;
3501 }
3502
3503
3504 UVector *RBBILineMonkey::charClasses() {
3505 return fSets;
3506 }
3507
3508
3509 RBBILineMonkey::~RBBILineMonkey() {
3510 delete fSets;
3511
3512 delete fBK;
3513 delete fCR;
3514 delete fLF;
3515 delete fCM;
3516 delete fNL;
3517 delete fWJ;
3518 delete fZW;
3519 delete fGL;
3520 delete fCB;
3521 delete fSP;
3522 delete fB2;
3523 delete fBA;
3524 delete fBB;
3525 delete fHY;
3526 delete fH2;
3527 delete fH3;
3528 delete fCL;
3529 delete fCP;
3530 delete fEX;
3531 delete fIN;
3532 delete fJL;
3533 delete fJV;
3534 delete fJT;
3535 delete fNS;
3536 delete fOP;
3537 delete fQU;
3538 delete fIS;
3539 delete fNU;
3540 delete fPO;
3541 delete fPR;
3542 delete fSY;
3543 delete fAI;
3544 delete fAL;
3545 delete fCJ;
3546 delete fHL;
3547 delete fID;
3548 delete fRI;
3549 delete fSA;
3550 delete fSG;
3551 delete fXX;
3552
3553 delete fCharBI;
3554 delete fNumberMatcher;
3555 }
3556
3557
3558 //-------------------------------------------------------------------------------------------
3559 //
3560 // TestMonkey
3561 //
3562 // params
3563 // seed=nnnnn Random number starting seed.
3564 // Setting the seed allows errors to be reproduced.
3565 // loop=nnn Looping count. Controls running time.
3566 // -1: run forever.
3567 // 0 or greater: run length.
3568 //
3569 // type = char | word | line | sent | title
3570 //
3571 //-------------------------------------------------------------------------------------------
3572
3573 static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3574 int32_t val = defaultVal;
3575 name.append(" *= *(-?\\d+)");
3576 UErrorCode status = U_ZERO_ERROR;
3577 RegexMatcher m(name, params, 0, status);
3578 if (m.find()) {
3579 // The param exists. Convert the string to an int.
3580 char valString[100];
3581 int32_t paramLength = m.end(1, status) - m.start(1, status);
3582 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3583 paramLength = (int32_t)(sizeof(valString)-2);
3584 }
3585 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3586 val = strtol(valString, NULL, 10);
3587
3588 // Delete this parameter from the params string.
3589 m.reset();
3590 params = m.replaceFirst("", status);
3591 }
3592 U_ASSERT(U_SUCCESS(status));
3593 return val;
3594 }
3595 #endif
3596
3597 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3598 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3599 BreakIterator *bi,
3600 int expected[],
3601 int expectedcount)
3602 {
3603 int count = 0;
3604 int i = 0;
3605 int forward[50];
3606 bi->setText(ustr);
3607 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3608 forward[count] = i;
3609 if (count < expectedcount && expected[count] != i) {
3610 test->errln("break forward test failed: expected %d but got %d",
3611 expected[count], i);
3612 break;
3613 }
3614 count ++;
3615 }
3616 if (count != expectedcount) {
3617 printStringBreaks(ustr, expected, expectedcount);
3618 test->errln("break forward test failed: missed %d match",
3619 expectedcount - count);
3620 return;
3621 }
3622 // testing boundaries
3623 for (i = 1; i < expectedcount; i ++) {
3624 int j = expected[i - 1];
3625 if (!bi->isBoundary(j)) {
3626 printStringBreaks(ustr, expected, expectedcount);
3627 test->errln("isBoundary() failed. Expected boundary at position %d", j);
3628 return;
3629 }
3630 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3631 if (bi->isBoundary(j)) {
3632 printStringBreaks(ustr, expected, expectedcount);
3633 test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
3634 return;
3635 }
3636 }
3637 }
3638
3639 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3640 count --;
3641 if (forward[count] != i) {
3642 printStringBreaks(ustr, expected, expectedcount);
3643 test->errln("happy break test previous() failed: expected %d but got %d",
3644 forward[count], i);
3645 break;
3646 }
3647 }
3648 if (count != 0) {
3649 printStringBreaks(ustr, expected, expectedcount);
3650 test->errln("break test previous() failed: missed a match");
3651 return;
3652 }
3653
3654 // testing preceding
3655 for (i = 0; i < expectedcount - 1; i ++) {
3656 // int j = expected[i] + 1;
3657 int j = ustr.moveIndex32(expected[i], 1);
3658 for (; j <= expected[i + 1]; j ++) {
3659 if (bi->preceding(j) != expected[i]) {
3660 printStringBreaks(ustr, expected, expectedcount);
3661 test->errln("preceding(): Not expecting boundary at position %d", j);
3662 return;
3663 }
3664 }
3665 }
3666 }
3667 #endif
3668
3669 void RBBITest::TestWordBreaks(void)
3670 {
3671 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3672
3673 Locale locale("en");
3674 UErrorCode status = U_ZERO_ERROR;
3675 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3676 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3677 // Replaced any C+J characters in a row with a random sequence of characters
3678 // of the same length to make our C+J segmentation not get in the way.
3679 static const char *strlist[] =
3680 {
3681 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3682 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3683 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3684 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3685 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3686 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3687 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3688 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3689 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3690 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3691 "\\u2027\\U000e0067\\u0a47\\u00b7",
3692 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3693 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3694 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3695 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3696 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3697 "\\u0027\\u11af\\U000e0057\\u0602",
3698 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3699 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3700 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3701 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3702 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3703 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3704 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3705 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3706 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3707 "\\u18f4\\U000e0049\\u20e7\\u2027",
3708 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3709 "\\ua183\\u102d\\u0bec\\u003a",
3710 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3711 "\\u003a\\u0e57\\u0fad\\u002e",
3712 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3713 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3714 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3715 "\\u003a\\u0664\\u00b7\\u1fba",
3716 "\\u003b\\u0027\\u00b7\\u47a3",
3717 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3718 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3719 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3720 };
3721 int loop;
3722 if (U_FAILURE(status)) {
3723 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3724 return;
3725 }
3726 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3727 // printf("looping %d\n", loop);
3728 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3729 // RBBICharMonkey monkey;
3730 RBBIWordMonkey monkey;
3731
3732 int expected[50];
3733 int expectedcount = 0;
3734
3735 monkey.setText(ustr);
3736 int i;
3737 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3738 expected[expectedcount ++] = i;
3739 }
3740
3741 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3742 }
3743 delete bi;
3744 #endif
3745 }
3746
3747 void RBBITest::TestWordBoundary(void)
3748 {
3749 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3750 Locale locale("en");
3751 UErrorCode status = U_ZERO_ERROR;
3752 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3753 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3754 UChar str[50];
3755 static const char *strlist[] =
3756 {
3757 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3758 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3759 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3760 "\\u2027\\U000e0067\\u0a47\\u00b7",
3761 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3762 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3763 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3764 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3765 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3766 "\\u0027\\u11af\\U000e0057\\u0602",
3767 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3768 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3769 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3770 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3771 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3772 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3773 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3774 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3775 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3776 "\\u58f4\\U000e0049\\u20e7\\u2027",
3777 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3778 "\\ua183\\u102d\\u0bec\\u003a",
3779 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3780 "\\u003a\\u0e57\\u0fad\\u002e",
3781 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3782 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3783 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3784 "\\u003a\\u0664\\u00b7\\u1fba",
3785 "\\u003b\\u0027\\u00b7\\u47a3",
3786 };
3787 int loop;
3788 if (U_FAILURE(status)) {
3789 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3790 return;
3791 }
3792 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3793 // printf("looping %d\n", loop);
3794 u_unescape(strlist[loop], str, 20);
3795 UnicodeString ustr(str);
3796 int forward[50];
3797 int count = 0;
3798
3799 bi->setText(ustr);
3800 int prev = 0;
3801 int i;
3802 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3803 forward[count ++] = i;
3804 if (i > prev) {
3805 int j;
3806 for (j = prev + 1; j < i; j ++) {
3807 if (bi->isBoundary(j)) {
3808 printStringBreaks(ustr, forward, count);
3809 errln("happy boundary test failed: expected %d not a boundary",
3810 j);
3811 return;
3812 }
3813 }
3814 }
3815 if (!bi->isBoundary(i)) {
3816 printStringBreaks(ustr, forward, count);
3817 errln("happy boundary test failed: expected %d a boundary",
3818 i);
3819 return;
3820 }
3821 prev = i;
3822 }
3823 }
3824 delete bi;
3825 }
3826
3827 void RBBITest::TestLineBreaks(void)
3828 {
3829 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3830 Locale locale("en");
3831 UErrorCode status = U_ZERO_ERROR;
3832 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3833 const int32_t STRSIZE = 50;
3834 UChar str[STRSIZE];
3835 static const char *strlist[] =
3836 {
3837 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3838 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3839 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3840 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3841 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3842 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3843 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3844 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3845 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3846 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3847 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3848 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3849 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3850 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3851 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3852 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3853 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3854 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3855 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3856 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3857 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3858 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3859 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3860 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3861 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3862 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3863 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3864 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3865 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3866 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3867 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3868 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3869 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3870 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3871 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3872 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3873 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3874 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3875 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3876 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3877 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3878 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3879 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3880 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3881 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3882 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3883 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3884 };
3885 int loop;
3886 TEST_ASSERT_SUCCESS(status);
3887 if (U_FAILURE(status)) {
3888 return;
3889 }
3890 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3891 // printf("looping %d\n", loop);
3892 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3893 if (t >= STRSIZE) {
3894 TEST_ASSERT(FALSE);
3895 continue;
3896 }
3897
3898
3899 UnicodeString ustr(str);
3900 RBBILineMonkey monkey;
3901 if (U_FAILURE(monkey.deferredStatus)) {
3902 continue;
3903 }
3904
3905 const int EXPECTEDSIZE = 50;
3906 int expected[EXPECTEDSIZE];
3907 int expectedcount = 0;
3908
3909 monkey.setText(ustr);
3910 int i;
3911 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3912 if (expectedcount >= EXPECTEDSIZE) {
3913 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3914 return;
3915 }
3916 expected[expectedcount ++] = i;
3917 }
3918
3919 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3920 }
3921 delete bi;
3922 #endif
3923 }
3924
3925 void RBBITest::TestSentBreaks(void)
3926 {
3927 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3928 Locale locale("en");
3929 UErrorCode status = U_ZERO_ERROR;
3930 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3931 UChar str[200];
3932 static const char *strlist[] =
3933 {
3934 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3935 "This\n",
3936 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3937 "\"Sentence ending with a quote.\" Bye.",
3938 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3939 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3940 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3941 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3942 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3943 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3944 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3945 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3946 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3947 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3948 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3949 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3950 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3951 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3952 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3953 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3954 };
3955 int loop;
3956 if (U_FAILURE(status)) {
3957 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3958 return;
3959 }
3960 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3961 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3962 UnicodeString ustr(str);
3963
3964 RBBISentMonkey monkey;
3965 if (U_FAILURE(monkey.deferredStatus)) {
3966 continue;
3967 }
3968
3969 const int EXPECTEDSIZE = 50;
3970 int expected[EXPECTEDSIZE];
3971 int expectedcount = 0;
3972
3973 monkey.setText(ustr);
3974 int i;
3975 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3976 if (expectedcount >= EXPECTEDSIZE) {
3977 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3978 return;
3979 }
3980 expected[expectedcount ++] = i;
3981 }
3982
3983 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3984 }
3985 delete bi;
3986 #endif
3987 }
3988
3989 void RBBITest::TestMonkey(char *params) {
3990 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3991
3992 UErrorCode status = U_ZERO_ERROR;
3993 int32_t loopCount = 500;
3994 int32_t seed = 1;
3995 UnicodeString breakType = "all";
3996 Locale locale("en");
3997 UBool useUText = FALSE;
3998
3999 if (quick == FALSE) {
4000 loopCount = 10000;
4001 }
4002
4003 if (params) {
4004 UnicodeString p(params);
4005 loopCount = getIntParam("loop", p, loopCount);
4006 seed = getIntParam("seed", p, seed);
4007
4008 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4009 if (m.find()) {
4010 breakType = m.group(1, status);
4011 m.reset();
4012 p = m.replaceFirst("", status);
4013 }
4014
4015 RegexMatcher u(" *utext", p, 0, status);
4016 if (u.find()) {
4017 useUText = TRUE;
4018 u.reset();
4019 p = u.replaceFirst("", status);
4020 }
4021
4022
4023 // m.reset(p);
4024 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4025 // Each option is stripped out of the option string as it is processed.
4026 // All options have been checked. The option string should have been completely emptied..
4027 char buf[100];
4028 p.extract(buf, sizeof(buf), NULL, status);
4029 buf[sizeof(buf)-1] = 0;
4030 errln("Unrecognized or extra parameter: %s\n", buf);
4031 return;
4032 }
4033
4034 }
4035
4036 if (breakType == "char" || breakType == "all") {
4037 RBBICharMonkey m;
4038 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
4039 if (U_SUCCESS(status)) {
4040 RunMonkey(bi, m, "char", seed, loopCount, useUText);
4041 if (breakType == "all" && useUText==FALSE) {
4042 // Also run a quick test with UText when "all" is specified
4043 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4044 }
4045 }
4046 else {
4047 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4048 }
4049 delete bi;
4050 }
4051
4052 if (breakType == "word" || breakType == "all") {
4053 logln("Word Break Monkey Test");
4054 RBBIWordMonkey m;
4055 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4056 if (U_SUCCESS(status)) {
4057 RunMonkey(bi, m, "word", seed, loopCount, useUText);
4058 }
4059 else {
4060 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4061 }
4062 delete bi;
4063 }
4064
4065 if (breakType == "line" || breakType == "all") {
4066 logln("Line Break Monkey Test");
4067 RBBILineMonkey m;
4068 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4069 if (loopCount >= 10) {
4070 loopCount = loopCount / 5; // Line break runs slower than the others.
4071 }
4072 if (U_SUCCESS(status)) {
4073 RunMonkey(bi, m, "line", seed, loopCount, useUText);
4074 }
4075 else {
4076 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4077 }
4078 delete bi;
4079 }
4080
4081 if (breakType == "sent" || breakType == "all" ) {
4082 logln("Sentence Break Monkey Test");
4083 RBBISentMonkey m;
4084 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4085 if (loopCount >= 10) {
4086 loopCount = loopCount / 10; // Sentence runs slower than the other break types
4087 }
4088 if (U_SUCCESS(status)) {
4089 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4090 }
4091 else {
4092 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4093 }
4094 delete bi;
4095 }
4096
4097 #endif
4098 }
4099
4100 //
4101 // Run a RBBI monkey test. Common routine, for all break iterator types.
4102 // Parameters:
4103 // bi - the break iterator to use
4104 // mk - MonkeyKind, abstraction for obtaining expected results
4105 // name - Name of test (char, word, etc.) for use in error messages
4106 // seed - Seed for starting random number generator (parameter from user)
4107 // numIterations
4108 //
4109 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
4110 int32_t numIterations, UBool useUText) {
4111
4112 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4113
4114 const int32_t TESTSTRINGLEN = 500;
4115 UnicodeString testText;
4116 int32_t numCharClasses;
4117 UVector *chClasses;
4118 int expected[TESTSTRINGLEN*2 + 1];
4119 int expectedCount = 0;
4120 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4121 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4122 char reverseBreaks[TESTSTRINGLEN*2+1];
4123 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4124 char followingBreaks[TESTSTRINGLEN*2+1];
4125 char precedingBreaks[TESTSTRINGLEN*2+1];
4126 int i;
4127 int loopCount = 0;
4128
4129 m_seed = seed;
4130
4131 numCharClasses = mk.charClasses()->size();
4132 chClasses = mk.charClasses();
4133
4134 // Check for errors that occured during the construction of the MonkeyKind object.
4135 // Can't report them where they occured because errln() is a method coming from intlTest,
4136 // and is not visible outside of RBBITest :-(
4137 if (U_FAILURE(mk.deferredStatus)) {
4138 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4139 return;
4140 }
4141
4142 // Verify that the character classes all have at least one member.
4143 for (i=0; i<numCharClasses; i++) {
4144 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4145 if (s == NULL || s->size() == 0) {
4146 errln("Character Class #%d is null or of zero size.", i);
4147 return;
4148 }
4149 }
4150
4151 while (loopCount < numIterations || numIterations == -1) {
4152 if (numIterations == -1 && loopCount % 10 == 0) {
4153 // If test is running in an infinite loop, display a periodic tic so
4154 // we can tell that it is making progress.
4155 fprintf(stderr, ".");
4156 }
4157 // Save current random number seed, so that we can recreate the random numbers
4158 // for this loop iteration in event of an error.
4159 seed = m_seed;
4160
4161 // Populate a test string with data.
4162 testText.truncate(0);
4163 for (i=0; i<TESTSTRINGLEN; i++) {
4164 int32_t aClassNum = m_rand() % numCharClasses;
4165 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4166 int32_t charIdx = m_rand() % classSet->size();
4167 UChar32 c = classSet->charAt(charIdx);
4168 if (c < 0) { // TODO: deal with sets containing strings.
4169 errln("c < 0");
4170 break;
4171 }
4172 testText.append(c);
4173 }
4174
4175 // Calculate the expected results for this test string.
4176 mk.setText(testText);
4177 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4178 expectedBreaks[0] = 1;
4179 int32_t breakPos = 0;
4180 expectedCount = 0;
4181 for (;;) {
4182 breakPos = mk.next(breakPos);
4183 if (breakPos == -1) {
4184 break;
4185 }
4186 if (breakPos > testText.length()) {
4187 errln("breakPos > testText.length()");
4188 }
4189 expectedBreaks[breakPos] = 1;
4190 U_ASSERT(expectedCount<testText.length());
4191 expected[expectedCount ++] = breakPos;
4192 (void)expected; // Set but not used warning.
4193 // TODO (andy): check it out.
4194 }
4195
4196 // Find the break positions using forward iteration
4197 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4198 if (useUText) {
4199 UErrorCode status = U_ZERO_ERROR;
4200 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4201 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4202 bi->setText(testUText, status);
4203 TEST_ASSERT_SUCCESS(status);
4204 utext_close(testUText); // The break iterator does a shallow clone of the UText
4205 // This UText can be closed immediately, so long as the
4206 // testText string continues to exist.
4207 } else {
4208 bi->setText(testText);
4209 }
4210
4211 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4212 if (i < 0 || i > testText.length()) {
4213 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4214 break;
4215 }
4216 forwardBreaks[i] = 1;
4217 }
4218
4219 // Find the break positions using reverse iteration
4220 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4221 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4222 if (i < 0 || i > testText.length()) {
4223 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4224 break;
4225 }
4226 reverseBreaks[i] = 1;
4227 }
4228
4229 // Find the break positions using isBoundary() tests.
4230 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4231 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4232 for (i=0; i<=testText.length(); i++) {
4233 isBoundaryBreaks[i] = bi->isBoundary(i);
4234 }
4235
4236
4237 // Find the break positions using the following() function.
4238 // printf(".");
4239 memset(followingBreaks, 0, sizeof(followingBreaks));
4240 int32_t lastBreakPos = 0;
4241 followingBreaks[0] = 1;
4242 for (i=0; i<testText.length(); i++) {
4243 breakPos = bi->following(i);
4244 if (breakPos <= i ||
4245 breakPos < lastBreakPos ||
4246 breakPos > testText.length() ||
4247 (breakPos > lastBreakPos && lastBreakPos > i)) {
4248 UChar32 brkChar = testText.char32At(lastBreakPos);
4249 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4250 errln("%s break monkey test: "
4251 "Out of range value returned by BreakIterator::following().\n"
4252 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4253 name, seed, i, breakPos, lastBreakPos);
4254 }
4255 break;
4256 }
4257 followingBreaks[breakPos] = 1;
4258 lastBreakPos = breakPos;
4259 }
4260
4261 // Find the break positions using the preceding() function.
4262 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4263 lastBreakPos = testText.length();
4264 precedingBreaks[testText.length()] = 1;
4265 for (i=testText.length(); i>0; i--) {
4266 breakPos = bi->preceding(i);
4267 if (breakPos >= i ||
4268 breakPos > lastBreakPos ||
4269 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4270 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4271 UChar32 brkChar = testText.char32At(breakPos);
4272 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4273 errln("%s break monkey test: "
4274 "Out of range value returned by BreakIterator::preceding().\n"
4275 "index=%d; prev returned %d; lastBreak=%d" ,
4276 name, i, breakPos, lastBreakPos);
4277 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4278 precedingBreaks[i] = 2; // Forces an error.
4279 }
4280 }
4281 } else {
4282 if (breakPos >= 0) {
4283 precedingBreaks[breakPos] = 1;
4284 }
4285 lastBreakPos = breakPos;
4286 }
4287 }
4288
4289 // Compare the expected and actual results.
4290 for (i=0; i<=testText.length(); i++) {
4291 const char *errorType = NULL;
4292 if (forwardBreaks[i] != expectedBreaks[i]) {
4293 errorType = "next()";
4294 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4295 errorType = "previous()";
4296 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4297 errorType = "isBoundary()";
4298 } else if (followingBreaks[i] != expectedBreaks[i]) {
4299 errorType = "following()";
4300 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4301 errorType = "preceding()";
4302 }
4303
4304
4305 if (errorType != NULL) {
4306 // Format a range of the test text that includes the failure as
4307 // a data item that can be included in the rbbi test data file.
4308
4309 // Start of the range is the last point where expected and actual results
4310 // both agreed that there was a break position.
4311 int startContext = i;
4312 int32_t count = 0;
4313 for (;;) {
4314 if (startContext==0) { break; }
4315 startContext --;
4316 if (expectedBreaks[startContext] != 0) {
4317 if (count == 2) break;
4318 count ++;
4319 }
4320 }
4321
4322 // End of range is two expected breaks past the start position.
4323 int endContext = i + 1;
4324 int ci;
4325 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4326 for (;;) {
4327 if (endContext >= testText.length()) {break;}
4328 if (expectedBreaks[endContext-1] != 0) {
4329 if (count == 0) break;
4330 count --;
4331 }
4332 endContext ++;
4333 }
4334 }
4335
4336 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4337 UnicodeString errorText = "<data>";
4338 /***if (strcmp(errorType, "next()") == 0) {
4339 startContext = 0;
4340 endContext = testText.length();
4341
4342 printStringBreaks(testText, expected, expectedCount);
4343 }***/
4344
4345 for (ci=startContext; ci<endContext;) {
4346 UnicodeString hexChars("0123456789abcdef");
4347 UChar32 c;
4348 int bn;
4349 c = testText.char32At(ci);
4350 if (ci == i) {
4351 // This is the location of the error.
4352 errorText.append("<?>");
4353 } else if (expectedBreaks[ci] != 0) {
4354 // This a non-error expected break position.
4355 errorText.append("\\");
4356 }
4357 if (c < 0x10000) {
4358 errorText.append("\\u");
4359 for (bn=12; bn>=0; bn-=4) {
4360 errorText.append(hexChars.charAt((c>>bn)&0xf));
4361 }
4362 } else {
4363 errorText.append("\\U");
4364 for (bn=28; bn>=0; bn-=4) {
4365 errorText.append(hexChars.charAt((c>>bn)&0xf));
4366 }
4367 }
4368 ci = testText.moveIndex32(ci, 1);
4369 }
4370 errorText.append("\\");
4371 errorText.append("</data>\n");
4372
4373 // Output the error
4374 char charErrorTxt[500];
4375 UErrorCode status = U_ZERO_ERROR;
4376 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4377 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4378 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4379
4380 UChar32 brkChar = testText.char32At(i);
4381 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4382 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4383 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4384 errorType, seed, i, charErrorTxt);
4385 }
4386 break;
4387 }
4388 }
4389
4390 loopCount++;
4391 }
4392 #endif
4393 }
4394
4395
4396 // Bug 5532. UTF-8 based UText fails in dictionary code.
4397 // This test checks the initial patch,
4398 // which is to just keep it from crashing. Correct word boundaries
4399 // await a proper fix to the dictionary code.
4400 //
4401 void RBBITest::TestBug5532(void) {
4402 // Text includes a mixture of Thai and Latin.
4403 const unsigned char utf8Data[] = {
4404 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4405 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4406 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4407 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4408 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4409 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4410 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4411 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4412 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4413 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4414 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4415
4416 UErrorCode status = U_ZERO_ERROR;
4417 UText utext=UTEXT_INITIALIZER;
4418 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4419 TEST_ASSERT_SUCCESS(status);
4420
4421 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4422 TEST_ASSERT_SUCCESS(status);
4423 if (U_SUCCESS(status)) {
4424 bi->setText(&utext, status);
4425 TEST_ASSERT_SUCCESS(status);
4426
4427 int32_t breakCount = 0;
4428 int32_t previousBreak = -1;
4429 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4430 // For now, just make sure that the break iterator doesn't hang.
4431 TEST_ASSERT(previousBreak < bi->current());
4432 previousBreak = bi->current();
4433 }
4434 TEST_ASSERT(breakCount > 0);
4435 }
4436 delete bi;
4437 utext_close(&utext);
4438 }
4439
4440
4441 void RBBITest::TestBug9983(void) {
4442 UnicodeString text = UnicodeString("\\u002A" // * Other
4443 "\\uFF65" // Other
4444 "\\u309C" // Katakana
4445 "\\uFF9F" // Extend
4446 "\\uFF65" // Other
4447 "\\u0020" // Other
4448 "\\u0000").unescape();
4449
4450 UErrorCode status = U_ZERO_ERROR;
4451 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4452 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4453 TEST_ASSERT_SUCCESS(status);
4454 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4455 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4456 TEST_ASSERT_SUCCESS(status);
4457 if (U_FAILURE(status)) {
4458 return;
4459 }
4460 int32_t offset, rstatus, iterationCount;
4461
4462 brkiter->setText(text);
4463 brkiter->last();
4464 iterationCount = 0;
4465 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4466 iterationCount++;
4467 rstatus = brkiter->getRuleStatus();
4468 (void)rstatus; // Suppress set but not used warning.
4469 if (iterationCount >= 10) {
4470 break;
4471 }
4472 }
4473 TEST_ASSERT(iterationCount == 6);
4474
4475 brkiterPOSIX->setText(text);
4476 brkiterPOSIX->last();
4477 iterationCount = 0;
4478 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4479 iterationCount++;
4480 rstatus = brkiterPOSIX->getRuleStatus();
4481 (void)rstatus; // Suppress set but not used warning.
4482 if (iterationCount >= 10) {
4483 break;
4484 }
4485 }
4486 TEST_ASSERT(iterationCount == 6);
4487 }
4488
4489
4490 //
4491 // TestDebug - A place-holder test for debugging purposes.
4492 // For putting in fragments of other tests that can be invoked
4493 // for tracing without a lot of unwanted extra stuff happening.
4494 //
4495 void RBBITest::TestDebug(void) {
4496 #if 0
4497 UErrorCode status = U_ZERO_ERROR;
4498 int pos = 0;
4499 int ruleStatus = 0;
4500
4501 RuleBasedBreakIterator* bi =
4502 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4503 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4504 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4505 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4506 // UnicodeString s("Aaa. Bcd");
4507 s = s.unescape();
4508 bi->setText(s);
4509 UBool r = bi->isBoundary(8);
4510 printf("%s", r?"true":"false");
4511 return;
4512 pos = bi->last();
4513 do {
4514 // ruleStatus = bi->getRuleStatus();
4515 printf("%d\t%d\n", pos, ruleStatus);
4516 pos = bi->previous();
4517 } while (pos != BreakIterator::DONE);
4518 #endif
4519 }
4520
4521 void RBBITest::TestProperties() {
4522 UErrorCode errorCode = U_ZERO_ERROR;
4523 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4524 if (!prependSet.isEmpty()) {
4525 errln(
4526 "[:GCB=Prepend:] is not empty any more. "
4527 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4528 "change this test to the opposite condition.");
4529 }
4530 }
4531
4532 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */