]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/rbbitst.cpp
ICU-57166.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / rbbitst.cpp
1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2016, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
11
12 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_BREAK_ITERATION
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18
19 #include "unicode/brkiter.h"
20 #include "unicode/localpointer.h"
21 #include "unicode/numfmt.h"
22 #include "unicode/rbbi.h"
23 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
24 #include "unicode/regex.h"
25 #endif
26 #include "unicode/schriter.h"
27 #include "unicode/uchar.h"
28 #include "unicode/utf16.h"
29 #include "unicode/ucnv.h"
30 #include "unicode/uniset.h"
31 #include "unicode/uscript.h"
32 #include "unicode/ustring.h"
33 #include "unicode/utext.h"
34
35 #include "charstr.h"
36 #include "cmemory.h"
37 #include "intltest.h"
38 #include "rbbitst.h"
39 #include "utypeinfo.h" // for 'typeid' to work
40 #include "uvector.h"
41 #include "uvectr32.h"
42
43 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
44 #include "unicode/filteredbrk.h"
45 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
46
47 #define TEST_ASSERT(x) {if (!(x)) { \
48 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
49
50 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
51 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
52
53
54 //---------------------------------------------
55 // runIndexedTest
56 //---------------------------------------------
57
58
59 // Note: Before adding new tests to this file, check whether the desired test data can
60 // simply be added to the file testdata/rbbitest.txt. In most cases it can,
61 // it's much less work than writing a new test, diagnostic output in the event of failures
62 // is good, and the test data file is shared with ICU4J, so eventually the test
63 // will run there as well, without additional effort.
64
65 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
66 {
67 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
68
69 switch (index) {
70 #if !UCONFIG_NO_FILE_IO
71 case 0: name = "TestBug4153072";
72 if(exec) TestBug4153072(); break;
73 #else
74 case 0: name = "skip";
75 break;
76 #endif
77
78 case 1: name = "skip";
79 break;
80 case 2: name = "TestStatusReturn";
81 if(exec) TestStatusReturn(); break;
82
83 #if !UCONFIG_NO_FILE_IO
84 case 3: name = "TestUnicodeFiles";
85 if(exec) TestUnicodeFiles(); break;
86 case 4: name = "TestEmptyString";
87 if(exec) TestEmptyString(); break;
88 #else
89 case 3: case 4: name = "skip";
90 break;
91 #endif
92
93 case 5: name = "TestGetAvailableLocales";
94 if(exec) TestGetAvailableLocales(); break;
95
96 case 6: name = "TestGetDisplayName";
97 if(exec) TestGetDisplayName(); break;
98
99 #if !UCONFIG_NO_FILE_IO
100 case 7: name = "TestEndBehaviour";
101 if(exec) TestEndBehaviour(); break;
102 case 8: case 9: case 10: name = "skip";
103 break;
104 case 11: name = "TestWordBreaks";
105 if(exec) TestWordBreaks(); break;
106 case 12: name = "TestWordBoundary";
107 if(exec) TestWordBoundary(); break;
108 case 13: name = "TestLineBreaks";
109 if(exec) TestLineBreaks(); break;
110 case 14: name = "TestSentBreaks";
111 if(exec) TestSentBreaks(); break;
112 case 15: name = "TestExtended";
113 if(exec) TestExtended(); break;
114 #else
115 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
116 break;
117 #endif
118
119 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
120 case 16:
121 name = "TestMonkey"; if(exec) TestMonkey(params); break;
122 #else
123 case 16:
124 name = "skip"; break;
125 #endif
126
127 #if !UCONFIG_NO_FILE_IO
128 case 17: name = "TestBug3818";
129 if(exec) TestBug3818(); break;
130 #else
131 case 17: name = "skip";
132 break;
133 #endif
134
135 case 18: name = "skip";
136 break;
137 case 19: name = "TestDebug";
138 if(exec) TestDebug(); break;
139 case 20: name = "skip";
140 break;
141
142 #if !UCONFIG_NO_FILE_IO
143 case 21: name = "TestBug5775";
144 if (exec) TestBug5775(); break;
145 #else
146 case 21: name = "skip";
147 break;
148 #endif
149
150 case 22: name = "TestBug9983";
151 if (exec) TestBug9983(); break;
152 case 23: name = "TestDictRules";
153 if (exec) TestDictRules(); break;
154 case 24: name = "TestBug5532";
155 if (exec) TestBug5532(); break;
156 default: name = ""; break; //needed to end loop
157 }
158 }
159
160
161 //---------------------------------------------------------------------------
162 //
163 // class BITestData Holds a set of Break iterator test data and results
164 // Includes
165 // - the string data to be broken
166 // - a vector of the expected break positions.
167 // - a vector of source line numbers for the data,
168 // (to help see where errors occurred.)
169 // - The expected break tag values.
170 // - Vectors of actual break positions and tag values.
171 // - Functions for comparing actual with expected and
172 // reporting errors.
173 //
174 //----------------------------------------------------------------------------
175 class BITestData {
176 public:
177 UnicodeString fDataToBreak;
178 UVector fExpectedBreakPositions;
179 UVector fExpectedTags;
180 UVector fLineNum;
181 UVector fActualBreakPositions; // Test Results.
182 UVector fActualTags;
183
184 BITestData(UErrorCode &status);
185 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
186 void checkResults(const char *heading, RBBITest *test);
187 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
188 void clearResults();
189 };
190
191 //
192 // Constructor.
193 //
194 BITestData::BITestData(UErrorCode &status)
195 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
196 fActualTags(status)
197 {
198 }
199
200 //
201 // addDataChunk. Add a section (non-breaking) piece of data to the test data.
202 // The macro form collects the line number, which is helpful
203 // when tracking down failures.
204 //
205 // A null data item is inserted at the start of each test's data
206 // to put the starting zero into the data list. The position saved for
207 // each non-null item is its ending position.
208 //
209 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
210 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
211 if (U_FAILURE(status)) {return;}
212 if (data != NULL) {
213 fDataToBreak.append(CharsToUnicodeString(data));
214 }
215 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
216 fExpectedTags.addElement(tag, status);
217 fLineNum.addElement(lineNum, status);
218 }
219
220
221 //
222 // checkResults. Compare the actual and expected break positions, report any differences.
223 //
224 void BITestData::checkResults(const char *heading, RBBITest *test) {
225 int32_t expectedIndex = 0;
226 int32_t actualIndex = 0;
227
228 for (;;) {
229 // If we've run through both the expected and actual results vectors, we're done.
230 // break out of the loop.
231 if (expectedIndex >= fExpectedBreakPositions.size() &&
232 actualIndex >= fActualBreakPositions.size()) {
233 break;
234 }
235
236
237 if (expectedIndex >= fExpectedBreakPositions.size()) {
238 err(heading, test, expectedIndex-1, actualIndex);
239 actualIndex++;
240 continue;
241 }
242
243 if (actualIndex >= fActualBreakPositions.size()) {
244 err(heading, test, expectedIndex, actualIndex-1);
245 expectedIndex++;
246 continue;
247 }
248
249 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
250 err(heading, test, expectedIndex, actualIndex);
251 // Try to resync the positions of the indices, to avoid a rash of spurious erros.
252 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
253 actualIndex++;
254 } else {
255 expectedIndex++;
256 }
257 continue;
258 }
259
260 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
261 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
262 heading, fLineNum.elementAt(expectedIndex),
263 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
264 }
265
266 actualIndex++;
267 expectedIndex++;
268 }
269 }
270
271 //
272 // err - An error was found. Report it, along with information about where the
273 // incorrectly broken test data appeared in the source file.
274 //
275 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
276 {
277 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
278 int32_t actual = fActualBreakPositions.elementAti(actualIdx);
279 int32_t o = 0;
280 int32_t line = fLineNum.elementAti(expectedIdx);
281 if (expectedIdx > 0) {
282 // The line numbers are off by one because a premature break occurs somewhere
283 // within the previous item, rather than at the start of the current (expected) item.
284 // We want to report the offset of the unexpected break from the start of
285 // this previous item.
286 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
287 }
288 if (actual < expected) {
289 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);
290 } else {
291 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);
292 }
293 }
294
295
296 void BITestData::clearResults() {
297 fActualBreakPositions.removeAllElements();
298 fActualTags.removeAllElements();
299 }
300
301
302 //--------------------------------------------------------------------------------------
303 //
304 // RBBITest constructor and destructor
305 //
306 //--------------------------------------------------------------------------------------
307
308 RBBITest::RBBITest() {
309 }
310
311
312 RBBITest::~RBBITest() {
313 }
314
315 //-----------------------------------------------------------------------------------
316 //
317 // Test for status {tag} return value from break rules.
318 // TODO: a more thorough test.
319 //
320 //-----------------------------------------------------------------------------------
321 void RBBITest::TestStatusReturn() {
322 UnicodeString rulesString1("$Letters = [:L:];\n"
323 "$Numbers = [:N:];\n"
324 "$Letters+{1};\n"
325 "$Numbers+{2};\n"
326 "Help\\ /me\\!{4};\n"
327 "[^$Letters $Numbers];\n"
328 "!.*;\n", -1, US_INV);
329 UnicodeString testString1 = "abc123..abc Help me Help me!";
330 // 01234567890123456789012345678
331 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
332 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
333
334 UErrorCode status=U_ZERO_ERROR;
335 UParseError parseError;
336
337 LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
338 if(U_FAILURE(status)) {
339 dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__, u_errorName(status));
340 return;
341 }
342 int32_t pos;
343 int32_t i = 0;
344 bi->setText(testString1);
345 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
346 if (pos != bounds1[i]) {
347 errln("%s:%d expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
348 break;
349 }
350
351 int tag = bi->getRuleStatus();
352 if (tag != brkStatus[i]) {
353 errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
354 break;
355 }
356 i++;
357 }
358 }
359
360
361 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
362 UErrorCode status = U_ZERO_ERROR;
363 char name[100];
364 printf("code alpha extend alphanum type word sent line name\n");
365 int nextExpectedIndex = 0;
366 utext_setNativeIndex(tstr, 0);
367 for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
368 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
369 printf("------------------------------------------------ %d\n", j);
370 ++nextExpectedIndex;
371 }
372
373 UChar32 c = utext_next32(tstr);
374 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
375 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
376 u_isUAlphabetic(c),
377 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
378 u_isalnum(c),
379 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
380 u_charType(c),
381 U_SHORT_PROPERTY_NAME),
382 u_getPropertyValueName(UCHAR_WORD_BREAK,
383 u_getIntPropertyValue(c,
384 UCHAR_WORD_BREAK),
385 U_SHORT_PROPERTY_NAME),
386 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
387 u_getIntPropertyValue(c,
388 UCHAR_SENTENCE_BREAK),
389 U_SHORT_PROPERTY_NAME),
390 u_getPropertyValueName(UCHAR_LINE_BREAK,
391 u_getIntPropertyValue(c,
392 UCHAR_LINE_BREAK),
393 U_SHORT_PROPERTY_NAME),
394 name);
395 }
396 }
397
398
399 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
400 UErrorCode status = U_ZERO_ERROR;
401 UText *tstr = NULL;
402 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
403 if (U_FAILURE(status)) {
404 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
405 return;
406 }
407 printStringBreaks(tstr, expected, expectedCount);
408 utext_close(tstr);
409 }
410
411
412 void RBBITest::TestBug3818() {
413 UErrorCode status = U_ZERO_ERROR;
414
415 // Four Thai words...
416 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
417 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
418 UnicodeString thaiStr(thaiWordData);
419
420 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
421 if (U_FAILURE(status) || bi == NULL) {
422 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
423 return;
424 }
425 bi->setText(thaiStr);
426
427 int32_t startOfSecondWord = bi->following(1);
428 if (startOfSecondWord != 4) {
429 errln("Fail at file %s, line %d expected start of word at 4, got %d",
430 __FILE__, __LINE__, startOfSecondWord);
431 }
432 startOfSecondWord = bi->following(0);
433 if (startOfSecondWord != 4) {
434 errln("Fail at file %s, line %d expected start of word at 4, got %d",
435 __FILE__, __LINE__, startOfSecondWord);
436 }
437 delete bi;
438 }
439
440 //----------------------------------------------------------------------------
441 //
442 // generalIteratorTest Given a break iterator and a set of test data,
443 // Run the tests and report the results.
444 //
445 //----------------------------------------------------------------------------
446 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
447 {
448
449 bi.setText(td.fDataToBreak);
450
451 testFirstAndNext(bi, td);
452
453 testLastAndPrevious(bi, td);
454
455 testFollowing(bi, td);
456 testPreceding(bi, td);
457 testIsBoundary(bi, td);
458 doMultipleSelectionTest(bi, td);
459 }
460
461
462 //
463 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
464 // kind of loop.
465 //
466 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
467 {
468 UErrorCode status = U_ZERO_ERROR;
469 int32_t p;
470 int32_t lastP = -1;
471 int32_t tag;
472
473 logln("Test first and next");
474 bi.setText(td.fDataToBreak);
475 td.clearResults();
476
477 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
478 td.fActualBreakPositions.addElement(p, status); // Save result.
479 tag = bi.getRuleStatus();
480 td.fActualTags.addElement(tag, status);
481 if (p <= lastP) {
482 // If the iterator is not making forward progress, stop.
483 // No need to raise an error here, it'll be detected in the normal check of results.
484 break;
485 }
486 lastP = p;
487 }
488 td.checkResults("testFirstAndNext", this);
489 }
490
491
492 //
493 // TestLastAndPrevious. Run the iterator backwards, starting with last().
494 //
495 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
496 {
497 UErrorCode status = U_ZERO_ERROR;
498 int32_t p;
499 int32_t lastP = 0x7ffffffe;
500 int32_t tag;
501
502 logln("Test last and previous");
503 bi.setText(td.fDataToBreak);
504 td.clearResults();
505
506 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
507 // Save break position. Insert it at start of vector of results, shoving
508 // already-saved results further towards the end.
509 td.fActualBreakPositions.insertElementAt(p, 0, status);
510 // bi.previous(); // TODO: Why does this fix things up????
511 // bi.next();
512 tag = bi.getRuleStatus();
513 td.fActualTags.insertElementAt(tag, 0, status);
514 if (p >= lastP) {
515 // If the iterator is not making progress, stop.
516 // No need to raise an error here, it'll be detected in the normal check of results.
517 break;
518 }
519 lastP = p;
520 }
521 td.checkResults("testLastAndPrevious", this);
522 }
523
524
525 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
526 {
527 UErrorCode status = U_ZERO_ERROR;
528 int32_t p;
529 int32_t tag;
530 int32_t lastP = -2; // A value that will never be returned as a break position.
531 // cannot be -1; that is returned for DONE.
532 int i;
533
534 logln("testFollowing():");
535 bi.setText(td.fDataToBreak);
536 td.clearResults();
537
538 // Save the starting point, since we won't get that out of following.
539 p = bi.first();
540 td.fActualBreakPositions.addElement(p, status); // Save result.
541 tag = bi.getRuleStatus();
542 td.fActualTags.addElement(tag, status);
543
544 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
545 p = bi.following(i);
546 if (p != lastP) {
547 if (p == RuleBasedBreakIterator::DONE) {
548 break;
549 }
550 // We've reached a new break position. Save it.
551 td.fActualBreakPositions.addElement(p, status); // Save result.
552 tag = bi.getRuleStatus();
553 td.fActualTags.addElement(tag, status);
554 lastP = p;
555 }
556 }
557 // The loop normally exits by means of the break in the middle.
558 // Make sure that the index was at the correct position for the break iterator to have
559 // returned DONE.
560 if (i != td.fDataToBreak.length()) {
561 errln("testFollowing(): iterator returned DONE prematurely.");
562 }
563
564 // Full check of all results.
565 td.checkResults("testFollowing", this);
566 }
567
568
569
570 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
571 UErrorCode status = U_ZERO_ERROR;
572 int32_t p;
573 int32_t tag;
574 int32_t lastP = 0x7ffffffe;
575 int i;
576
577 logln("testPreceding():");
578 bi.setText(td.fDataToBreak);
579 td.clearResults();
580
581 p = bi.last();
582 td.fActualBreakPositions.addElement(p, status);
583 tag = bi.getRuleStatus();
584 td.fActualTags.addElement(tag, status);
585
586 for (i = td.fDataToBreak.length(); i>=-1; i--) {
587 p = bi.preceding(i);
588 if (p != lastP) {
589 if (p == RuleBasedBreakIterator::DONE) {
590 break;
591 }
592 // We've reached a new break position. Save it.
593 td.fActualBreakPositions.insertElementAt(p, 0, status);
594 lastP = p;
595 tag = bi.getRuleStatus();
596 td.fActualTags.insertElementAt(tag, 0, status);
597 }
598 }
599 // The loop normally exits by means of the break in the middle.
600 // Make sure that the index was at the correct position for the break iterator to have
601 // returned DONE.
602 if (i != 0) {
603 errln("testPreceding(): iterator returned DONE prematurely.");
604 }
605
606 // Full check of all results.
607 td.checkResults("testPreceding", this);
608 }
609
610
611
612 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
613 UErrorCode status = U_ZERO_ERROR;
614 int i;
615 int32_t tag;
616
617 logln("testIsBoundary():");
618 bi.setText(td.fDataToBreak);
619 td.clearResults();
620
621 for (i = 0; i <= td.fDataToBreak.length(); i++) {
622 if (bi.isBoundary(i)) {
623 td.fActualBreakPositions.addElement(i, status); // Save result.
624 tag = bi.getRuleStatus();
625 td.fActualTags.addElement(tag, status);
626 }
627 }
628 td.checkResults("testIsBoundary: ", this);
629 }
630
631
632
633 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
634 {
635 iterator.setText(td.fDataToBreak);
636
637 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
638 int32_t offset = iterator.first();
639 int32_t testOffset;
640 int32_t count = 0;
641
642 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
643
644 if (*testIterator != iterator)
645 errln("clone() or operator!= failed: two clones compared unequal");
646
647 do {
648 testOffset = testIterator->first();
649 testOffset = testIterator->next(count);
650 if (offset != testOffset)
651 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
652
653 if (offset != RuleBasedBreakIterator::DONE) {
654 count++;
655 offset = iterator.next();
656
657 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
658 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
659 if (count > 10000 || offset == -1) {
660 errln("operator== failed too many times. Stopping test.");
661 if (offset == -1) {
662 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
663 }
664 return;
665 }
666 }
667 }
668 } while (offset != RuleBasedBreakIterator::DONE);
669
670 // now do it backwards...
671 offset = iterator.last();
672 count = 0;
673
674 do {
675 testOffset = testIterator->last();
676 testOffset = testIterator->next(count); // next() with a negative arg is same as previous
677 if (offset != testOffset)
678 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
679
680 if (offset != RuleBasedBreakIterator::DONE) {
681 count--;
682 offset = iterator.previous();
683 }
684 } while (offset != RuleBasedBreakIterator::DONE);
685
686 delete testIterator;
687 }
688
689
690 //---------------------------------------------
691 //
692 // other tests
693 //
694 //---------------------------------------------
695 void RBBITest::TestEmptyString()
696 {
697 UnicodeString text = "";
698 UErrorCode status = U_ZERO_ERROR;
699
700 BITestData x(status);
701 ADD_DATACHUNK(x, "", 0, status); // Break at start of data
702 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
703 if (U_FAILURE(status))
704 {
705 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
706 return;
707 }
708 generalIteratorTest(*bi, x);
709 delete bi;
710 }
711
712 void RBBITest::TestGetAvailableLocales()
713 {
714 int32_t locCount = 0;
715 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
716
717 if (locCount == 0)
718 dataerrln("getAvailableLocales() returned an empty list!");
719 // Just make sure that it's returning good memory.
720 int32_t i;
721 for (i = 0; i < locCount; ++i) {
722 logln(locList[i].getName());
723 }
724 }
725
726 //Testing the BreakIterator::getDisplayName() function
727 void RBBITest::TestGetDisplayName()
728 {
729 UnicodeString result;
730
731 BreakIterator::getDisplayName(Locale::getUS(), result);
732 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
733 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
734 + result);
735
736 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
737 if (result != "French (France)")
738 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
739 + result);
740 }
741 /**
742 * Test End Behaviour
743 * @bug 4068137
744 */
745 void RBBITest::TestEndBehaviour()
746 {
747 UErrorCode status = U_ZERO_ERROR;
748 UnicodeString testString("boo.");
749 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
750 if (U_FAILURE(status))
751 {
752 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
753 return;
754 }
755 wb->setText(testString);
756
757 if (wb->first() != 0)
758 errln("Didn't get break at beginning of string.");
759 if (wb->next() != 3)
760 errln("Didn't get break before period in \"boo.\"");
761 if (wb->current() != 4 && wb->next() != 4)
762 errln("Didn't get break at end of string.");
763 delete wb;
764 }
765 /*
766 * @bug 4153072
767 */
768 void RBBITest::TestBug4153072() {
769 UErrorCode status = U_ZERO_ERROR;
770 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
771 if (U_FAILURE(status))
772 {
773 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
774 return;
775 }
776 UnicodeString str("...Hello, World!...");
777 int32_t begin = 3;
778 int32_t end = str.length() - 3;
779 UBool onBoundary;
780
781 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
782 iter->adoptText(textIterator);
783 int index;
784 // Note: with the switch to UText, there is no way to restrict the
785 // iteration range to begin at an index other than zero.
786 // String character iterators created with a non-zero bound are
787 // treated by RBBI as being empty.
788 for (index = -1; index < begin + 1; ++index) {
789 onBoundary = iter->isBoundary(index);
790 if (index == 0? !onBoundary : onBoundary) {
791 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
792 " and begin index = " + begin);
793 }
794 }
795 delete iter;
796 }
797
798
799 //
800 // Test for problem reported by Ashok Matoria on 9 July 2007
801 // One.<kSoftHyphen><kSpace>Two.
802 //
803 // Sentence break at start (0) and then on calling next() it breaks at
804 // 'T' of "Two". Now, at this point if I do next() and
805 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
806 //
807 void RBBITest::TestBug5775() {
808 UErrorCode status = U_ZERO_ERROR;
809 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
810 TEST_ASSERT_SUCCESS(status);
811 if (U_FAILURE(status)) {
812 return;
813 }
814 // Check for status first for better handling of no data errors.
815 TEST_ASSERT(bi != NULL);
816 if (bi == NULL) {
817 return;
818 }
819
820 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
821 // 01234 56789
822 s = s.unescape();
823 bi->setText(s);
824 int pos = bi->next();
825 TEST_ASSERT(pos == 6);
826 pos = bi->next();
827 TEST_ASSERT(pos == 10);
828 pos = bi->previous();
829 TEST_ASSERT(pos == 6);
830 delete bi;
831 }
832
833
834
835 //------------------------------------------------------------------------------
836 //
837 // RBBITest::Extended Run RBBI Tests from an external test data file
838 //
839 //------------------------------------------------------------------------------
840
841 struct TestParams {
842 BreakIterator *bi; // Break iterator is set while parsing test source.
843 // Changed out whenever test data changes break type.
844
845 UnicodeString dataToBreak; // Data that is built up while parsing the test.
846 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
847 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
848 UVector32 *srcCol;
849
850 UText *textToBreak; // UText, could be UTF8 or UTF16.
851 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
852 CharString utf8String; // UTF-8 form of text to break.
853
854 TestParams(UErrorCode &status) : dataToBreak() {
855 bi = NULL;
856 expectedBreaks = new UVector32(status);
857 srcLine = new UVector32(status);
858 srcCol = new UVector32(status);
859 textToBreak = NULL;
860 textMap = new UVector32(status);
861 }
862
863 ~TestParams() {
864 delete bi;
865 delete expectedBreaks;
866 delete srcLine;
867 delete srcCol;
868 utext_close(textToBreak);
869 delete textMap;
870 }
871
872 int32_t getSrcLine(int32_t bp);
873 int32_t getExpectedBreak(int32_t bp);
874 int32_t getSrcCol(int32_t bp);
875
876 void setUTF16(UErrorCode &status);
877 void setUTF8(UErrorCode &status);
878 };
879
880 // Append a UnicodeString to a CharString with UTF-8 encoding.
881 // Substitute any invalid chars.
882 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
883 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
884 if (U_FAILURE(status)) {
885 return;
886 }
887 int32_t utf8Length;
888 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
889 src.getBuffer(), src.length(), // UTF-16 data
890 0xfffd, NULL, // Substitution char, number of subs.
891 &status);
892 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
893 return;
894 }
895 status = U_ZERO_ERROR;
896 int32_t capacity;
897 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
898 u_strToUTF8WithSub(buffer, utf8Length, NULL,
899 src.getBuffer(), src.length(),
900 0xfffd, NULL, &status);
901 dest.append(buffer, utf8Length, status);
902 }
903
904
905 void TestParams::setUTF16(UErrorCode &status) {
906 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
907 textMap->removeAllElements();
908 for (int32_t i=0; i<dataToBreak.length(); i++) {
909 if (i == dataToBreak.getChar32Start(i)) {
910 textMap->addElement(i, status);
911 } else {
912 textMap->addElement(-1, status);
913 }
914 }
915 textMap->addElement(dataToBreak.length(), status);
916 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
917 }
918
919
920 void TestParams::setUTF8(UErrorCode &status) {
921 if (U_FAILURE(status)) {
922 return;
923 }
924 utf8String.clear();
925 CharStringAppend(utf8String, dataToBreak, status);
926 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
927 if (U_FAILURE(status)) {
928 return;
929 }
930
931 textMap->removeAllElements();
932 int32_t utf16Index = 0;
933 for (;;) {
934 textMap->addElement(utf16Index, status);
935 UChar32 c32 = utext_current32(textToBreak);
936 if (c32 < 0) {
937 break;
938 }
939 utf16Index += U16_LENGTH(c32);
940 utext_next32(textToBreak);
941 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
942 textMap->addElement(-1, status);
943 }
944 }
945 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
946 }
947
948
949 int32_t TestParams::getSrcLine(int bp) {
950 if (bp >= textMap->size()) {
951 bp = textMap->size() - 1;
952 }
953 int32_t i = 0;
954 for(; bp >= 0 ; --bp) {
955 // Move to a character boundary if we are not on one already.
956 i = textMap->elementAti(bp);
957 if (i >= 0) {
958 break;
959 }
960 }
961 return srcLine->elementAti(i);
962 }
963
964
965 int32_t TestParams::getExpectedBreak(int bp) {
966 if (bp >= textMap->size()) {
967 return 0;
968 }
969 int32_t i = textMap->elementAti(bp);
970 int32_t retVal = 0;
971 if (i >= 0) {
972 retVal = expectedBreaks->elementAti(i);
973 }
974 return retVal;
975 }
976
977
978 int32_t TestParams::getSrcCol(int bp) {
979 if (bp >= textMap->size()) {
980 bp = textMap->size() - 1;
981 }
982 int32_t i = 0;
983 for(; bp >= 0; --bp) {
984 // Move bp to a character boundary if we are not on one already.
985 i = textMap->elementAti(bp);
986 if (i >= 0) {
987 break;
988 }
989 }
990 return srcCol->elementAti(i);
991 }
992
993
994 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
995 int32_t bp;
996 int32_t prevBP;
997 int32_t i;
998
999 TEST_ASSERT_SUCCESS(status);
1000 if (U_FAILURE(status)) {
1001 return;
1002 }
1003
1004 if (t->bi == NULL) {
1005 return;
1006 }
1007
1008 t->bi->setText(t->textToBreak, status);
1009 //
1010 // Run the iterator forward
1011 //
1012 prevBP = -1;
1013 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1014 if (prevBP == bp) {
1015 // Fail for lack of forward progress.
1016 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1017 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1018 break;
1019 }
1020
1021 // Check that there we didn't miss an expected break between the last one
1022 // and this one.
1023 for (i=prevBP+1; i<bp; i++) {
1024 if (t->getExpectedBreak(i) != 0) {
1025 int expected[] = {0, i};
1026 printStringBreaks(t->dataToBreak, expected, 2);
1027 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1028 i, t->getSrcLine(i), t->getSrcCol(i));
1029 }
1030 }
1031
1032 // Check that the break we did find was expected
1033 if (t->getExpectedBreak(bp) == 0) {
1034 int expected[] = {0, bp};
1035 printStringBreaks(t->textToBreak, expected, 2);
1036 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1037 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1038 } else {
1039 // The break was expected.
1040 // Check that the {nnn} tag value is correct.
1041 int32_t expectedTagVal = t->getExpectedBreak(bp);
1042 if (expectedTagVal == -1) {
1043 expectedTagVal = 0;
1044 }
1045 int32_t line = t->getSrcLine(bp);
1046 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1047 if (rs != expectedTagVal) {
1048 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1049 " Actual, Expected status = %4d, %4d",
1050 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1051 }
1052 }
1053
1054 prevBP = bp;
1055 }
1056
1057 // Verify that there were no missed expected breaks after the last one found
1058 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1059 if (t->getExpectedBreak(i) != 0) {
1060 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1061 i, t->getSrcLine(i), t->getSrcCol(i));
1062 }
1063 }
1064
1065 //
1066 // Run the iterator backwards, verify that the same breaks are found.
1067 //
1068 prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.
1069 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1070 if (prevBP == bp) {
1071 // Fail for lack of progress.
1072 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1073 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1074 break;
1075 }
1076
1077 // Check that we didn't miss an expected break between the last one
1078 // and this one. (UVector returns zeros for index out of bounds.)
1079 for (i=prevBP-1; i>bp; i--) {
1080 if (t->getExpectedBreak(i) != 0) {
1081 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1082 i, t->getSrcLine(i), t->getSrcCol(i));
1083 }
1084 }
1085
1086 // Check that the break we did find was expected
1087 if (t->getExpectedBreak(bp) == 0) {
1088 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1089 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1090 } else {
1091 // The break was expected.
1092 // Check that the {nnn} tag value is correct.
1093 int32_t expectedTagVal = t->getExpectedBreak(bp);
1094 if (expectedTagVal == -1) {
1095 expectedTagVal = 0;
1096 }
1097 int line = t->getSrcLine(bp);
1098 int32_t rs = t->bi->getRuleStatus();
1099 if (rs != expectedTagVal) {
1100 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1101 " Actual, Expected status = %4d, %4d",
1102 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1103 }
1104 }
1105
1106 prevBP = bp;
1107 }
1108
1109 // Verify that there were no missed breaks prior to the last one found
1110 for (i=prevBP-1; i>=0; i--) {
1111 if (t->getExpectedBreak(i) != 0) {
1112 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1113 i, t->getSrcLine(i), t->getSrcCol(i));
1114 }
1115 }
1116
1117 // Check isBoundary()
1118 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1119 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
1120 UBool boundaryFound = t->bi->isBoundary(i);
1121 if (boundaryExpected != boundaryFound) {
1122 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1123 " Expected, Actual= %s, %s",
1124 i, t->getSrcLine(i), t->getSrcCol(i),
1125 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1126 }
1127 }
1128
1129 // Check following()
1130 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1131 int32_t actualBreak = t->bi->following(i);
1132 int32_t expectedBreak = BreakIterator::DONE;
1133 for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1134 if (t->getExpectedBreak(j) != 0) {
1135 expectedBreak = j;
1136 break;
1137 }
1138 }
1139 if (expectedBreak != actualBreak) {
1140 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1141 " Expected, Actual= %d, %d",
1142 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1143 }
1144 }
1145
1146 // Check preceding()
1147 for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
1148 int32_t actualBreak = t->bi->preceding(i);
1149 int32_t expectedBreak = BreakIterator::DONE;
1150
1151 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1152 // preceding(trailing byte) will return the index of some preceding code point,
1153 // not the lead byte of the current code point, even though that has a smaller index.
1154 // Therefore, start looking at the expected break data not at i-1, but at
1155 // the start of code point index - 1.
1156 utext_setNativeIndex(t->textToBreak, i);
1157 int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1158 for (; j >= 0; j--) {
1159 if (t->getExpectedBreak(j) != 0) {
1160 expectedBreak = j;
1161 break;
1162 }
1163 }
1164 if (expectedBreak != actualBreak) {
1165 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1166 " Expected, Actual= %d, %d",
1167 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1168 }
1169 }
1170 }
1171
1172
1173 void RBBITest::TestExtended() {
1174 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1175 UErrorCode status = U_ZERO_ERROR;
1176 Locale locale("");
1177
1178 UnicodeString rules;
1179 TestParams tp(status);
1180
1181 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
1182 if (U_FAILURE(status)) {
1183 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1184 }
1185
1186
1187 //
1188 // Open and read the test data file.
1189 //
1190 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1191 char testFileName[1000];
1192 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1193 errln("Can't open test data. Path too long.");
1194 return;
1195 }
1196 strcpy(testFileName, testDataDirectory);
1197 strcat(testFileName, "rbbitst.txt");
1198
1199 int len;
1200 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1201 if (U_FAILURE(status)) {
1202 return; /* something went wrong, error already output */
1203 }
1204
1205
1206 bool skipTest = false; // Skip this test?
1207
1208 //
1209 // Put the test data into a UnicodeString
1210 //
1211 UnicodeString testString(FALSE, testFile, len);
1212
1213 enum EParseState{
1214 PARSE_COMMENT,
1215 PARSE_TAG,
1216 PARSE_DATA,
1217 PARSE_NUM
1218 }
1219 parseState = PARSE_TAG;
1220
1221 EParseState savedState = PARSE_TAG;
1222
1223 static const UChar CH_LF = 0x0a;
1224 static const UChar CH_CR = 0x0d;
1225 static const UChar CH_HASH = 0x23;
1226 /*static const UChar CH_PERIOD = 0x2e;*/
1227 static const UChar CH_LT = 0x3c;
1228 static const UChar CH_GT = 0x3e;
1229 static const UChar CH_BACKSLASH = 0x5c;
1230 static const UChar CH_BULLET = 0x2022;
1231
1232 int32_t lineNum = 1;
1233 int32_t colStart = 0;
1234 int32_t column = 0;
1235 int32_t charIdx = 0;
1236
1237 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
1238
1239 for (charIdx = 0; charIdx < len; ) {
1240 status = U_ZERO_ERROR;
1241 UChar c = testString.charAt(charIdx);
1242 charIdx++;
1243 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1244 // treat CRLF as a unit
1245 c = CH_LF;
1246 charIdx++;
1247 }
1248 if (c == CH_LF || c == CH_CR) {
1249 lineNum++;
1250 colStart = charIdx;
1251 }
1252 column = charIdx - colStart + 1;
1253
1254 switch (parseState) {
1255 case PARSE_COMMENT:
1256 if (c == 0x0a || c == 0x0d) {
1257 parseState = savedState;
1258 }
1259 break;
1260
1261 case PARSE_TAG:
1262 {
1263 if (c == CH_HASH) {
1264 parseState = PARSE_COMMENT;
1265 savedState = PARSE_TAG;
1266 break;
1267 }
1268 if (u_isUWhiteSpace(c)) {
1269 break;
1270 }
1271 if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1272 delete tp.bi;
1273 tp.bi = BreakIterator::createWordInstance(locale, status);
1274 skipTest = false;
1275 charIdx += 5;
1276 break;
1277 }
1278 if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1279 delete tp.bi;
1280 tp.bi = BreakIterator::createCharacterInstance(locale, status);
1281 skipTest = false;
1282 charIdx += 5;
1283 break;
1284 }
1285 if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1286 delete tp.bi;
1287 tp.bi = BreakIterator::createLineInstance(locale, status);
1288 skipTest = false;
1289 charIdx += 5;
1290 break;
1291 }
1292 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1293 delete tp.bi;
1294 tp.bi = BreakIterator::createSentenceInstance(locale, status);
1295 skipTest = false;
1296 charIdx += 5;
1297 break;
1298 }
1299 if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1300 delete tp.bi;
1301 tp.bi = BreakIterator::createTitleInstance(locale, status);
1302 charIdx += 6;
1303 break;
1304 }
1305
1306 // <locale loc_name>
1307 localeMatcher.reset(testString);
1308 if (localeMatcher.lookingAt(charIdx-1, status)) {
1309 UnicodeString localeName = localeMatcher.group(1, status);
1310 char localeName8[100];
1311 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1312 locale = Locale::createFromName(localeName8);
1313 charIdx += localeMatcher.group(0, status).length() - 1;
1314 TEST_ASSERT_SUCCESS(status);
1315 break;
1316 }
1317 if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1318 parseState = PARSE_DATA;
1319 charIdx += 5;
1320 tp.dataToBreak = "";
1321 tp.expectedBreaks->removeAllElements();
1322 tp.srcCol ->removeAllElements();
1323 tp.srcLine->removeAllElements();
1324 break;
1325 }
1326
1327 errln("line %d: Tag expected in test file.", lineNum);
1328 parseState = PARSE_COMMENT;
1329 savedState = PARSE_DATA;
1330 goto end_test; // Stop the test.
1331 }
1332 break;
1333
1334 case PARSE_DATA:
1335 if (c == CH_BULLET) {
1336 int32_t breakIdx = tp.dataToBreak.length();
1337 tp.expectedBreaks->setSize(breakIdx+1);
1338 tp.expectedBreaks->setElementAt(-1, breakIdx);
1339 tp.srcLine->setSize(breakIdx+1);
1340 tp.srcLine->setElementAt(lineNum, breakIdx);
1341 tp.srcCol ->setSize(breakIdx+1);
1342 tp.srcCol ->setElementAt(column, breakIdx);
1343 break;
1344 }
1345
1346 if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1347 // Add final entry to mappings from break location to source file position.
1348 // Need one extra because last break position returned is after the
1349 // last char in the data, not at the last char.
1350 tp.srcLine->addElement(lineNum, status);
1351 tp.srcCol ->addElement(column, status);
1352
1353 parseState = PARSE_TAG;
1354 charIdx += 6;
1355
1356 if (!skipTest) {
1357 // RUN THE TEST!
1358 status = U_ZERO_ERROR;
1359 tp.setUTF16(status);
1360 executeTest(&tp, status);
1361 TEST_ASSERT_SUCCESS(status);
1362
1363 // Run again, this time with UTF-8 text wrapped in a UText.
1364 status = U_ZERO_ERROR;
1365 tp.setUTF8(status);
1366 TEST_ASSERT_SUCCESS(status);
1367 executeTest(&tp, status);
1368 }
1369 break;
1370 }
1371
1372 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1373 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1374 // Get the code point from the name and insert it into the test data.
1375 // (Damn, no API takes names in Unicode !!!
1376 // we've got to take it back to char *)
1377 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1378 int32_t nameLength = nameEndIdx - (charIdx+2);
1379 char charNameBuf[200];
1380 UChar32 theChar = -1;
1381 if (nameEndIdx != -1) {
1382 UErrorCode status = U_ZERO_ERROR;
1383 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1384 charNameBuf[sizeof(charNameBuf)-1] = 0;
1385 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1386 if (U_FAILURE(status)) {
1387 theChar = -1;
1388 }
1389 }
1390 if (theChar == -1) {
1391 errln("Error in named character in test file at line %d, col %d",
1392 lineNum, column);
1393 } else {
1394 // Named code point was recognized. Insert it
1395 // into the test data.
1396 tp.dataToBreak.append(theChar);
1397 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1398 tp.srcLine->addElement(lineNum, status);
1399 tp.srcCol ->addElement(column, status);
1400 }
1401 }
1402 if (nameEndIdx > charIdx) {
1403 charIdx = nameEndIdx+1;
1404
1405 }
1406 break;
1407 }
1408
1409
1410
1411
1412 if (testString.compare(charIdx-1, 2, "<>") == 0) {
1413 charIdx++;
1414 int32_t breakIdx = tp.dataToBreak.length();
1415 tp.expectedBreaks->setSize(breakIdx+1);
1416 tp.expectedBreaks->setElementAt(-1, breakIdx);
1417 tp.srcLine->setSize(breakIdx+1);
1418 tp.srcLine->setElementAt(lineNum, breakIdx);
1419 tp.srcCol ->setSize(breakIdx+1);
1420 tp.srcCol ->setElementAt(column, breakIdx);
1421 break;
1422 }
1423
1424 if (c == CH_LT) {
1425 tagValue = 0;
1426 parseState = PARSE_NUM;
1427 break;
1428 }
1429
1430 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
1431 parseState = PARSE_COMMENT;
1432 savedState = PARSE_DATA;
1433 break;
1434 }
1435
1436 if (c == CH_BACKSLASH) {
1437 // Check for \ at end of line, a line continuation.
1438 // Advance over (discard) the newline
1439 UChar32 cp = testString.char32At(charIdx);
1440 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1441 // We have a CR LF
1442 // Need an extra increment of the input ptr to move over both of them
1443 charIdx++;
1444 }
1445 if (cp == CH_LF || cp == CH_CR) {
1446 lineNum++;
1447 colStart = charIdx;
1448 charIdx++;
1449 break;
1450 }
1451
1452 // Let unescape handle the back slash.
1453 cp = testString.unescapeAt(charIdx);
1454 if (cp != -1) {
1455 // Escape sequence was recognized. Insert the char
1456 // into the test data.
1457 tp.dataToBreak.append(cp);
1458 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1459 tp.srcLine->addElement(lineNum, status);
1460 tp.srcCol ->addElement(column, status);
1461 }
1462 break;
1463 }
1464
1465
1466 // Not a recognized backslash escape sequence.
1467 // Take the next char as a literal.
1468 // TODO: Should this be an error?
1469 c = testString.charAt(charIdx);
1470 charIdx = testString.moveIndex32(charIdx, 1);
1471 }
1472
1473 // Normal, non-escaped data char.
1474 tp.dataToBreak.append(c);
1475
1476 // Save the mapping from offset in the data to line/column numbers in
1477 // the original input file. Will be used for better error messages only.
1478 // If there's an expected break before this char, the slot in the mapping
1479 // vector will already be set for this char; don't overwrite it.
1480 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1481 tp.srcLine->addElement(lineNum, status);
1482 tp.srcCol ->addElement(column, status);
1483 }
1484 break;
1485
1486
1487 case PARSE_NUM:
1488 // We are parsing an expected numeric tag value, like <1234>,
1489 // within a chunk of data.
1490 if (u_isUWhiteSpace(c)) {
1491 break;
1492 }
1493
1494 if (c == CH_GT) {
1495 // Finished the number. Add the info to the expected break data,
1496 // and switch parse state back to doing plain data.
1497 parseState = PARSE_DATA;
1498 if (tagValue == 0) {
1499 tagValue = -1;
1500 }
1501 int32_t breakIdx = tp.dataToBreak.length();
1502 tp.expectedBreaks->setSize(breakIdx+1);
1503 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1504 tp.srcLine->setSize(breakIdx+1);
1505 tp.srcLine->setElementAt(lineNum, breakIdx);
1506 tp.srcCol ->setSize(breakIdx+1);
1507 tp.srcCol ->setElementAt(column, breakIdx);
1508 break;
1509 }
1510
1511 if (u_isdigit(c)) {
1512 tagValue = tagValue*10 + u_charDigitValue(c);
1513 break;
1514 }
1515
1516 errln("Syntax Error in test file at line %d, col %d",
1517 lineNum, column);
1518 parseState = PARSE_COMMENT;
1519 goto end_test; // Stop the test
1520 break;
1521 }
1522
1523
1524 if (U_FAILURE(status)) {
1525 dataerrln("ICU Error %s while parsing test file at line %d.",
1526 u_errorName(status), lineNum);
1527 status = U_ZERO_ERROR;
1528 goto end_test; // Stop the test
1529 }
1530
1531 }
1532
1533 end_test:
1534 delete [] testFile;
1535 #endif
1536 }
1537
1538
1539 //-------------------------------------------------------------------------------
1540 //
1541 // TestDictRules create a break iterator from source rules that includes a
//                      dictionary range. Regression for bug #7130. Source rules
//                      do not declare a break iterator type (word, line, sentence, etc.),
//                      but the dictionary code, without a type, would loop.
1545 //
1546 //-------------------------------------------------------------------------------
1547 void RBBITest::TestDictRules() {
1548 const char *rules = "$dictionary = [a-z]; \n"
1549 "!!forward; \n"
1550 "$dictionary $dictionary; \n"
1551 "!!reverse; \n"
1552 "$dictionary $dictionary; \n";
1553 const char *text = "aa";
1554 UErrorCode status = U_ZERO_ERROR;
1555 UParseError parseError;
1556
1557 RuleBasedBreakIterator bi(rules, parseError, status);
1558 if (U_SUCCESS(status)) {
1559 UnicodeString utext = text;
1560 bi.setText(utext);
1561 int32_t position;
1562 int32_t loops;
1563 for (loops = 0; loops<10; loops++) {
1564 position = bi.next();
1565 if (position == RuleBasedBreakIterator::DONE) {
1566 break;
1567 }
1568 }
1569 TEST_ASSERT(loops == 1);
1570 } else {
1571 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1572 }
1573 }
1574
1575
1576
1577 //-------------------------------------------------------------------------------
1578 //
1579 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1580 // return the data in one big UChar * buffer, which the caller must delete.
1581 //
1582 // parameters:
//      fileName:   the path of the file to read. It is passed to fopen() as given,
//                  so the caller supplies any directory part.
1585 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1586 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1587 // specified here. The BOM, if it exists, will be stripped from the returned data.
1588 // Pass NULL for the system default encoding.
1589 // status
1590 // returns:
1591 // The file data, converted to UChar.
1592 // The caller must delete this when done with
1593 // delete [] theBuffer;
1594 //
1595 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1596 // Move this function to some common place.
1597 //
1598 //--------------------------------------------------------------------------------
1599 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1600 UChar *retPtr = NULL;
1601 char *fileBuf = NULL;
1602 UConverter* conv = NULL;
1603 FILE *f = NULL;
1604
1605 ulen = 0;
1606 if (U_FAILURE(status)) {
1607 return retPtr;
1608 }
1609
1610 //
1611 // Open the file.
1612 //
1613 f = fopen(fileName, "rb");
1614 if (f == 0) {
1615 dataerrln("Error opening test data file %s\n", fileName);
1616 status = U_FILE_ACCESS_ERROR;
1617 return NULL;
1618 }
1619 //
1620 // Read it in
1621 //
1622 int fileSize;
1623 int amt_read;
1624
1625 fseek( f, 0, SEEK_END);
1626 fileSize = ftell(f);
1627 fileBuf = new char[fileSize];
1628 fseek(f, 0, SEEK_SET);
1629 amt_read = fread(fileBuf, 1, fileSize, f);
1630 if (amt_read != fileSize || fileSize <= 0) {
1631 errln("Error reading test data file.");
1632 goto cleanUpAndReturn;
1633 }
1634
1635 //
1636 // Look for a Unicode Signature (BOM) on the data just read
1637 //
1638 int32_t signatureLength;
1639 const char * fileBufC;
1640 const char* bomEncoding;
1641
1642 fileBufC = fileBuf;
1643 bomEncoding = ucnv_detectUnicodeSignature(
1644 fileBuf, fileSize, &signatureLength, &status);
1645 if(bomEncoding!=NULL ){
1646 fileBufC += signatureLength;
1647 fileSize -= signatureLength;
1648 encoding = bomEncoding;
1649 }
1650
1651 //
1652 // Open a converter to take the rule file to UTF-16
1653 //
1654 conv = ucnv_open(encoding, &status);
1655 if (U_FAILURE(status)) {
1656 goto cleanUpAndReturn;
1657 }
1658
1659 //
1660 // Convert the rules to UChar.
1661 // Preflight first to determine required buffer size.
1662 //
1663 ulen = ucnv_toUChars(conv,
1664 NULL, // dest,
1665 0, // destCapacity,
1666 fileBufC,
1667 fileSize,
1668 &status);
1669 if (status == U_BUFFER_OVERFLOW_ERROR) {
1670 // Buffer Overflow is expected from the preflight operation.
1671 status = U_ZERO_ERROR;
1672
1673 retPtr = new UChar[ulen+1];
1674 ucnv_toUChars(conv,
1675 retPtr, // dest,
1676 ulen+1,
1677 fileBufC,
1678 fileSize,
1679 &status);
1680 }
1681
1682 cleanUpAndReturn:
1683 fclose(f);
1684 delete []fileBuf;
1685 ucnv_close(conv);
1686 if (U_FAILURE(status)) {
1687 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1688 delete []retPtr;
1689 retPtr = 0;
1690 ulen = 0;
1691 };
1692 return retPtr;
1693 }
1694
1695
1696
1697 //--------------------------------------------------------------------------------------------
1698 //
1699 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1700 //
1701 //-------------------------------------------------------------------------------------------
1702 void RBBITest::TestUnicodeFiles() {
1703 RuleBasedBreakIterator *bi;
1704 UErrorCode status = U_ZERO_ERROR;
1705
1706 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1707 TEST_ASSERT_SUCCESS(status);
1708 if (U_SUCCESS(status)) {
1709 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1710 }
1711 delete bi;
1712
1713 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1714 TEST_ASSERT_SUCCESS(status);
1715 if (U_SUCCESS(status)) {
1716 runUnicodeTestData("WordBreakTest.txt", bi);
1717 }
1718 delete bi;
1719
1720 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1721 TEST_ASSERT_SUCCESS(status);
1722 if (U_SUCCESS(status)) {
1723 runUnicodeTestData("SentenceBreakTest.txt", bi);
1724 }
1725 delete bi;
1726
1727 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1728 TEST_ASSERT_SUCCESS(status);
1729 if (U_SUCCESS(status)) {
1730 runUnicodeTestData("LineBreakTest.txt", bi);
1731 }
1732 delete bi;
1733 }
1734
1735
1736 // Check for test cases from the Unicode test data files that are known to fail
1737 // and should be skipped because ICU is not yet able to fully implement the spec.
1738 // See ticket #7270.
1739
1740 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1741 static const UChar badTestCases[][4] = { // Line Numbers from Unicode 7.0.0 file.
1742 {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000}, // Line 5198
1743 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000}, // Line 5202
1744 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000}, // Line 5214
1745 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000}, // Line 5246
1746 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000}, // Line 5298
1747 {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000} // Line 5302
1748 };
1749 if (strcmp(fileName, "LineBreakTest.txt") != 0) {
1750 return FALSE;
1751 }
1752
1753 for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
1754 if (testCase == UnicodeString(badTestCases[i])) {
1755 return logKnownIssue("7270");
1756 }
1757 }
1758 return FALSE;
1759 }
1760
1761
1762 //--------------------------------------------------------------------------------------------
1763 //
1764 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1765 //
1766 //-------------------------------------------------------------------------------------------
1767 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1768 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1769 UErrorCode status = U_ZERO_ERROR;
1770
1771 //
1772 // Open and read the test data file, put it into a UnicodeString.
1773 //
1774 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1775 char testFileName[1000];
1776 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1777 dataerrln("Can't open test data. Path too long.");
1778 return;
1779 }
1780 strcpy(testFileName, testDataDirectory);
1781 strcat(testFileName, fileName);
1782
1783 logln("Opening data file %s\n", fileName);
1784
1785 int len;
1786 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1787 if (status != U_FILE_ACCESS_ERROR) {
1788 TEST_ASSERT_SUCCESS(status);
1789 TEST_ASSERT(testFile != NULL);
1790 }
1791 if (U_FAILURE(status) || testFile == NULL) {
1792 return; /* something went wrong, error already output */
1793 }
1794 UnicodeString testFileAsString(TRUE, testFile, len);
1795
1796 //
1797 // Parse the test data file using a regular expression.
1798 // Each kind of token is recognized in its own capture group; what type of item was scanned
1799 // is identified by which group had a match.
1800 //
1801 // Caputure Group # 1 2 3 4 5
1802 // Parses this item: divide x hex digits comment \n unrecognized \n
1803 //
1804 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1805 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1806 UnicodeString testString;
1807 UVector32 breakPositions(status);
1808 int lineNumber = 1;
1809 TEST_ASSERT_SUCCESS(status);
1810 if (U_FAILURE(status)) {
1811 return;
1812 }
1813
1814 //
1815 // Scan through each test case, building up the string to be broken in testString,
1816 // and the positions that should be boundaries in the breakPositions vector.
1817 //
1818 int spin = 0;
1819 while (tokenMatcher.find()) {
1820 if(tokenMatcher.hitEnd()) {
1821 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1822 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1823 and caused an infinite loop here on EBCDIC systems!
1824 */
1825 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1826 // return;
1827 }
1828 if (tokenMatcher.start(1, status) >= 0) {
1829 // Scanned a divide sign, indicating a break position in the test data.
1830 if (testString.length()>0) {
1831 breakPositions.addElement(testString.length(), status);
1832 }
1833 }
1834 else if (tokenMatcher.start(2, status) >= 0) {
1835 // Scanned an 'x', meaning no break at this position in the test data
1836 // Nothing to be done here.
1837 }
1838 else if (tokenMatcher.start(3, status) >= 0) {
1839 // Scanned Hex digits. Convert them to binary, append to the character data string.
1840 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1841 int length = hexNumber.length();
1842 if (length<=8) {
1843 char buf[10];
1844 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1845 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1846 if (c<=0x10ffff) {
1847 testString.append(c);
1848 } else {
1849 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1850 fileName, lineNumber);
1851 }
1852 } else {
1853 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1854 fileName, lineNumber);
1855 }
1856 }
1857 else if (tokenMatcher.start(4, status) >= 0) {
1858 // Scanned to end of a line, possibly skipping over a comment in the process.
1859 // If the line from the file contained test data, run the test now.
1860 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1861 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1862 }
1863
1864 // Clear out this test case.
1865 // The string and breakPositions vector will be refilled as the next
1866 // test case is parsed.
1867 testString.remove();
1868 breakPositions.removeAllElements();
1869 lineNumber++;
1870 } else {
1871 // Scanner catchall. Something unrecognized appeared on the line.
1872 char token[16];
1873 UnicodeString uToken = tokenMatcher.group(0, status);
1874 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1875 token[sizeof(token)-1] = 0;
1876 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1877
1878 // Clean up, in preparation for continuing with the next line.
1879 testString.remove();
1880 breakPositions.removeAllElements();
1881 lineNumber++;
1882 }
1883 TEST_ASSERT_SUCCESS(status);
1884 if (U_FAILURE(status)) {
1885 break;
1886 }
1887 }
1888
1889 delete [] testFile;
1890 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1891 }
1892
1893 //--------------------------------------------------------------------------------------------
1894 //
1895 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1896 // test data files. Do only a simple, forward-only check -
1897 // this test is mostly to check that ICU and the Unicode
1898 // data agree with each other.
1899 //
1900 //--------------------------------------------------------------------------------------------
1901 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1902 const UnicodeString &testString, // Text data to be broken
1903 UVector32 *breakPositions, // Positions where breaks should be found.
1904 RuleBasedBreakIterator *bi) {
1905 int32_t pos; // Break Position in the test string
1906 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1907 int32_t expectedPos; // Expected break position (index into test string)
1908
1909 bi->setText(testString);
1910 pos = bi->first();
1911 pos = bi->next();
1912
1913 while (pos != BreakIterator::DONE) {
1914 if (expectedI >= breakPositions->size()) {
1915 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1916 testFileName, lineNumber, pos);
1917 break;
1918 }
1919 expectedPos = breakPositions->elementAti(expectedI);
1920 if (pos < expectedPos) {
1921 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1922 testFileName, lineNumber, pos);
1923 break;
1924 }
1925 if (pos > expectedPos) {
1926 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1927 testFileName, lineNumber, expectedPos);
1928 break;
1929 }
1930 pos = bi->next();
1931 expectedI++;
1932 }
1933
1934 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1935 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1936 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1937 }
1938 }
1939
1940
1941
1942 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1943 //---------------------------------------------------------------------------------------
1944 //
1945 // classs RBBIMonkeyKind
1946 //
1947 // Monkey Test for Break Iteration
1948 // Abstract interface class. Concrete derived classes independently
1949 // implement the break rules for different iterator types.
1950 //
1951 // The Monkey Test itself uses doesn't know which type of break iterator it is
1952 // testing, but works purely in terms of the interface defined here.
1953 //
1954 //---------------------------------------------------------------------------------------
1955 class RBBIMonkeyKind {
1956 public:
1957 // Return a UVector of UnicodeSets, representing the character classes used
1958 // for this type of iterator.
1959 virtual UVector *charClasses() = 0;
1960
1961 // Set the test text on which subsequent calls to next() will operate
1962 virtual void setText(const UnicodeString &s) = 0;
1963
1964 // Find the next break postion, starting from the prev break position, or from zero.
1965 // Return -1 after reaching end of string.
1966 virtual int32_t next(int32_t i) = 0;
1967
1968 virtual ~RBBIMonkeyKind();
1969 UErrorCode deferredStatus;
1970
1971
1972 protected:
1973 RBBIMonkeyKind();
1974
1975 private:
1976 };
1977
1978 RBBIMonkeyKind::RBBIMonkeyKind() {
1979 deferredStatus = U_ZERO_ERROR;
1980 }
1981
1982 RBBIMonkeyKind::~RBBIMonkeyKind() {
1983 }
1984
1985
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to the standard library rand() and srand().
//                    The library functions are not used, in order to
//                      1.  Get the same results on all platforms.
//                      2.  Have access to the current seed, which makes it
//                          easier to reproduce failures.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Advance the generator state in m_seed and return a value in the range 0..32767.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    // Unsigned 32 bit arithmetic; wrap-around on overflow is intended.
    m_seed = m_seed * kMultiplier + kIncrement;

    // Bits 16..30 of the state: the same value as (m_seed / 65536) % 32768.
    return (m_seed >> 16) & 0x7FFF;
}
2001
2002
2003 //------------------------------------------------------------------------------------------
2004 //
2005 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
2006 // of RBBIMonkeyKind.
2007 //
2008 //------------------------------------------------------------------------------------------
2009 class RBBICharMonkey: public RBBIMonkeyKind {
2010 public:
2011 RBBICharMonkey();
2012 virtual ~RBBICharMonkey();
2013 virtual UVector *charClasses();
2014 virtual void setText(const UnicodeString &s);
2015 virtual int32_t next(int32_t i);
2016 private:
2017 UVector *fSets;
2018
2019 UnicodeSet *fCRLFSet;
2020 UnicodeSet *fControlSet;
2021 UnicodeSet *fExtendSet;
2022 UnicodeSet *fRegionalIndicatorSet;
2023 UnicodeSet *fPrependSet;
2024 UnicodeSet *fSpacingSet;
2025 UnicodeSet *fLSet;
2026 UnicodeSet *fVSet;
2027 UnicodeSet *fTSet;
2028 UnicodeSet *fLVSet;
2029 UnicodeSet *fLVTSet;
2030 UnicodeSet *fHangulSet;
2031 UnicodeSet *fAnySet;
2032 UnicodeSet *fEmojiModifierSet;
2033 UnicodeSet *fEmojiBaseSet;
2034 UnicodeSet *fZWJSet;
2035 UnicodeSet *fGAZSet;
2036
2037 const UnicodeString *fText;
2038 };
2039
2040
2041 RBBICharMonkey::RBBICharMonkey() {
2042 UErrorCode status = U_ZERO_ERROR;
2043
2044 fText = NULL;
2045
2046 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2047 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]"), status);
2048 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]"), status);
2049 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
2050 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2051 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2052 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2053 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2054 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2055 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2056 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2057 fHangulSet = new UnicodeSet();
2058 fHangulSet->addAll(*fLSet);
2059 fHangulSet->addAll(*fVSet);
2060 fHangulSet->addAll(*fTSet);
2061 fHangulSet->addAll(*fLVSet);
2062 fHangulSet->addAll(*fLVTSet);
2063 fAnySet = new UnicodeSet(0, 0x10ffff);
2064
2065
2066
2067 fEmojiBaseSet = new UnicodeSet(UnicodeString(
2068 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C2-\\U0001F3C4\\U0001F3C7\\U0001F3CA-\\U0001F3CC"
2069 "\\U0001F442-\\U0001F443\\U0001F446-\\U0001F450\\U0001F466-\\U0001F478\\U0001F47C"
2070 "\\U0001F481-\\U0001F483\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F574-\\U0001F575\\U0001F57A\\U0001F590\\U0001F595-\\U0001F596"
2071 "\\U0001F645-\\U0001F647\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F6CC"
2072 "\\U0001F918-\\U0001F91E\\U0001F926\\U0001F930\\U0001F933-\\U0001F939\\U0001F93C-\\U0001F93E]"), status);
2073
2074 fEmojiModifierSet = new UnicodeSet(0x0001F3FB, 0x0001F3FF);
2075 fZWJSet = new UnicodeSet(0x200D, 0x200D);
2076 fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u2640\\u2642\\u2695-\\u2696\\u2708\\u2764"
2077 "\\U0001F308\\U0001F33E\\U0001F373\\U0001F393\\U0001F3A4\\U0001F3A8\\U0001F3EB\\U0001F3ED"
2078 "\\U0001F466-\\U0001F469\\U0001F48B\\U0001F4BB-\\U0001F4BC\\U0001F527\\U0001F52C\\U0001F5E8"
2079 "\\U0001F680\\U0001F692]"), status);
2080
2081 fSets = new UVector(status);
2082 fSets->addElement(fCRLFSet, status);
2083 fSets->addElement(fControlSet, status);
2084 fSets->addElement(fExtendSet, status);
2085 fSets->addElement(fRegionalIndicatorSet, status);
2086 if (!fPrependSet->isEmpty()) {
2087 fSets->addElement(fPrependSet, status);
2088 }
2089 fSets->addElement(fSpacingSet, status);
2090 fSets->addElement(fHangulSet, status);
2091 fSets->addElement(fAnySet, status);
2092 fSets->addElement(fEmojiBaseSet, status);
2093 fSets->addElement(fEmojiModifierSet, status);
2094 fSets->addElement(fZWJSet, status);
2095 fSets->addElement(fGAZSet, status);
2096 if (U_FAILURE(status)) {
2097 deferredStatus = status;
2098 }
2099 }
2100
2101
2102 void RBBICharMonkey::setText(const UnicodeString &s) {
2103 fText = &s;
2104 }
2105
2106
2107
2108 int32_t RBBICharMonkey::next(int32_t prevPos) {
2109 int p0, p1, p2, p3; // Indices of the significant code points around the
2110 // break position being tested. The candidate break
2111 // location is before p2.
2112
2113 int breakPos = -1;
2114
2115 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2116 UChar32 cBase; // for (X Extend*) patterns, the X character.
2117
2118 if (U_FAILURE(deferredStatus)) {
2119 return -1;
2120 }
2121
2122 // Previous break at end of string. return DONE.
2123 if (prevPos >= fText->length()) {
2124 return -1;
2125 }
2126 p0 = p1 = p2 = p3 = prevPos;
2127 c3 = fText->char32At(prevPos);
2128 c0 = c1 = c2 = cBase = 0;
2129 (void)p0; // suppress set but not used warning.
2130 (void)c0;
2131
2132 // Loop runs once per "significant" character position in the input text.
2133 for (;;) {
2134 // Move all of the positions forward in the input string.
2135 p0 = p1; c0 = c1;
2136 p1 = p2; c1 = c2;
2137 p2 = p3; c2 = c3;
2138
2139 // Advancd p3 by one codepoint
2140 p3 = fText->moveIndex32(p3, 1);
2141 c3 = fText->char32At(p3);
2142
2143 if (p1 == p2) {
2144 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2145 continue;
2146 }
2147 if (p2 == fText->length()) {
2148 // Reached end of string. Always a break position.
2149 break;
2150 }
2151
2152 // Rule GB3 CR x LF
2153 // No Extend or Format characters may appear between the CR and LF,
2154 // which requires the additional check for p2 immediately following p1.
2155 //
2156 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2157 continue;
2158 }
2159
2160 // Rule (GB4). ( Control | CR | LF ) <break>
2161 if (fControlSet->contains(c1) ||
2162 c1 == 0x0D ||
2163 c1 == 0x0A) {
2164 break;
2165 }
2166
2167 // Rule (GB5) <break> ( Control | CR | LF )
2168 //
2169 if (fControlSet->contains(c2) ||
2170 c2 == 0x0D ||
2171 c2 == 0x0A) {
2172 break;
2173 }
2174
2175
2176 // Rule (GB6) L x ( L | V | LV | LVT )
2177 if (fLSet->contains(c1) &&
2178 (fLSet->contains(c2) ||
2179 fVSet->contains(c2) ||
2180 fLVSet->contains(c2) ||
2181 fLVTSet->contains(c2))) {
2182 continue;
2183 }
2184
2185 // Rule (GB7) ( LV | V ) x ( V | T )
2186 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2187 (fVSet->contains(c2) || fTSet->contains(c2))) {
2188 continue;
2189 }
2190
2191 // Rule (GB8) ( LVT | T) x T
2192 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2193 fTSet->contains(c2)) {
2194 continue;
2195 }
2196
2197 // Rule (GB9) x (Extend | ZWJ)
2198 if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
2199 if (!fExtendSet->contains(c1)) {
2200 cBase = c1;
2201 }
2202 continue;
2203 }
2204
2205 // Rule (GB9a) x SpacingMark
2206 if (fSpacingSet->contains(c2)) {
2207 continue;
2208 }
2209
2210 // Rule (GB9b) Prepend x
2211 if (fPrependSet->contains(c1)) {
2212 continue;
2213 }
2214
2215 // Rule (GB10) ($E_Base | $GAZ) $Extend* $E_Modifier;
2216 if ((fEmojiBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
2217 continue;
2218 }
2219 if ((fEmojiBaseSet->contains(cBase) || fGAZSet->contains(cBase)) &&
2220 fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
2221 continue;
2222 }
2223
2224 // Rule (GB11) ZWJ x Glue_After_Zwj
2225 if (fZWJSet->contains(c1) && fGAZSet->contains(c2)) {
2226 continue;
2227 }
2228
2229 // Rule (GB12-13) Regional_Indicator x Regional_Indicator
2230 // Note: The first if condition is a little tricky. We only need to force
2231 // a break if there are three or more contiguous RIs. If there are
2232 // only two, a break following will occur via other rules, and will include
2233 // any trailing extend characters, which is needed behavior.
2234 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
2235 && fRegionalIndicatorSet->contains(c2)) {
2236 break;
2237 }
2238 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2239 continue;
2240 }
2241
2242 // Rule (GB999) Any <break> Any
2243 break;
2244 }
2245
2246 breakPos = p2;
2247 return breakPos;
2248 }
2249
2250
2251
2252 UVector *RBBICharMonkey::charClasses() {
2253 return fSets;
2254 }
2255
2256
2257 RBBICharMonkey::~RBBICharMonkey() {
2258 delete fSets;
2259 delete fCRLFSet;
2260 delete fControlSet;
2261 delete fExtendSet;
2262 delete fRegionalIndicatorSet;
2263 delete fPrependSet;
2264 delete fSpacingSet;
2265 delete fLSet;
2266 delete fVSet;
2267 delete fTSet;
2268 delete fLVSet;
2269 delete fLVTSet;
2270 delete fHangulSet;
2271 delete fAnySet;
2272 delete fEmojiBaseSet;
2273 delete fEmojiModifierSet;
2274 delete fZWJSet;
2275 delete fGAZSet;
2276 }
2277
2278 //------------------------------------------------------------------------------------------
2279 //
2280 // class RBBIWordMonkey Word Break specific implementation
2281 // of RBBIMonkeyKind.
2282 //
2283 //------------------------------------------------------------------------------------------
2284 class RBBIWordMonkey: public RBBIMonkeyKind {
2285 public:
2286 RBBIWordMonkey();
2287 virtual ~RBBIWordMonkey();
2288 virtual UVector *charClasses();
2289 virtual void setText(const UnicodeString &s);
2290 virtual int32_t next(int32_t i);
2291 private:
2292 UVector *fSets;
2293
2294 UnicodeSet *fCRSet;
2295 UnicodeSet *fLFSet;
2296 UnicodeSet *fNewlineSet;
2297 UnicodeSet *fRegionalIndicatorSet;
2298 UnicodeSet *fKatakanaSet;
2299 UnicodeSet *fHebrew_LetterSet;
2300 UnicodeSet *fALetterSet;
2301 // TODO(jungshik): Do we still need this change?
2302 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
2303 UnicodeSet *fSingle_QuoteSet;
2304 UnicodeSet *fDouble_QuoteSet;
2305 UnicodeSet *fMidNumLetSet;
2306 UnicodeSet *fMidLetterSet;
2307 UnicodeSet *fMidNumSet;
2308 UnicodeSet *fNumericSet;
2309 UnicodeSet *fFormatSet;
2310 UnicodeSet *fOtherSet;
2311 UnicodeSet *fExtendSet;
2312 UnicodeSet *fExtendNumLetSet;
2313 UnicodeSet *fDictionaryCjkSet;
2314 UnicodeSet *fEBaseSet;
2315 UnicodeSet *fEModifierSet;
2316 UnicodeSet *fZWSSet;
2317 UnicodeSet *fGAZSet;
2318
2319 const UnicodeString *fText;
2320 };
2321
2322
2323 RBBIWordMonkey::RBBIWordMonkey()
2324 {
2325 UErrorCode status = U_ZERO_ERROR;
2326
2327 fSets = new UVector(status);
2328
2329 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
2330 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
2331 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
2332 fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2333 // Exclude Hangul syllables from ALetterSet during testing.
2334 // Leave CJK dictionary characters out from the monkey tests!
2335 #if 0
2336 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
2337 "[\\p{Line_Break = Complex_Context}"
2338 "-\\p{Grapheme_Cluster_Break = Extend}"
2339 "-\\p{Grapheme_Cluster_Break = Control}"
2340 "]]",
2341 status);
2342 #endif
2343 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2344 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
2345 fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2346 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2347 fALetterSet->removeAll(*fDictionaryCjkSet);
2348 fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status);
2349 fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status);
2350 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
2351 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter} - [\\:]]"), status);
2352 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
2353 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2354 // we should figure out why
2355 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
2356 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
2357 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2358 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
2359
2360 fEBaseSet = new UnicodeSet(UnicodeString(
2361 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C2-\\U0001F3C4\\U0001F3C7\\U0001F3CA-\\U0001F3CC"
2362 "\\U0001F442-\\U0001F443\\U0001F446-\\U0001F450\\U0001F466-\\U0001F478\\U0001F47C"
2363 "\\U0001F481-\\U0001F483\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F574-\\U0001F575\\U0001F57A\\U0001F590\\U0001F595-\\U0001F596"
2364 "\\U0001F645-\\U0001F647\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F6CC"
2365 "\\U0001F918-\\U0001F91E\\U0001F926\\U0001F930\\U0001F933-\\U0001F939\\U0001F93C-\\U0001F93E]"), status);
2366
2367 fEModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
2368 fZWSSet = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);;
2369 fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u2640\\u2642\\u2695-\\u2696\\u2708\\u2764"
2370 "\\U0001F308\\U0001F33E\\U0001F373\\U0001F393\\U0001F3A4\\U0001F3A8\\U0001F3EB\\U0001F3ED"
2371 "\\U0001F466-\\U0001F469\\U0001F48B\\U0001F4BB-\\U0001F4BC\\U0001F527\\U0001F52C\\U0001F5E8"
2372 "\\U0001F680\\U0001F692]"), status);
2373 fExtendSet->removeAll(*fZWSSet);
2374
2375
2376 fOtherSet = new UnicodeSet();
2377 if(U_FAILURE(status)) {
2378 deferredStatus = status;
2379 return;
2380 }
2381
2382 fOtherSet->complement();
2383 fOtherSet->removeAll(*fCRSet);
2384 fOtherSet->removeAll(*fLFSet);
2385 fOtherSet->removeAll(*fNewlineSet);
2386 fOtherSet->removeAll(*fKatakanaSet);
2387 fOtherSet->removeAll(*fHebrew_LetterSet);
2388 fOtherSet->removeAll(*fALetterSet);
2389 fOtherSet->removeAll(*fSingle_QuoteSet);
2390 fOtherSet->removeAll(*fDouble_QuoteSet);
2391 fOtherSet->removeAll(*fMidLetterSet);
2392 fOtherSet->removeAll(*fMidNumSet);
2393 fOtherSet->removeAll(*fNumericSet);
2394 fOtherSet->removeAll(*fExtendNumLetSet);
2395 fOtherSet->removeAll(*fFormatSet);
2396 fOtherSet->removeAll(*fExtendSet);
2397 fOtherSet->removeAll(*fRegionalIndicatorSet);
2398 fOtherSet->removeAll(*fEBaseSet);
2399 fOtherSet->removeAll(*fEModifierSet);
2400 fOtherSet->removeAll(*fZWSSet);
2401 fOtherSet->removeAll(*fGAZSet);
2402
2403 // Inhibit dictionary characters from being tested at all.
2404 fOtherSet->removeAll(*fDictionaryCjkSet);
2405 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2406
2407 fSets->addElement(fCRSet, status);
2408 fSets->addElement(fLFSet, status);
2409 fSets->addElement(fNewlineSet, status);
2410 fSets->addElement(fRegionalIndicatorSet, status);
2411 fSets->addElement(fHebrew_LetterSet, status);
2412 fSets->addElement(fALetterSet, status);
2413 fSets->addElement(fSingle_QuoteSet, status);
2414 fSets->addElement(fDouble_QuoteSet, status);
2415 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
2416 fSets->addElement(fMidLetterSet, status);
2417 fSets->addElement(fMidNumLetSet, status);
2418 fSets->addElement(fMidNumSet, status);
2419 fSets->addElement(fNumericSet, status);
2420 fSets->addElement(fFormatSet, status);
2421 fSets->addElement(fExtendSet, status);
2422 fSets->addElement(fOtherSet, status);
2423 fSets->addElement(fExtendNumLetSet, status);
2424
2425 fSets->addElement(fEBaseSet, status);
2426 fSets->addElement(fEModifierSet, status);
2427 fSets->addElement(fZWSSet, status);
2428 fSets->addElement(fGAZSet, status);
2429
2430 if (U_FAILURE(status)) {
2431 deferredStatus = status;
2432 }
2433 }
2434
2435 void RBBIWordMonkey::setText(const UnicodeString &s) {
2436 fText = &s;
2437 }
2438
2439
2440 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2441 int p0, p1, p2, p3; // Indices of the significant code points around the
2442 // break position being tested. The candidate break
2443 // location is before p2.
2444
2445 int breakPos = -1;
2446
2447 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2448
2449 if (U_FAILURE(deferredStatus)) {
2450 return -1;
2451 }
2452
2453 // Prev break at end of string. return DONE.
2454 if (prevPos >= fText->length()) {
2455 return -1;
2456 }
2457 p0 = p1 = p2 = p3 = prevPos;
2458 c3 = fText->char32At(prevPos);
2459 c0 = c1 = c2 = 0;
2460 (void)p0; // Suppress set but not used warning.
2461
2462 // Loop runs once per "significant" character position in the input text.
2463 for (;;) {
2464 // Move all of the positions forward in the input string.
2465 p0 = p1; c0 = c1;
2466 p1 = p2; c1 = c2;
2467 p2 = p3; c2 = c3;
2468
2469 // Advancd p3 by X(Extend | Format)* Rule 4
2470 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2471 do {
2472 p3 = fText->moveIndex32(p3, 1);
2473 c3 = fText->char32At(p3);
2474 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2475 break;
2476 };
2477 }
2478 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWSSet->contains(c3));
2479
2480
2481 if (p1 == p2) {
2482 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2483 continue;
2484 }
2485 if (p2 == fText->length()) {
2486 // Reached end of string. Always a break position.
2487 break;
2488 }
2489
2490 // Rule (3) CR x LF
2491 // No Extend or Format characters may appear between the CR and LF,
2492 // which requires the additional check for p2 immediately following p1.
2493 //
2494 if (c1==0x0D && c2==0x0A) {
2495 continue;
2496 }
2497
2498 // Rule (3a) Break before and after newlines (including CR and LF)
2499 //
2500 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2501 break;
2502 };
2503 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2504 break;
2505 };
2506
2507 // Rule (3c) ZWJ x GAZ (Glue after ZWJ).
2508 // Not ignoring extend chars, so peek into input text to
2509 // get the potential ZWJ, the character immediately preceding c2.
2510 // Sloppy UChar32 indexing: p2-1 may reference trail half
2511 // but char32At will get the full code point.
2512 if (fZWSSet->contains(fText->char32At(p2-1)) && fGAZSet->contains(c2)) {
2513 continue;
2514 }
2515
2516 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2517 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2518 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2519 continue;
2520 }
2521
2522 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2523 //
2524 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2525 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2526 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2527 continue;
2528 }
2529
2530 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2531 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2532 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2533 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2534 continue;
2535 }
2536
2537 // Rule (7a) Hebrew_Letter x Single_Quote
2538 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2539 continue;
2540 }
2541
2542 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2543 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2544 continue;
2545 }
2546
2547 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2548 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2549 continue;
2550 }
2551
2552 // Rule (8) Numeric x Numeric
2553 if (fNumericSet->contains(c1) &&
2554 fNumericSet->contains(c2)) {
2555 continue;
2556 }
2557
2558 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2559 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2560 fNumericSet->contains(c2)) {
2561 continue;
2562 }
2563
2564 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
2565 if (fNumericSet->contains(c1) &&
2566 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2567 continue;
2568 }
2569
2570 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
2571 if (fNumericSet->contains(c0) &&
2572 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2573 fNumericSet->contains(c2)) {
2574 continue;
2575 }
2576
2577 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2578 if (fNumericSet->contains(c1) &&
2579 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2580 fNumericSet->contains(c3)) {
2581 continue;
2582 }
2583
2584 // Rule (13) Katakana x Katakana
2585 if (fKatakanaSet->contains(c1) &&
2586 fKatakanaSet->contains(c2)) {
2587 continue;
2588 }
2589
2590 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2591 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2592 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2593 fExtendNumLetSet->contains(c2)) {
2594 continue;
2595 }
2596
2597 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2598 if (fExtendNumLetSet->contains(c1) &&
2599 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2600 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2601 continue;
2602 }
2603
2604 // Rule 13c
2605 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2606 break;
2607 }
2608 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2609 continue;
2610 }
2611
2612 // Rule 13d
2613 if ((fEBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEModifierSet->contains(c2)) {
2614 continue;
2615 }
2616
2617 // Rule 14. Break found here.
2618 break;
2619 }
2620
2621 breakPos = p2;
2622 return breakPos;
2623 }
2624
2625
2626 UVector *RBBIWordMonkey::charClasses() {
2627 return fSets;
2628 }
2629
2630
2631 RBBIWordMonkey::~RBBIWordMonkey() {
2632 delete fSets;
2633 delete fCRSet;
2634 delete fLFSet;
2635 delete fNewlineSet;
2636 delete fKatakanaSet;
2637 delete fHebrew_LetterSet;
2638 delete fALetterSet;
2639 delete fSingle_QuoteSet;
2640 delete fDouble_QuoteSet;
2641 delete fMidNumLetSet;
2642 delete fMidLetterSet;
2643 delete fMidNumSet;
2644 delete fNumericSet;
2645 delete fFormatSet;
2646 delete fExtendSet;
2647 delete fExtendNumLetSet;
2648 delete fRegionalIndicatorSet;
2649 delete fDictionaryCjkSet;
2650 delete fOtherSet;
2651 delete fEBaseSet;
2652 delete fEModifierSet;
2653 delete fZWSSet;
2654 delete fGAZSet;
2655 }
2656
2657
2658
2659
2660 //------------------------------------------------------------------------------------------
2661 //
2662 // class RBBISentMonkey Sentence Break specific implementation
2663 // of RBBIMonkeyKind.
2664 //
2665 //------------------------------------------------------------------------------------------
2666 class RBBISentMonkey: public RBBIMonkeyKind {
2667 public:
2668 RBBISentMonkey();
2669 virtual ~RBBISentMonkey();
2670 virtual UVector *charClasses();
2671 virtual void setText(const UnicodeString &s);
2672 virtual int32_t next(int32_t i);
2673 private:
2674 int moveBack(int posFrom);
2675 int moveForward(int posFrom);
2676 UChar32 cAt(int pos);
2677
2678 UVector *fSets;
2679
2680 UnicodeSet *fSepSet;
2681 UnicodeSet *fFormatSet;
2682 UnicodeSet *fSpSet;
2683 UnicodeSet *fLowerSet;
2684 UnicodeSet *fUpperSet;
2685 UnicodeSet *fOLetterSet;
2686 UnicodeSet *fNumericSet;
2687 UnicodeSet *fATermSet;
2688 UnicodeSet *fSContinueSet;
2689 UnicodeSet *fSTermSet;
2690 UnicodeSet *fCloseSet;
2691 UnicodeSet *fOtherSet;
2692 UnicodeSet *fExtendSet;
2693
2694 const UnicodeString *fText;
2695
2696 };
2697
2698 RBBISentMonkey::RBBISentMonkey()
2699 {
2700 UErrorCode status = U_ZERO_ERROR;
2701
2702 fSets = new UVector(status);
2703
2704 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2705 // set and made into character classes of their own. For the monkey impl,
2706 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2707 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2708 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2709 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2710 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2711 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2712 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2713 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2714 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2715 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2716 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2717 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2718 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2719 fOtherSet = new UnicodeSet();
2720
2721 if(U_FAILURE(status)) {
2722 deferredStatus = status;
2723 return;
2724 }
2725
2726 fOtherSet->complement();
2727 fOtherSet->removeAll(*fSepSet);
2728 fOtherSet->removeAll(*fFormatSet);
2729 fOtherSet->removeAll(*fSpSet);
2730 fOtherSet->removeAll(*fLowerSet);
2731 fOtherSet->removeAll(*fUpperSet);
2732 fOtherSet->removeAll(*fOLetterSet);
2733 fOtherSet->removeAll(*fNumericSet);
2734 fOtherSet->removeAll(*fATermSet);
2735 fOtherSet->removeAll(*fSContinueSet);
2736 fOtherSet->removeAll(*fSTermSet);
2737 fOtherSet->removeAll(*fCloseSet);
2738 fOtherSet->removeAll(*fExtendSet);
2739
2740 fSets->addElement(fSepSet, status);
2741 fSets->addElement(fFormatSet, status);
2742 fSets->addElement(fSpSet, status);
2743 fSets->addElement(fLowerSet, status);
2744 fSets->addElement(fUpperSet, status);
2745 fSets->addElement(fOLetterSet, status);
2746 fSets->addElement(fNumericSet, status);
2747 fSets->addElement(fATermSet, status);
2748 fSets->addElement(fSContinueSet, status);
2749 fSets->addElement(fSTermSet, status);
2750 fSets->addElement(fCloseSet, status);
2751 fSets->addElement(fOtherSet, status);
2752 fSets->addElement(fExtendSet, status);
2753
2754 if (U_FAILURE(status)) {
2755 deferredStatus = status;
2756 }
2757 }
2758
2759
2760
2761 void RBBISentMonkey::setText(const UnicodeString &s) {
2762 fText = &s;
2763 }
2764
2765 UVector *RBBISentMonkey::charClasses() {
2766 return fSets;
2767 }
2768
2769
2770 // moveBack() Find the "significant" code point preceding the index i.
2771 // Skips over ($Extend | $Format)* .
2772 //
2773 int RBBISentMonkey::moveBack(int i) {
2774 if (i <= 0) {
2775 return -1;
2776 }
2777 UChar32 c;
2778 int32_t j = i;
2779 do {
2780 j = fText->moveIndex32(j, -1);
2781 c = fText->char32At(j);
2782 }
2783 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2784 return j;
2785
2786 }
2787
2788
2789 int RBBISentMonkey::moveForward(int i) {
2790 if (i>=fText->length()) {
2791 return fText->length();
2792 }
2793 UChar32 c;
2794 int32_t j = i;
2795 do {
2796 j = fText->moveIndex32(j, 1);
2797 c = cAt(j);
2798 }
2799 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2800 return j;
2801 }
2802
2803 UChar32 RBBISentMonkey::cAt(int pos) {
2804 if (pos<0 || pos>=fText->length()) {
2805 return -1;
2806 } else {
2807 return fText->char32At(pos);
2808 }
2809 }
2810
// RBBISentMonkey::next
//   Reference (monkey test) computation of the next sentence boundary.
//   Starting from prevPos, slides a four-position window (p0..p3 / c0..c3) forward over
//   the text in fText, one "significant" position per loop iteration, and applies the
//   rules coded below in order. A rule that forbids a break does `continue`; a rule
//   that finds a break does `break`, and the boundary returned is p2.
//
//   @param prevPos  index in fText of the previous boundary; scanning starts here.
//   @return the index of the boundary found (the value of p2 when the loop exits), or -1
//           when deferredStatus holds a failure or prevPos is at/after the end of the text.
2811 int32_t RBBISentMonkey::next(int32_t prevPos) {
2812 int p0, p1, p2, p3; // Indices of the significant code points around the
2813 // break position being tested. The candidate break
2814 // location is before p2.
2815
2816 int breakPos = -1;
2817
2818 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2819 UChar32 c;
2820
2821 if (U_FAILURE(deferredStatus)) {
2822 return -1;
2823 }
2824
2825 // Prev break at end of string. return DONE.
2826 if (prevPos >= fText->length()) {
2827 return -1;
2828 }
// Seed the window: only p3/c3 hold real data; the first loop iterations shift it
// down into p2 and then p1 before any rule can report a break.
2829 p0 = p1 = p2 = p3 = prevPos;
2830 c3 = fText->char32At(prevPos);
2831 c0 = c1 = c2 = 0;
2832 (void)p0; // Suppress set but not used warning.
2833
2834 // Loop runs once per "significant" character position in the input text.
2835 for (;;) {
2836 // Move all of the positions forward in the input string.
2837 p0 = p1; c0 = c1;
2838 p1 = p2; c1 = c2;
2839 p2 = p3; c2 = c3;
2840
2841 // Advance p3 by X(Extend | Format)* Rule 4
2842 p3 = moveForward(p3);
2843 c3 = cAt(p3);
2844
2845 // Rule (3) CR x LF
2846 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2847 continue;
2848 }
2849
2850 // Rule (4). Sep <break>
2851 if (fSepSet->contains(c1)) {
2852 p2 = p1+1; // Separators don't combine with Extend or Format.
2853 break;
2854 }
2855
2856 if (p2 >= fText->length()) {
2857 // Reached end of string. Always a break position.
2858 break;
2859 }
2860
2861 if (p2 == prevPos) {
2862 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2863 continue;
2864 }
2865
2866 // Rule (6). ATerm x Numeric
2867 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2868 continue;
2869 }
2870
2871 // Rule (7). (Upper | Lower) ATerm x Upper
2872 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2873 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2874 continue;
2875 }
2876
2877 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2878 // Note: STerm | ATerm are added to the negated part of the expression by a
2879 // note to the Unicode 5.0 documents.
// Scan backward from p1 over Sp* and then Close*; if that lands on an ATerm, scan
// forward from p2 to the first char in one of the listed sets (or the end of text,
// where cAt() yields -1) and suppress the break when that char is Lower.
2880 int p8 = p1;
2881 while (fSpSet->contains(cAt(p8))) {
2882 p8 = moveBack(p8);
2883 }
2884 while (fCloseSet->contains(cAt(p8))) {
2885 p8 = moveBack(p8);
2886 }
2887 if (fATermSet->contains(cAt(p8))) {
2888 p8=p2;
2889 for (;;) {
2890 c = cAt(p8);
2891 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2892 fLowerSet->contains(c) || fSepSet->contains(c) ||
2893 fATermSet->contains(c) || fSTermSet->contains(c)) {
2894 break;
2895 }
2896 p8 = moveForward(p8);
2897 }
2898 if (fLowerSet->contains(cAt(p8))) {
2899 continue;
2900 }
2901 }
2902
2903 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2904 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2905 p8 = p1;
2906 while (fSpSet->contains(cAt(p8))) {
2907 p8 = moveBack(p8);
2908 }
2909 while (fCloseSet->contains(cAt(p8))) {
2910 p8 = moveBack(p8);
2911 }
2912 c = cAt(p8);
2913 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2914 continue;
2915 }
2916 }
2917
2918 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2919 int p9 = p1;
2920 while (fCloseSet->contains(cAt(p9))) {
2921 p9 = moveBack(p9);
2922 }
2923 c = cAt(p9);
2924 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2925 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2926 continue;
2927 }
2928 }
2929
2930 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2931 int p10 = p1;
2932 while (fSpSet->contains(cAt(p10))) {
2933 p10 = moveBack(p10);
2934 }
2935 while (fCloseSet->contains(cAt(p10))) {
2936 p10 = moveBack(p10);
2937 }
2938 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2939 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2940 continue;
2941 }
2942 }
2943
2944 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
// At most one trailing separator is stepped over before the Sp* / Close* scans.
2945 int p11 = p1;
2946 if (fSepSet->contains(cAt(p11))) {
2947 p11 = moveBack(p11);
2948 }
2949 while (fSpSet->contains(cAt(p11))) {
2950 p11 = moveBack(p11);
2951 }
2952 while (fCloseSet->contains(cAt(p11))) {
2953 p11 = moveBack(p11);
2954 }
2955 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2956 break;
2957 }
2958
2959 // Rule (12) Any x Any
2960 continue;
2961 }
// The loop only exits through a `break`, with p2 at the boundary.
2962 breakPos = p2;
2963 return breakPos;
2964 }
2965
2966 RBBISentMonkey::~RBBISentMonkey() {
2967 delete fSets;
2968 delete fSepSet;
2969 delete fFormatSet;
2970 delete fSpSet;
2971 delete fLowerSet;
2972 delete fUpperSet;
2973 delete fOLetterSet;
2974 delete fNumericSet;
2975 delete fATermSet;
2976 delete fSContinueSet;
2977 delete fSTermSet;
2978 delete fCloseSet;
2979 delete fOtherSet;
2980 delete fExtendSet;
2981 }
2982
2983
2984
2985 //-------------------------------------------------------------------------------------------
2986 //
2987 // RBBILineMonkey
2988 //
2989 //-------------------------------------------------------------------------------------------
2990
2991 class RBBILineMonkey: public RBBIMonkeyKind {
2992 public:
2993 RBBILineMonkey();
2994 virtual ~RBBILineMonkey();
2995 virtual UVector *charClasses();
2996 virtual void setText(const UnicodeString &s);
2997 virtual int32_t next(int32_t i);
2998 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2999 private:
3000 UVector *fSets;
3001
3002 UnicodeSet *fBK;
3003 UnicodeSet *fCR;
3004 UnicodeSet *fLF;
3005 UnicodeSet *fCM;
3006 UnicodeSet *fNL;
3007 UnicodeSet *fSG;
3008 UnicodeSet *fWJ;
3009 UnicodeSet *fZW;
3010 UnicodeSet *fGL;
3011 UnicodeSet *fCB;
3012 UnicodeSet *fSP;
3013 UnicodeSet *fB2;
3014 UnicodeSet *fBA;
3015 UnicodeSet *fBB;
3016 UnicodeSet *fHY;
3017 UnicodeSet *fH2;
3018 UnicodeSet *fH3;
3019 UnicodeSet *fCL;
3020 UnicodeSet *fCP;
3021 UnicodeSet *fEX;
3022 UnicodeSet *fIN;
3023 UnicodeSet *fJL;
3024 UnicodeSet *fJV;
3025 UnicodeSet *fJT;
3026 UnicodeSet *fNS;
3027 UnicodeSet *fOP;
3028 UnicodeSet *fQU;
3029 UnicodeSet *fIS;
3030 UnicodeSet *fNU;
3031 UnicodeSet *fPO;
3032 UnicodeSet *fPR;
3033 UnicodeSet *fSY;
3034 UnicodeSet *fAI;
3035 UnicodeSet *fAL;
3036 UnicodeSet *fCJ;
3037 UnicodeSet *fHL;
3038 UnicodeSet *fID;
3039 UnicodeSet *fRI;
3040 UnicodeSet *fXX;
3041 UnicodeSet *fEB;
3042 UnicodeSet *fEM;
3043 UnicodeSet *fZJ;
3044
3045 BreakIterator *fCharBI;
3046 const UnicodeString *fText;
3047 RegexMatcher *fNumberMatcher;
3048 };
3049
3050 RBBILineMonkey::RBBILineMonkey() :
3051 RBBIMonkeyKind(),
3052 fSets(NULL),
3053
3054 fCharBI(NULL),
3055 fText(NULL),
3056 fNumberMatcher(NULL)
3057
3058 {
3059 if (U_FAILURE(deferredStatus)) {
3060 return;
3061 }
3062
3063 UErrorCode status = U_ZERO_ERROR;
3064
3065 fSets = new UVector(status);
3066
3067 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3068 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3069 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3070 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3071 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3072 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3073 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3074 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3075 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3076 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3077 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3078 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3079 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3080 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3081 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3082 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3083 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3084 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
3085 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3086 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3087 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3088 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3089 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3090 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3091 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3092 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3093 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3094 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3095 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3096 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3097 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3098 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3099 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3100 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
3101 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
3102 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3103 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
3104 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3105 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3106 fEB = new UnicodeSet(UnicodeString(
3107 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C2-\\U0001F3C4\\U0001F3C7\\U0001F3CA-\\U0001F3CC"
3108 "\\U0001F442-\\U0001F443\\U0001F446-\\U0001F450\\U0001F466-\\U0001F478\\U0001F47C"
3109 "\\U0001F481-\\U0001F483\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F574-\\U0001F575\\U0001F57A\\U0001F590\\U0001F595-\\U0001F596"
3110 "\\U0001F645-\\U0001F647\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F6CC"
3111 "\\U0001F918-\\U0001F91E\\U0001F926\\U0001F930\\U0001F933-\\U0001F939\\U0001F93C-\\U0001F93E]"), status);
3112 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
3113 fZJ = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);
3114
3115 if (U_FAILURE(status)) {
3116 deferredStatus = status;
3117 return;
3118 }
3119
3120 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
3121 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
3122 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
3123
3124 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
3125
3126 fID->addAll(*fEB); // Emoji Base and Emoji Modifier behave as ID.
3127 fID->addAll(*fEM);
3128 fAL->removeAll(*fEM);
3129
3130
3131 fAL->remove((UChar32)0x2695); // move u2695 from Al to Id
3132 fAL->remove((UChar32)0x2696); // move u2696 from Al to Id
3133 fAL->remove((UChar32)0x2764); // Emoji Proposal: move u2764 from Al to Id
3134 fAI->remove((UChar32)0x2640); // new ZWJ seqs
3135 fAI->remove((UChar32)0x2642); // new ZWJ seqs
3136 fID->add((UChar32)0x2695);
3137 fID->add((UChar32)0x2696);
3138 fID->add((UChar32)0x2764);
3139 fID->add((UChar32)0x2640);
3140 fID->add((UChar32)0x2642);
3141
3142 fSets->addElement(fBK, status);
3143 fSets->addElement(fCR, status);
3144 fSets->addElement(fLF, status);
3145 fSets->addElement(fCM, status);
3146 fSets->addElement(fNL, status);
3147 fSets->addElement(fWJ, status);
3148 fSets->addElement(fZW, status);
3149 fSets->addElement(fGL, status);
3150 fSets->addElement(fCB, status);
3151 fSets->addElement(fSP, status);
3152 fSets->addElement(fB2, status);
3153 fSets->addElement(fBA, status);
3154 fSets->addElement(fBB, status);
3155 fSets->addElement(fHY, status);
3156 fSets->addElement(fH2, status);
3157 fSets->addElement(fH3, status);
3158 fSets->addElement(fCL, status);
3159 fSets->addElement(fCP, status);
3160 fSets->addElement(fEX, status);
3161 fSets->addElement(fIN, status);
3162 fSets->addElement(fJL, status);
3163 fSets->addElement(fJT, status);
3164 fSets->addElement(fJV, status);
3165 fSets->addElement(fNS, status);
3166 fSets->addElement(fOP, status);
3167 fSets->addElement(fQU, status);
3168 fSets->addElement(fIS, status);
3169 fSets->addElement(fNU, status);
3170 fSets->addElement(fPO, status);
3171 fSets->addElement(fPR, status);
3172 fSets->addElement(fSY, status);
3173 fSets->addElement(fAI, status);
3174 fSets->addElement(fAL, status);
3175 fSets->addElement(fHL, status);
3176 fSets->addElement(fID, status);
3177 fSets->addElement(fWJ, status);
3178 fSets->addElement(fRI, status);
3179 fSets->addElement(fSG, status);
3180 fSets->addElement(fEB, status);
3181 fSets->addElement(fEM, status);
3182 fSets->addElement(fZJ, status);
3183
3184 const char *rules =
3185 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3186 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3187 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3188 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3189 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3190 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3191
3192 fNumberMatcher = new RegexMatcher(
3193 UnicodeString(rules, -1, US_INV), 0, status);
3194
3195 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3196
3197 if (U_FAILURE(status)) {
3198 deferredStatus = status;
3199 }
3200 }
3201
3202
3203 void RBBILineMonkey::setText(const UnicodeString &s) {
3204 fText = &s;
3205 fCharBI->setText(s);
3206 fNumberMatcher->reset(s);
3207 }
3208
3209 //
3210 // rule9Adjust
3211 // Line Break TR rules 9 and 10 implementation.
3212 // This deals with combining marks and other sequences that
3213 // that must be treated as if they were something other than what they actually are.
3214 //
3215 // This is factored out into a separate function because it must be applied twice for
3216 // each potential break, once to the chars before the position being checked, then
3217 // again to the text following the possible break.
3218 //
3219 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3220 if (pos == -1) {
3221 // Invalid initial position. Happens during the warmup iteration of the
3222 // main loop in next().
3223 return;
3224 }
3225
3226 int32_t nPos = *nextPos;
3227
3228 // LB 9 Keep combining sequences together.
3229 // advance over any CM class chars. Note that Line Break CM is different
3230 // from the normal Grapheme Extend property.
3231 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3232 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3233 for (;;) {
3234 *nextChar = fText->char32At(nPos);
3235 if (!fCM->contains(*nextChar)) {
3236 break;
3237 }
3238 nPos = fText->moveIndex32(nPos, 1);
3239 }
3240 }
3241
3242
3243 // LB 9 Treat X CM* as if it were x.
3244 // No explicit action required.
3245
3246 // LB 10 Treat any remaining combining mark as AL
3247 if (fCM->contains(*posChar)) {
3248 *posChar = 0x41; // thisChar = 'A';
3249 }
3250
3251 // Push the updated nextPos and nextChar back to our caller.
3252 // This only makes a difference if posChar got bigger by consuming a
3253 // combining sequence.
3254 *nextPos = nPos;
3255 *nextChar = fText->char32At(nPos);
3256 }
3257
3258
3259
// RBBILineMonkey::next
//   Reference (monkey test) computation of the next line break boundary.
//   Starting at startPos, walks the text held in fText one position per loop
//   iteration and applies the LB rules coded below, in order. A rule that forbids
//   a break between prevChar and thisChar does `continue`; a rule that finds a
//   break does `break`, and the boundary returned is pos.
//
//   @param startPos  index in fText at which scanning starts.
//   @return the value of pos when the loop exits, or -1 when deferredStatus holds a
//           failure or startPos is at/after the end of the text.
3260 int32_t RBBILineMonkey::next(int32_t startPos) {
3261 UErrorCode status = U_ZERO_ERROR;
3262 int32_t pos; // Index of the char following a potential break position
3263 UChar32 thisChar; // Character at above position "pos"
3264
3265 int32_t prevPos; // Index of the char preceding a potential break position
3266 UChar32 prevChar; // Character at above position. Note that prevChar
3267 // and thisChar may not be adjacent because combining
3268 // characters between them will be ignored.
3269
3270 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
3271 UChar32 prevCharX2;
3272
3273 int32_t nextPos; // Index of the next character following pos.
3274 // Usually skips over combining marks.
3275 int32_t nextCPPos; // Index of the code point following "pos."
3276 // May point to a combining mark.
3277 int32_t tPos; // temp value.
3278 UChar32 c;
3279
3280 if (U_FAILURE(deferredStatus)) {
3281 return -1;
3282 }
3283
3284 if (startPos >= fText->length()) {
3285 return -1;
3286 }
3287
3288
3289 // Initial values for loop. Loop will run the first time without finding breaks,
3290 // while the invalid values shift out and the "this" and
3291 // "prev" positions are filled in with good values.
3292 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
3293 thisChar = prevChar = prevCharX2 = 0;
3294 nextPos = nextCPPos = startPos;
3295
3296
3297 // Loop runs once per position in the test text, until a break position
3298 // is found.
3299 for (;;) {
3300 prevPosX2 = prevPos;
3301 prevCharX2 = prevChar;
3302
3303 prevPos = pos;
3304 prevChar = thisChar;
3305
3306 pos = nextPos;
3307 thisChar = fText->char32At(pos);
3308
3309 nextCPPos = fText->moveIndex32(pos, 1);
3310 nextPos = nextCPPos;
3311
3312 // Rule LB2 - Break at end of text.
3313 if (pos >= fText->length()) {
3314 break;
3315 }
3316
3317 // Rule LB 9 - adjust for combining sequences.
3318 // We do this one out-of-order because the adjustment does not change anything
3319 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3320 // be applied.
// First call: lets prevChar absorb following CM chars, which may move pos/thisChar forward.
// Second call: does the same for thisChar, which may move nextPos forward.
3321 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
3322 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3323 c = fText->char32At(nextPos);
3324 rule9Adjust(pos, &thisChar, &nextPos, &c);
3325
3326 // If the loop is still warming up - if we haven't shifted the initial
3327 // -1 positions out of prevPos yet - loop back to advance the
3328 // position in the input without any further looking for breaks.
3329 if (prevPos == -1) {
3330 continue;
3331 }
3332
3333 // LB 4 Always break after hard line breaks,
3334 if (fBK->contains(prevChar)) {
3335 break;
3336 }
3337
3338 // LB 5 Break after CR, LF, NL, but not inside CR LF
3339 if (prevChar == 0x0d && thisChar == 0x0a) {
3340 continue;
3341 }
3342 if (prevChar == 0x0d ||
3343 prevChar == 0x0a ||
3344 prevChar == 0x85) {
3345 break;
3346 }
3347
3348 // LB 6 Don't break before hard line breaks
3349 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3350 fBK->contains(thisChar)) {
3351 continue;
3352 }
3353
3354
3355 // LB 7 Don't break before spaces or zero-width space.
3356 if (fSP->contains(thisChar)) {
3357 continue;
3358 }
3359
3360 if (fZW->contains(thisChar)) {
3361 continue;
3362 }
3363
3364 // LB 8 Break after zero width space
3365 if (fZW->contains(prevChar)) {
3366 break;
3367 }
3368
3369 // LB 8a ZJ x ID
3370 // The monkey test's way of ignoring combining characters doesn't work
3371 // for this rule. ZJ is also a CM. Need to get the actual character
3372 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
3373 {
3374 int32_t prevIdx = fText->moveIndex32(pos, -1);
3375 UChar32 prevC = fText->char32At(prevIdx);
3376 if (fZJ->contains(prevC) && fID->contains(thisChar)) {
3377 continue;
3378 }
3379 }
3380
3381 // LB 9, 10 Already done, at top of loop.
3382 //
3383
3384
3385 // LB 11 Do not break before or after WORD JOINER and related characters.
3386 // x WJ
3387 // WJ x
3388 //
3389 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3390 continue;
3391 }
3392
3393 // LB 12
3394 // GL x
3395 if (fGL->contains(prevChar)) {
3396 continue;
3397 }
3398
3399 // LB 12a
3400 // [^SP BA HY] x GL
3401 if (!(fSP->contains(prevChar) ||
3402 fBA->contains(prevChar) ||
3403 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3404 continue;
3405 }
3406
3407
3408
3409 // LB 13 Don't break before closings.
3410 // NU x CL, NU x CP and NU x IS are not matched here so that they will
3411 // fall into LB 17 and the more general number regular expression.
3412 //
// NOTE(review): in this function the number regular expression (fNumberMatcher)
// is applied under the "LB 25" heading further down.
3413 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3414 (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3415 fEX->contains(thisChar) ||
3416 (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3417 (!fNU->contains(prevChar) && fSY->contains(thisChar))) {
3418 continue;
3419 }
3420
3421 // LB 14 Don't break after OP SP*
3422 // Scan backwards, checking for this sequence.
3423 // The OP char could include combining marks, so we actually check for
3424 // OP CM* SP*
3425 // Another Twist: The Rule 67 fixes may have changed a SP CM
3426 // sequence into a ID char, so before scanning back through spaces,
3427 // verify that prevChar is indeed a space. The prevChar variable
3428 // may differ from fText[prevPos]
3429 tPos = prevPos;
3430 if (fSP->contains(prevChar)) {
3431 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3432 tPos=fText->moveIndex32(tPos, -1);
3433 }
3434 }
3435 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3436 tPos=fText->moveIndex32(tPos, -1);
3437 }
3438 if (fOP->contains(fText->char32At(tPos))) {
3439 continue;
3440 }
3441
3442
3443 // LB 15 QU SP* x OP
3444 if (fOP->contains(thisChar)) {
3445 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
// NOTE: this local tPos shadows the function-level tPos declared at the top.
3446 int tPos = prevPos;
3447 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3448 tPos = fText->moveIndex32(tPos, -1);
3449 }
3450 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3451 tPos = fText->moveIndex32(tPos, -1);
3452 }
3453 if (fQU->contains(fText->char32At(tPos))) {
3454 continue;
3455 }
3456 }
3457
3458
3459
3460 // LB 16 (CL | CP) SP* x NS
3461 // Scan backwards for SP* CM* (CL | CP)
3462 if (fNS->contains(thisChar)) {
// NOTE: this local tPos shadows the function-level tPos declared at the top.
3463 int tPos = prevPos;
3464 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3465 tPos = fText->moveIndex32(tPos, -1);
3466 }
3467 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3468 tPos = fText->moveIndex32(tPos, -1);
3469 }
3470 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3471 continue;
3472 }
3473 }
3474
3475
3476 // LB 17 B2 SP* x B2
3477 if (fB2->contains(thisChar)) {
3478 // Scan backwards, checking for the B2 CM* SP* sequence.
3479 tPos = prevPos;
3480 if (fSP->contains(prevChar)) {
3481 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3482 tPos=fText->moveIndex32(tPos, -1);
3483 }
3484 }
3485 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3486 tPos=fText->moveIndex32(tPos, -1);
3487 }
3488 if (fB2->contains(fText->char32At(tPos))) {
3489 continue;
3490 }
3491 }
3492
3493
3494 // LB 18 break after space
3495 if (fSP->contains(prevChar)) {
3496 break;
3497 }
3498
3499 // LB 19
3500 // x QU
3501 // QU x
3502 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3503 continue;
3504 }
3505
3506 // LB 20 Break around a CB
3507 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3508 break;
3509 }
3510
3511 // LB 21
3512 if (fBA->contains(thisChar) ||
3513 fHY->contains(thisChar) ||
3514 fNS->contains(thisChar) ||
3515 fBB->contains(prevChar) ) {
3516 continue;
3517 }
3518
3519 // LB 21a
3520 // HL (HY | BA) x
3521 if (fHL->contains(prevCharX2) &&
3522 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3523 continue;
3524 }
3525
3526 // LB 21b
3527 // SY x HL
3528 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3529 continue;
3530 }
3531
3532 // LB 22
3533 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3534 (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3535 (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3536 (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3537 (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3538 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
3539 continue;
3540 }
3541
3542
3543 // LB 23 ID x PO
3544 // AL x NU
3545 // HL x NU
3546 // NU x AL
3547 if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3548 (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3549 (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3550 (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3551 (fNU->contains(prevChar) && fHL->contains(thisChar)) ) {
3552 continue;
3553 }
3554
3555 // LB 24 Do not break between prefix and letters or ideographs.
3556 // PR x ID
3557 // PR x (AL | HL)
3558 // PO x (AL | HL)
3559 // (AL | HL) x PR // Apple early addition
3560 // (AL | HL) x PO // Apple early addition
3561 if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3562 (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3563 (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3564 ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fPR->contains(thisChar)) ||
3565 ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fPO->contains(thisChar)) ) {
3566 continue;
3567 }
3568
3569
3570
3571 // LB 25 Numbers
// Try to match the number pattern starting at prevPos. An error reported through
// status ends the scan: the loop is left and the current pos is returned.
3572 if (fNumberMatcher->lookingAt(prevPos, status)) {
3573 if (U_FAILURE(status)) {
3574 break;
3575 }
3576 // Matched a number. But could have been just a single digit, which would
3577 // not represent a "no break here" between prevChar and thisChar
3578 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3579 if (numEndIdx > pos) {
3580 // Number match includes at least our two chars being checked
3581 if (numEndIdx > nextPos) {
3582 // Number match includes additional chars. Update pos and nextPos
3583 // so that next loop iteration will continue at the end of the number,
3584 // checking for breaks between last char in number & whatever follows.
3585 pos = nextPos = numEndIdx;
// Back pos up to the last non-CM character inside the matched number.
3586 do {
3587 pos = fText->moveIndex32(pos, -1);
3588 thisChar = fText->char32At(pos);
3589 } while (fCM->contains(thisChar));
3590 }
3591 continue;
3592 }
3593 }
3594
3595
3596 // LB 26 Do not break a Korean syllable.
3597 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3598 fJV->contains(thisChar) ||
3599 fH2->contains(thisChar) ||
3600 fH3->contains(thisChar))) {
3601 continue;
3602 }
3603
3604 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3605 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3606 continue;
3607 }
3608
3609 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3610 fJT->contains(thisChar)) {
3611 continue;
3612 }
3613
3614 // LB 27 Treat a Korean Syllable Block the same as ID.
3615 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3616 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3617 fIN->contains(thisChar)) {
3618 continue;
3619 }
3620 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3621 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3622 fPO->contains(thisChar)) {
3623 continue;
3624 }
3625 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3626 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3627 continue;
3628 }
3629
3630
3631
3632 // LB 28 Do not break between alphabetics ("at").
3633 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3634 continue;
3635 }
3636
3637 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3638 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3639 continue;
3640 }
3641
3642 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3643 // (AL | NU) x OP
3644 // CP x (AL | NU)
3645 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3646 continue;
3647 }
3648 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3649 continue;
3650 }
3651
3652 // LB30a RI RI <break> RI
3653 // RI x RI
3654 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3655 break;
3656 }
3657 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3658 continue;
3659 }
3660
3661 // LB30b Emoji Base x Emoji Modifier
3662 if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3663 continue;
3664 }
3665
3666 // LB 31 Break everywhere else
3667 break;
3668
3669 }
3670
3671 return pos;
3672 }
3673
3674
3675 UVector *RBBILineMonkey::charClasses() {
3676 return fSets;
3677 }
3678
3679
3680 RBBILineMonkey::~RBBILineMonkey() {
3681 delete fSets;
3682
3683 delete fBK;
3684 delete fCR;
3685 delete fLF;
3686 delete fCM;
3687 delete fNL;
3688 delete fWJ;
3689 delete fZW;
3690 delete fGL;
3691 delete fCB;
3692 delete fSP;
3693 delete fB2;
3694 delete fBA;
3695 delete fBB;
3696 delete fHY;
3697 delete fH2;
3698 delete fH3;
3699 delete fCL;
3700 delete fCP;
3701 delete fEX;
3702 delete fIN;
3703 delete fJL;
3704 delete fJV;
3705 delete fJT;
3706 delete fNS;
3707 delete fOP;
3708 delete fQU;
3709 delete fIS;
3710 delete fNU;
3711 delete fPO;
3712 delete fPR;
3713 delete fSY;
3714 delete fAI;
3715 delete fAL;
3716 delete fCJ;
3717 delete fHL;
3718 delete fID;
3719 delete fRI;
3720 delete fSG;
3721 delete fXX;
3722 delete fEB;
3723 delete fEM;
3724 delete fZJ;
3725
3726 delete fCharBI;
3727 delete fNumberMatcher;
3728 }
3729
3730
3731 //-------------------------------------------------------------------------------------------
3732 //
3733 // TestMonkey
3734 //
3735 // params
3736 // seed=nnnnn Random number starting seed.
3737 // Setting the seed allows errors to be reproduced.
3738 // loop=nnn Looping count. Controls running time.
3739 // -1: run forever.
3740 // 0 or greater: run length.
3741 //
3742 // type = char | word | line | sent | title
3743 //
3744 // Example:
3745 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3746 //
3747 //-------------------------------------------------------------------------------------------
3748
3749 static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3750 int32_t val = defaultVal;
3751 name.append(" *= *(-?\\d+)");
3752 UErrorCode status = U_ZERO_ERROR;
3753 RegexMatcher m(name, params, 0, status);
3754 if (m.find()) {
3755 // The param exists. Convert the string to an int.
3756 char valString[100];
3757 int32_t paramLength = m.end(1, status) - m.start(1, status);
3758 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3759 paramLength = (int32_t)(sizeof(valString)-2);
3760 }
3761 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3762 val = strtol(valString, NULL, 10);
3763
3764 // Delete this parameter from the params string.
3765 m.reset();
3766 params = m.replaceFirst("", status);
3767 }
3768 U_ASSERT(U_SUCCESS(status));
3769 return val;
3770 }
3771 #endif
3772
3773 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3774 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3775 BreakIterator *bi,
3776 int expected[],
3777 int expectedcount)
3778 {
3779 int count = 0;
3780 int i = 0;
3781 int forward[50];
3782 bi->setText(ustr);
3783 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3784 forward[count] = i;
3785 if (count < expectedcount && expected[count] != i) {
3786 test->errln("break forward test failed: expected %d but got %d",
3787 expected[count], i);
3788 break;
3789 }
3790 count ++;
3791 }
3792 if (count != expectedcount) {
3793 printStringBreaks(ustr, expected, expectedcount);
3794 test->errln("break forward test failed: missed %d match",
3795 expectedcount - count);
3796 return;
3797 }
3798 // testing boundaries
3799 for (i = 1; i < expectedcount; i ++) {
3800 int j = expected[i - 1];
3801 if (!bi->isBoundary(j)) {
3802 printStringBreaks(ustr, expected, expectedcount);
3803 test->errln("isBoundary() failed. Expected boundary at position %d", j);
3804 return;
3805 }
3806 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3807 if (bi->isBoundary(j)) {
3808 printStringBreaks(ustr, expected, expectedcount);
3809 test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
3810 return;
3811 }
3812 }
3813 }
3814
3815 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3816 count --;
3817 if (forward[count] != i) {
3818 printStringBreaks(ustr, expected, expectedcount);
3819 test->errln("happy break test previous() failed: expected %d but got %d",
3820 forward[count], i);
3821 break;
3822 }
3823 }
3824 if (count != 0) {
3825 printStringBreaks(ustr, expected, expectedcount);
3826 test->errln("break test previous() failed: missed a match");
3827 return;
3828 }
3829
3830 // testing preceding
3831 for (i = 0; i < expectedcount - 1; i ++) {
3832 // int j = expected[i] + 1;
3833 int j = ustr.moveIndex32(expected[i], 1);
3834 for (; j <= expected[i + 1]; j ++) {
3835 if (bi->preceding(j) != expected[i]) {
3836 printStringBreaks(ustr, expected, expectedcount);
3837 test->errln("preceding(): Not expecting boundary at position %d", j);
3838 return;
3839 }
3840 }
3841 }
3842 }
3843 #endif
3844
3845 void RBBITest::TestWordBreaks(void)
3846 {
3847 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3848
3849 Locale locale("en");
3850 UErrorCode status = U_ZERO_ERROR;
3851 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3852 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3853 // Replaced any C+J characters in a row with a random sequence of characters
3854 // of the same length to make our C+J segmentation not get in the way.
3855 static const char *strlist[] =
3856 {
3857 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3858 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3859 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3860 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3861 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3862 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3863 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3864 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3865 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3866 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3867 "\\u2027\\U000e0067\\u0a47\\u00b7",
3868 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3869 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3870 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3871 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3872 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3873 "\\u0027\\u11af\\U000e0057\\u0602",
3874 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3875 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3876 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3877 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3878 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3879 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3880 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3881 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3882 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3883 "\\u18f4\\U000e0049\\u20e7\\u2027",
3884 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3885 "\\ua183\\u102d\\u0bec\\u003a",
3886 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3887 "\\u003a\\u0e57\\u0fad\\u002e",
3888 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3889 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3890 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3891 "\\u003a\\u0664\\u00b7\\u1fba",
3892 "\\u003b\\u0027\\u00b7\\u47a3",
3893 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3894 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3895 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3896 };
3897 int loop;
3898 if (U_FAILURE(status)) {
3899 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3900 return;
3901 }
3902 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3903 // printf("looping %d\n", loop);
3904 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3905 // RBBICharMonkey monkey;
3906 RBBIWordMonkey monkey;
3907
3908 int expected[50];
3909 int expectedcount = 0;
3910
3911 monkey.setText(ustr);
3912 int i;
3913 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3914 expected[expectedcount ++] = i;
3915 }
3916
3917 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3918 }
3919 delete bi;
3920 #endif
3921 }
3922
3923 void RBBITest::TestWordBoundary(void)
3924 {
3925 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3926 Locale locale("en");
3927 UErrorCode status = U_ZERO_ERROR;
3928 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3929 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3930 UChar str[50];
3931 static const char *strlist[] =
3932 {
3933 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3934 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3935 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3936 "\\u2027\\U000e0067\\u0a47\\u00b7",
3937 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3938 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3939 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3940 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3941 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3942 "\\u0027\\u11af\\U000e0057\\u0602",
3943 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3944 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3945 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3946 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3947 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3948 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3949 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3950 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3951 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3952 "\\u58f4\\U000e0049\\u20e7\\u2027",
3953 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3954 "\\ua183\\u102d\\u0bec\\u003a",
3955 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3956 "\\u003a\\u0e57\\u0fad\\u002e",
3957 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3958 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3959 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3960 "\\u003a\\u0664\\u00b7\\u1fba",
3961 "\\u003b\\u0027\\u00b7\\u47a3",
3962 };
3963 int loop;
3964 if (U_FAILURE(status)) {
3965 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3966 return;
3967 }
3968 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3969 // printf("looping %d\n", loop);
3970 u_unescape(strlist[loop], str, 20);
3971 UnicodeString ustr(str);
3972 int forward[50];
3973 int count = 0;
3974
3975 bi->setText(ustr);
3976 int prev = 0;
3977 int i;
3978 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3979 forward[count ++] = i;
3980 if (i > prev) {
3981 int j;
3982 for (j = prev + 1; j < i; j ++) {
3983 if (bi->isBoundary(j)) {
3984 printStringBreaks(ustr, forward, count);
3985 errln("happy boundary test failed: expected %d not a boundary",
3986 j);
3987 return;
3988 }
3989 }
3990 }
3991 if (!bi->isBoundary(i)) {
3992 printStringBreaks(ustr, forward, count);
3993 errln("happy boundary test failed: expected %d a boundary",
3994 i);
3995 return;
3996 }
3997 prev = i;
3998 }
3999 }
4000 delete bi;
4001 }
4002
4003 void RBBITest::TestLineBreaks(void)
4004 {
4005 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4006 Locale locale("en");
4007 UErrorCode status = U_ZERO_ERROR;
4008 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4009 const int32_t STRSIZE = 50;
4010 UChar str[STRSIZE];
4011 static const char *strlist[] =
4012 {
4013 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
4014 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
4015 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
4016 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
4017 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
4018 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
4019 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4020 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4021 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4022 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
4023 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4024 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4025 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4026 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4027 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4028 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4029 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4030 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4031 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4032 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4033 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4034 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4035 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4036 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4037 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4038 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4039 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4040 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4041 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4042 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4043 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4044 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4045 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4046 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4047 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4048 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4049 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4050 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4051 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4052 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4053 };
4054 int loop;
4055 TEST_ASSERT_SUCCESS(status);
4056 if (U_FAILURE(status)) {
4057 return;
4058 }
4059 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4060 // printf("looping %d\n", loop);
4061 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4062 if (t >= STRSIZE) {
4063 TEST_ASSERT(FALSE);
4064 continue;
4065 }
4066
4067
4068 UnicodeString ustr(str);
4069 RBBILineMonkey monkey;
4070 if (U_FAILURE(monkey.deferredStatus)) {
4071 continue;
4072 }
4073
4074 const int EXPECTEDSIZE = 50;
4075 int expected[EXPECTEDSIZE];
4076 int expectedcount = 0;
4077
4078 monkey.setText(ustr);
4079 int i;
4080 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4081 if (expectedcount >= EXPECTEDSIZE) {
4082 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4083 return;
4084 }
4085 expected[expectedcount ++] = i;
4086 }
4087
4088 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4089 }
4090 delete bi;
4091 #endif
4092 }
4093
4094 void RBBITest::TestSentBreaks(void)
4095 {
4096 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4097 Locale locale("en");
4098 UErrorCode status = U_ZERO_ERROR;
4099 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4100 UChar str[200];
4101 static const char *strlist[] =
4102 {
4103 "Now\ris\nthe\r\ntime\n\rfor\r\r",
4104 "This\n",
4105 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4106 "\"Sentence ending with a quote.\" Bye.",
4107 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
4108 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4109 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4110 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4111 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4112 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4113 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4114 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4115 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4116 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4117 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4118 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4119 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4120 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4121 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4122 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4123 };
4124 int loop;
4125 if (U_FAILURE(status)) {
4126 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4127 return;
4128 }
4129 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4130 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
4131 UnicodeString ustr(str);
4132
4133 RBBISentMonkey monkey;
4134 if (U_FAILURE(monkey.deferredStatus)) {
4135 continue;
4136 }
4137
4138 const int EXPECTEDSIZE = 50;
4139 int expected[EXPECTEDSIZE];
4140 int expectedcount = 0;
4141
4142 monkey.setText(ustr);
4143 int i;
4144 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4145 if (expectedcount >= EXPECTEDSIZE) {
4146 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4147 return;
4148 }
4149 expected[expectedcount ++] = i;
4150 }
4151
4152 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4153 }
4154 delete bi;
4155 #endif
4156 }
4157
4158 void RBBITest::TestMonkey(char *params) {
4159 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4160
4161 UErrorCode status = U_ZERO_ERROR;
4162 int32_t loopCount = 500;
4163 int32_t seed = 1;
4164 UnicodeString breakType = "all";
4165 Locale locale("en");
4166 UBool useUText = FALSE;
4167
4168 if (quick == FALSE) {
4169 loopCount = 10000;
4170 }
4171
4172 if (params) {
4173 UnicodeString p(params);
4174 loopCount = getIntParam("loop", p, loopCount);
4175 seed = getIntParam("seed", p, seed);
4176
4177 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4178 if (m.find()) {
4179 breakType = m.group(1, status);
4180 m.reset();
4181 p = m.replaceFirst("", status);
4182 }
4183
4184 RegexMatcher u(" *utext", p, 0, status);
4185 if (u.find()) {
4186 useUText = TRUE;
4187 u.reset();
4188 p = u.replaceFirst("", status);
4189 }
4190
4191
4192 // m.reset(p);
4193 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4194 // Each option is stripped out of the option string as it is processed.
4195 // All options have been checked. The option string should have been completely emptied..
4196 char buf[100];
4197 p.extract(buf, sizeof(buf), NULL, status);
4198 buf[sizeof(buf)-1] = 0;
4199 errln("Unrecognized or extra parameter: %s\n", buf);
4200 return;
4201 }
4202
4203 }
4204
4205 if (breakType == "char" || breakType == "all") {
4206 RBBICharMonkey m;
4207 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
4208 if (U_SUCCESS(status)) {
4209 RunMonkey(bi, m, "char", seed, loopCount, useUText);
4210 if (breakType == "all" && useUText==FALSE) {
4211 // Also run a quick test with UText when "all" is specified
4212 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4213 }
4214 }
4215 else {
4216 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4217 }
4218 delete bi;
4219 }
4220
4221 if (breakType == "word" || breakType == "all") {
4222 logln("Word Break Monkey Test");
4223 RBBIWordMonkey m;
4224 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4225 if (U_SUCCESS(status)) {
4226 RunMonkey(bi, m, "word", seed, loopCount, useUText);
4227 }
4228 else {
4229 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4230 }
4231 delete bi;
4232 }
4233
4234 if (breakType == "line" || breakType == "all") {
4235 logln("Line Break Monkey Test");
4236 RBBILineMonkey m;
4237 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4238 if (loopCount >= 10) {
4239 loopCount = loopCount / 5; // Line break runs slower than the others.
4240 }
4241 if (U_SUCCESS(status)) {
4242 RunMonkey(bi, m, "line", seed, loopCount, useUText);
4243 }
4244 else {
4245 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4246 }
4247 delete bi;
4248 }
4249
4250 if (breakType == "sent" || breakType == "all" ) {
4251 logln("Sentence Break Monkey Test");
4252 RBBISentMonkey m;
4253 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4254 if (loopCount >= 10) {
4255 loopCount = loopCount / 10; // Sentence runs slower than the other break types
4256 }
4257 if (U_SUCCESS(status)) {
4258 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4259 }
4260 else {
4261 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4262 }
4263 delete bi;
4264 }
4265
4266 #endif
4267 }
4268
4269 //
4270 // Run a RBBI monkey test. Common routine, for all break iterator types.
4271 // Parameters:
4272 // bi - the break iterator to use
4273 // mk - MonkeyKind, abstraction for obtaining expected results
4274 // name - Name of test (char, word, etc.) for use in error messages
4275 // seed - Seed for starting random number generator (parameter from user)
4276 // numIterations
4277 //
4278 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
4279 int32_t numIterations, UBool useUText) {
4280
4281 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4282
4283 const int32_t TESTSTRINGLEN = 500;
4284 UnicodeString testText;
4285 int32_t numCharClasses;
4286 UVector *chClasses;
4287 int expected[TESTSTRINGLEN*2 + 1];
4288 int expectedCount = 0;
4289 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4290 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4291 char reverseBreaks[TESTSTRINGLEN*2+1];
4292 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4293 char followingBreaks[TESTSTRINGLEN*2+1];
4294 char precedingBreaks[TESTSTRINGLEN*2+1];
4295 int i;
4296 int loopCount = 0;
4297
4298 m_seed = seed;
4299
4300 numCharClasses = mk.charClasses()->size();
4301 chClasses = mk.charClasses();
4302
4303 // Check for errors that occured during the construction of the MonkeyKind object.
4304 // Can't report them where they occured because errln() is a method coming from intlTest,
4305 // and is not visible outside of RBBITest :-(
4306 if (U_FAILURE(mk.deferredStatus)) {
4307 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4308 return;
4309 }
4310
4311 // Verify that the character classes all have at least one member.
4312 for (i=0; i<numCharClasses; i++) {
4313 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4314 if (s == NULL || s->size() == 0) {
4315 errln("Character Class #%d is null or of zero size.", i);
4316 return;
4317 }
4318 }
4319
4320 while (loopCount < numIterations || numIterations == -1) {
4321 if (numIterations == -1 && loopCount % 10 == 0) {
4322 // If test is running in an infinite loop, display a periodic tic so
4323 // we can tell that it is making progress.
4324 fprintf(stderr, ".");
4325 }
4326 // Save current random number seed, so that we can recreate the random numbers
4327 // for this loop iteration in event of an error.
4328 seed = m_seed;
4329
4330 // Populate a test string with data.
4331 testText.truncate(0);
4332 for (i=0; i<TESTSTRINGLEN; i++) {
4333 int32_t aClassNum = m_rand() % numCharClasses;
4334 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4335 int32_t charIdx = m_rand() % classSet->size();
4336 UChar32 c = classSet->charAt(charIdx);
4337 if (c < 0) { // TODO: deal with sets containing strings.
4338 errln("%s:%d c < 0", __FILE__, __LINE__);
4339 break;
4340 }
4341 // Do not assemble a supplementary character from randomly generated separate surrogates.
4342 // (It could be a dictionary character)
4343 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4344 continue;
4345 }
4346
4347 testText.append(c);
4348 }
4349
4350 // Calculate the expected results for this test string.
4351 mk.setText(testText);
4352 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4353 expectedBreaks[0] = 1;
4354 int32_t breakPos = 0;
4355 expectedCount = 0;
4356 for (;;) {
4357 breakPos = mk.next(breakPos);
4358 if (breakPos == -1) {
4359 break;
4360 }
4361 if (breakPos > testText.length()) {
4362 errln("breakPos > testText.length()");
4363 }
4364 expectedBreaks[breakPos] = 1;
4365 U_ASSERT(expectedCount<testText.length());
4366 expected[expectedCount ++] = breakPos;
4367 (void)expected; // Set but not used warning.
4368 // TODO (andy): check it out.
4369 }
4370
4371 // Find the break positions using forward iteration
4372 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4373 if (useUText) {
4374 UErrorCode status = U_ZERO_ERROR;
4375 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4376 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4377 bi->setText(testUText, status);
4378 TEST_ASSERT_SUCCESS(status);
4379 utext_close(testUText); // The break iterator does a shallow clone of the UText
4380 // This UText can be closed immediately, so long as the
4381 // testText string continues to exist.
4382 } else {
4383 bi->setText(testText);
4384 }
4385
4386 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4387 if (i < 0 || i > testText.length()) {
4388 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4389 break;
4390 }
4391 forwardBreaks[i] = 1;
4392 }
4393
4394 // Find the break positions using reverse iteration
4395 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4396 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4397 if (i < 0 || i > testText.length()) {
4398 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4399 break;
4400 }
4401 reverseBreaks[i] = 1;
4402 }
4403
4404 // Find the break positions using isBoundary() tests.
4405 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4406 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4407 for (i=0; i<=testText.length(); i++) {
4408 isBoundaryBreaks[i] = bi->isBoundary(i);
4409 }
4410
4411
4412 // Find the break positions using the following() function.
4413 // printf(".");
4414 memset(followingBreaks, 0, sizeof(followingBreaks));
4415 int32_t lastBreakPos = 0;
4416 followingBreaks[0] = 1;
4417 for (i=0; i<testText.length(); i++) {
4418 breakPos = bi->following(i);
4419 if (breakPos <= i ||
4420 breakPos < lastBreakPos ||
4421 breakPos > testText.length() ||
4422 (breakPos > lastBreakPos && lastBreakPos > i)) {
4423 errln("%s break monkey test: "
4424 "Out of range value returned by BreakIterator::following().\n"
4425 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4426 name, seed, i, breakPos, lastBreakPos);
4427 break;
4428 }
4429 followingBreaks[breakPos] = 1;
4430 lastBreakPos = breakPos;
4431 }
4432
4433 // Find the break positions using the preceding() function.
4434 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4435 lastBreakPos = testText.length();
4436 precedingBreaks[testText.length()] = 1;
4437 for (i=testText.length(); i>0; i--) {
4438 breakPos = bi->preceding(i);
4439 if (breakPos >= i ||
4440 breakPos > lastBreakPos ||
4441 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4442 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4443 errln("%s break monkey test: "
4444 "Out of range value returned by BreakIterator::preceding().\n"
4445 "index=%d; prev returned %d; lastBreak=%d" ,
4446 name, i, breakPos, lastBreakPos);
4447 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4448 precedingBreaks[i] = 2; // Forces an error.
4449 }
4450 } else {
4451 if (breakPos >= 0) {
4452 precedingBreaks[breakPos] = 1;
4453 }
4454 lastBreakPos = breakPos;
4455 }
4456 }
4457
4458 // Compare the expected and actual results.
4459 for (i=0; i<=testText.length(); i++) {
4460 const char *errorType = NULL;
4461 if (forwardBreaks[i] != expectedBreaks[i]) {
4462 errorType = "next()";
4463 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4464 errorType = "previous()";
4465 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4466 errorType = "isBoundary()";
4467 } else if (followingBreaks[i] != expectedBreaks[i]) {
4468 errorType = "following()";
4469 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4470 errorType = "preceding()";
4471 }
4472
4473
4474 if (errorType != NULL) {
4475 // Format a range of the test text that includes the failure as
4476 // a data item that can be included in the rbbi test data file.
4477
4478 // Start of the range is the last point where expected and actual results
4479 // both agreed that there was a break position.
4480 int startContext = i;
4481 int32_t count = 0;
4482 for (;;) {
4483 if (startContext==0) { break; }
4484 startContext --;
4485 if (expectedBreaks[startContext] != 0) {
4486 if (count == 2) break;
4487 count ++;
4488 }
4489 }
4490
4491 // End of range is two expected breaks past the start position.
4492 int endContext = i + 1;
4493 int ci;
4494 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4495 for (;;) {
4496 if (endContext >= testText.length()) {break;}
4497 if (expectedBreaks[endContext-1] != 0) {
4498 if (count == 0) break;
4499 count --;
4500 }
4501 endContext ++;
4502 }
4503 }
4504
4505 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4506 UnicodeString errorText = "<data>";
4507 /***if (strcmp(errorType, "next()") == 0) {
4508 startContext = 0;
4509 endContext = testText.length();
4510
4511 printStringBreaks(testText, expected, expectedCount);
4512 }***/
4513
4514 for (ci=startContext; ci<endContext;) {
4515 UnicodeString hexChars("0123456789abcdef");
4516 UChar32 c;
4517 int bn;
4518 c = testText.char32At(ci);
4519 if (ci == i) {
4520 // This is the location of the error.
4521 errorText.append("<?>");
4522 } else if (expectedBreaks[ci] != 0) {
4523 // This a non-error expected break position.
4524 errorText.append("\\");
4525 }
4526 if (c < 0x10000) {
4527 errorText.append("\\u");
4528 for (bn=12; bn>=0; bn-=4) {
4529 errorText.append(hexChars.charAt((c>>bn)&0xf));
4530 }
4531 } else {
4532 errorText.append("\\U");
4533 for (bn=28; bn>=0; bn-=4) {
4534 errorText.append(hexChars.charAt((c>>bn)&0xf));
4535 }
4536 }
4537 ci = testText.moveIndex32(ci, 1);
4538 }
4539 errorText.append("\\");
4540 errorText.append("</data>\n");
4541
4542 // Output the error
4543 char charErrorTxt[500];
4544 UErrorCode status = U_ZERO_ERROR;
4545 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4546 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4547 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4548
4549 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4550 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4551 errorType, seed, i, charErrorTxt);
4552 break;
4553 }
4554 }
4555
4556 loopCount++;
4557 }
4558 #endif
4559 }
4560
4561
4562 // Bug 5532. UTF-8 based UText fails in dictionary code.
4563 // This test checks the initial patch,
4564 // which is to just keep it from crashing. Correct word boundaries
4565 // await a proper fix to the dictionary code.
4566 //
4567 void RBBITest::TestBug5532(void) {
4568 // Text includes a mixture of Thai and Latin.
4569 const unsigned char utf8Data[] = {
4570 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4571 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4572 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4573 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4574 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4575 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4576 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4577 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4578 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4579 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4580 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4581
4582 UErrorCode status = U_ZERO_ERROR;
4583 UText utext=UTEXT_INITIALIZER;
4584 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4585 TEST_ASSERT_SUCCESS(status);
4586
4587 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4588 TEST_ASSERT_SUCCESS(status);
4589 if (U_SUCCESS(status)) {
4590 bi->setText(&utext, status);
4591 TEST_ASSERT_SUCCESS(status);
4592
4593 int32_t breakCount = 0;
4594 int32_t previousBreak = -1;
4595 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4596 // For now, just make sure that the break iterator doesn't hang.
4597 TEST_ASSERT(previousBreak < bi->current());
4598 previousBreak = bi->current();
4599 }
4600 TEST_ASSERT(breakCount > 0);
4601 }
4602 delete bi;
4603 utext_close(&utext);
4604 }
4605
4606
4607 void RBBITest::TestBug9983(void) {
4608 UnicodeString text = UnicodeString("\\u002A" // * Other
4609 "\\uFF65" // Other
4610 "\\u309C" // Katakana
4611 "\\uFF9F" // Extend
4612 "\\uFF65" // Other
4613 "\\u0020" // Other
4614 "\\u0000").unescape();
4615
4616 UErrorCode status = U_ZERO_ERROR;
4617 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4618 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4619 TEST_ASSERT_SUCCESS(status);
4620 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4621 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4622 TEST_ASSERT_SUCCESS(status);
4623 if (U_FAILURE(status)) {
4624 return;
4625 }
4626 int32_t offset, rstatus, iterationCount;
4627
4628 brkiter->setText(text);
4629 brkiter->last();
4630 iterationCount = 0;
4631 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4632 iterationCount++;
4633 rstatus = brkiter->getRuleStatus();
4634 (void)rstatus; // Suppress set but not used warning.
4635 if (iterationCount >= 10) {
4636 break;
4637 }
4638 }
4639 TEST_ASSERT(iterationCount == 6);
4640
4641 brkiterPOSIX->setText(text);
4642 brkiterPOSIX->last();
4643 iterationCount = 0;
4644 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4645 iterationCount++;
4646 rstatus = brkiterPOSIX->getRuleStatus();
4647 (void)rstatus; // Suppress set but not used warning.
4648 if (iterationCount >= 10) {
4649 break;
4650 }
4651 }
4652 TEST_ASSERT(iterationCount == 6);
4653 }
4654
4655
4656 //
4657 // TestDebug - A place-holder test for debugging purposes.
4658 // For putting in fragments of other tests that can be invoked
4659 // for tracing without a lot of unwanted extra stuff happening.
4660 //
4661 void RBBITest::TestDebug(void) {
4662 #if 0
4663 UErrorCode status = U_ZERO_ERROR;
4664 int pos = 0;
4665 int ruleStatus = 0;
4666
4667 RuleBasedBreakIterator* bi =
4668 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4669 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4670 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4671 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4672 // UnicodeString s("Aaa. Bcd");
4673 s = s.unescape();
4674 bi->setText(s);
4675 UBool r = bi->isBoundary(8);
4676 printf("%s", r?"true":"false");
4677 return;
4678 pos = bi->last();
4679 do {
4680 // ruleStatus = bi->getRuleStatus();
4681 printf("%d\t%d\n", pos, ruleStatus);
4682 pos = bi->previous();
4683 } while (pos != BreakIterator::DONE);
4684 #endif
4685 }
4686
4687 void RBBITest::TestProperties() {
4688 UErrorCode errorCode = U_ZERO_ERROR;
4689 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4690 if (!prependSet.isEmpty()) {
4691 errln(
4692 "[:GCB=Prepend:] is not empty any more. "
4693 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4694 "change this test to the opposite condition.");
4695 }
4696 }
4697
4698 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */